In [1]:
import pandas as pd
import plotly.express as px
import category_encoders as ce

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
data = pd.read_csv('bs140513_032310.csv')

In [4]:
data

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
...,...,...,...,...,...,...,...,...,...,...
594638,179,'C1753498738','3','F','28007','M1823072687','28007','es_transportation',20.53,0
594639,179,'C650108285','4','F','28007','M1823072687','28007','es_transportation',50.73,0
594640,179,'C123623130','2','F','28007','M349281107','28007','es_fashion',22.44,0
594641,179,'C1499363341','5','M','28007','M1823072687','28007','es_transportation',14.46,0


### Preprocessing the data

In [None]:
data[data['customer']==data['merchant']]

In [7]:
data['zipcodeOri'].value_counts()

'28007'    594643
Name: zipcodeOri, dtype: int64

In [8]:
data = data.drop(columns=['customer','gender','zipcodeOri','merchant','zipMerchant'])

In [9]:
data.category.unique()

array(["'es_transportation'", "'es_health'", "'es_otherservices'",
       "'es_food'", "'es_hotelservices'", "'es_barsandrestaurants'",
       "'es_tech'", "'es_sportsandtoys'", "'es_wellnessandbeauty'",
       "'es_hyper'", "'es_fashion'", "'es_home'", "'es_contents'",
       "'es_travel'", "'es_leisure'"], dtype=object)

#### Only keeping the categories with more than 1000 values to shorten the dataset

In [10]:
cat = data['category'].value_counts()>1000

In [11]:
cat = cat.reset_index()
cat = cat[cat['category']==True]

In [12]:
data = data[data.category.isin(cat['index'])]

In [13]:
data['category']

0         'es_transportation'
1         'es_transportation'
2         'es_transportation'
3         'es_transportation'
4         'es_transportation'
                 ...         
594638    'es_transportation'
594639    'es_transportation'
594640           'es_fashion'
594641    'es_transportation'
594642    'es_transportation'
Name: category, Length: 591619, dtype: object

#### Keeping only integer values in 'age'

In [14]:
data['age'] = data['age'].str.strip("''")
data = data[data['age']!='U']

In [15]:
data['age'] = data['age'].astype('int')

#### Encoding categorical variables

In [16]:
data = pd.get_dummies(data)

In [17]:
data.describe()

Unnamed: 0,step,age,amount,fraud,category_'es_barsandrestaurants',category_'es_fashion',category_'es_food',category_'es_health',category_'es_home',category_'es_hotelservices',category_'es_hyper',category_'es_sportsandtoys',category_'es_tech',category_'es_transportation',category_'es_wellnessandbeauty'
count,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0,590448.0
mean,95.0306,3.006536,34.792585,0.010018,0.01077,0.01091,0.044356,0.027245,0.00336,0.00294,0.010304,0.006771,0.004002,0.853833,0.025508
std,51.042375,1.32546,50.236267,0.099587,0.103217,0.103881,0.205885,0.162798,0.057869,0.054143,0.100985,0.082008,0.063135,0.353274,0.157661
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,52.0,2.0,13.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,97.0,3.0,26.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,139.0,4.0,42.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,179.0,6.0,1972.81,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
data.isnull().sum()

step                                0
age                                 0
amount                              0
fraud                               0
category_'es_barsandrestaurants'    0
category_'es_fashion'               0
category_'es_food'                  0
category_'es_health'                0
category_'es_home'                  0
category_'es_hotelservices'         0
category_'es_hyper'                 0
category_'es_sportsandtoys'         0
category_'es_tech'                  0
category_'es_transportation'        0
category_'es_wellnessandbeauty'     0
dtype: int64

In [19]:
data.fraud.value_counts()

0    584533
1      5915
Name: fraud, dtype: int64

### Model Building

In [20]:
x = data.drop(columns=['fraud'])
y = data['fraud']

#### Splitting the dataset

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [22]:
data

Unnamed: 0,step,age,amount,fraud,category_'es_barsandrestaurants',category_'es_fashion',category_'es_food',category_'es_health',category_'es_home',category_'es_hotelservices',category_'es_hyper',category_'es_sportsandtoys',category_'es_tech',category_'es_transportation',category_'es_wellnessandbeauty'
0,0,4,4.55,0,0,0,0,0,0,0,0,0,0,1,0
1,0,2,39.68,0,0,0,0,0,0,0,0,0,0,1,0
2,0,4,26.89,0,0,0,0,0,0,0,0,0,0,1,0
3,0,3,17.25,0,0,0,0,0,0,0,0,0,0,1,0
4,0,5,35.72,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594638,179,3,20.53,0,0,0,0,0,0,0,0,0,0,1,0
594639,179,4,50.73,0,0,0,0,0,0,0,0,0,0,1,0
594640,179,2,22.44,0,0,1,0,0,0,0,0,0,0,0,0
594641,179,5,14.46,0,0,0,0,0,0,0,0,0,0,1,0


#### Scaling the values

In [23]:
scal = StandardScaler()
x_train['amount'] = scal.fit_transform(x_train['amount'].values.reshape(-1,1))
x_test['amount'] = scal.fit_transform(x_test['amount'].values.reshape(-1,1))

In [24]:
x_train

Unnamed: 0,step,age,amount,category_'es_barsandrestaurants',category_'es_fashion',category_'es_food',category_'es_health',category_'es_home',category_'es_hotelservices',category_'es_hyper',category_'es_sportsandtoys',category_'es_tech',category_'es_transportation',category_'es_wellnessandbeauty'
22253,8,2,-0.155418,0,0,0,0,0,0,0,0,0,1,0
439541,138,5,0.152469,0,0,0,0,0,0,0,0,0,1,0
477790,148,6,-0.539975,0,0,0,0,0,0,1,0,0,0,0
14151,5,3,-0.176693,0,0,0,0,0,0,0,0,0,1,0
307144,100,4,-0.057873,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542796,166,3,-0.566870,0,0,0,0,0,0,0,0,0,1,0
168094,58,3,0.078207,0,0,0,0,0,0,0,0,0,1,0
489895,151,2,0.371242,0,0,1,0,0,0,0,0,0,0,0
237177,80,1,0.117144,0,0,0,0,0,0,0,0,0,1,0


#### Resampling the training data to ensure balance of positive and negative values

In [25]:
from imblearn.over_sampling import SMOTE

In [26]:
x_train, y_train = SMOTE().fit_resample(x_train, y_train)

In [None]:
y_train.shape

### Model training

In [None]:
random_tree = RandomForestClassifier(n_estimators = 100)
random_tree.fit(x_train,y_train)

In [32]:
y_predictions_rf = random_tree.predict(x_test)

In [None]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(x_train,y_train)

In [35]:
y_predictions_dt = dec_tree.predict(x_test)

### Model evaluation

In [49]:
metrices = {'Accuracy': [accuracy_score(y_test, y_predictions_rf)]
            ,'Precision':[precision_score(y_test, y_predictions_rf)]
            ,'Recall':[recall_score(y_test, y_predictions_rf)]
            ,'F1_score':[f1_score(y_test, y_predictions_rf)]}
rf_scores = pd.DataFrame(data=metrices)
rf_scores

Unnamed: 0,Accuracy,Precision,Recall,F1_score
0,0.987411,0.429625,0.748185,0.545825


In [50]:
metrices = {'Accuracy': [accuracy_score(y_test, y_predictions_dt)]
            ,'Precision':[precision_score(y_test, y_predictions_dt)]
            ,'Recall':[recall_score(y_test, y_predictions_dt)]
            ,'F1_score':[f1_score(y_test, y_predictions_dt)]}
dt_scores = pd.DataFrame(data=metrices)
dt_scores

Unnamed: 0,Accuracy,Precision,Recall,F1_score
0,0.987456,0.426576,0.699051,0.529835
