In [2]:
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import sklearn as sk
# `import sklearn` alone does not guarantee that submodules are reachable as
# attributes on every scikit-learn release; import the ones used below so that
# `sk.decomposition`, `sk.ensemble`, `sk.metrics` and `sk.model_selection` resolve.
import sklearn.decomposition
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection



# Exploratory Data Analysis and ML Modeling

In [3]:
# Load the payment dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='data/c_payment_fraud.csv',
    index_col=0,
)

In [4]:
# Show the loaded frame as the cell output (pandas truncates long frames).
df

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,label
0,29,1,4.745402,paypal,28.204861,shopping,0.0,0
1,725,1,4.742303,storecredit,0.000000,electronics,0.0,0
2,845,1,4.921318,creditcard,0.000000,food,1.0,0
3,503,1,4.886641,creditcard,0.000000,electronics,1.0,0
4,2000,1,5.040929,creditcard,0.000000,shopping,0.0,0
...,...,...,...,...,...,...,...,...
39216,986,1,4.836982,creditcard,0.000000,shopping,0.0,0
39217,1647,1,4.876771,creditcard,377.930556,shopping,0.0,0
39218,1591,1,4.742303,creditcard,0.000000,shopping,1.0,0
39219,237,1,4.921318,creditcard,236.082639,shopping,1.0,0


In [5]:
# Distribution of account age, with one color per value of `label`.
fig = px.histogram(
    data_frame=df,
    x='accountAgeDays',
    color='label',
)
fig.show()

In [6]:
# Distribution of payment-method age, with one color per value of `label`.
fig = px.histogram(
    data_frame=df,
    x='paymentMethodAgeDays',
    color='label',
)
fig.show()

In [7]:
# Number of rows for every (paymentMethod, label) combination, returned as a
# flat frame with columns paymentMethod, label and count.
df_grouped = (
    df.groupby(['paymentMethod', 'label'])
    .agg(count=('accountAgeDays', 'count'))
    .reset_index()
)

In [8]:
# Show the per-payment-method / per-label counts computed above.
df_grouped

Unnamed: 0,paymentMethod,label,count
0,creditcard,0,27594
1,creditcard,1,410
2,paypal,0,9174
3,paypal,1,129
4,storecredit,0,1893
5,storecredit,1,21


In [9]:
# Plot the pre-aggregated counts per payment method, colored by `label`.
fig = px.histogram(
    data_frame=df_grouped,
    x='paymentMethod',
    y='count',
    color='label',
)
fig.show()

# Machine Learning Modeling

In [10]:
# Print column dtypes and non-null counts to see which columns need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39221 entries, 0 to 39220
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   accountAgeDays        39221 non-null  int64  
 1   numItems              39221 non-null  int64  
 2   localTime             39221 non-null  float64
 3   paymentMethod         39221 non-null  object 
 4   paymentMethodAgeDays  39221 non-null  float64
 5   Category              39221 non-null  object 
 6   isWeekend             39221 non-null  object 
 7   label                 39221 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 2.7+ MB


In [11]:
# One-hot encode the object-typed columns (paymentMethod, Category, isWeekend
# according to df.info()), emitting the indicator columns as integers.
df_encoded = pd.get_dummies(data=df, dtype=int)

In [12]:
# Show the encoded frame to verify the generated indicator columns.
df_encoded

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit,Category_electronics,Category_food,Category_shopping,Category_unknow,isWeekend_0.0,isWeekend_1.0,isWeekend_undefined
0,29,1,4.745402,28.204861,0,0,1,0,0,0,1,0,1,0,0
1,725,1,4.742303,0.000000,0,0,0,1,1,0,0,0,1,0,0
2,845,1,4.921318,0.000000,0,1,0,0,0,1,0,0,0,1,0
3,503,1,4.886641,0.000000,0,1,0,0,1,0,0,0,0,1,0
4,2000,1,5.040929,0.000000,0,1,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39216,986,1,4.836982,0.000000,0,1,0,0,0,0,1,0,1,0,0
39217,1647,1,4.876771,377.930556,0,1,0,0,0,0,1,0,1,0,0
39218,1591,1,4.742303,0.000000,0,1,0,0,0,0,1,0,0,1,0
39219,237,1,4.921318,236.082639,0,1,0,0,0,0,1,0,0,1,0


In [13]:
pca = sk.decomposition.PCA()

In [14]:
X = df_encoded.drop(columns=['label'])
y = df_encoded['label']

In [15]:
X_pca = pd.DataFrame(pca.fit_transform(X))

In [16]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [17]:
# Sanity check: list the distinct class labels present in the training target.
y_train.unique()

array([0, 1])

In [18]:
# Sanity check: list the distinct class labels present in the test target.
y_test.unique()

array([0, 1])

In [19]:
# Random forest with default hyperparameters; the fixed seed makes runs repeatable.
rf = sk.ensemble.RandomForestClassifier(random_state=42)


In [20]:
# fit() trains the forest in place and returns the estimator itself, so
# `model_trained` is another name for the same fitted `rf` object.
rf.fit(X_train, y_train)
model_trained = rf

In [21]:
# Predict class labels for the held-out test features.
y_pred = rf.predict(X_test)

In [22]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels.
sk.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[7727,    0],
       [   0,  118]])

In [23]:
# Recall for the positive class (label 1) on the test split.
# NOTE(review): a perfect score on held-out data is unusual — confirm that the
# label is not directly derivable from one of the features before trusting it.
sk.metrics.recall_score(y_true=y_test, y_pred=y_pred)

1.0

In [26]:
# The forest was trained on PCA-projected features (X_train is produced by
# `pca`), so the saved classifier cannot score new transactions unless the
# fitted projection is available too. Persist the PCA next to the model.
# At inference time: one-hot encode with the same columns as `X`, apply
# `pca.transform`, then call `model.predict`.
joblib.dump(pca, 'pca.pkl')
# Dump the classifier last so the cell output remains ['model.pkl'].
joblib.dump(model_trained, 'model.pkl')

['model.pkl']