In [None]:
import pandas as pd
from maquette import *

In [None]:
# Fetch the "fraud-analysis" dataset through maquette's Dataset API.
# Later cells use `data` as a pandas DataFrame (.head, .loc, .groupby, .drop);
# presumably .get() returns one — confirm against the maquette documentation.
data = Dataset("fraud-analysis").get()

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

In [None]:
# Split the transactions into two frames by label:
# fraud == 1 (fraudulent) and fraud == 0 (non-fraudulent).
df_fraud = data[data["fraud"].eq(1)]
df_non_fraud = data[data["fraud"].eq(0)]

In [None]:
# Number of fraudulent records (non-null labels in the fraud-only frame).
df_fraud["fraud"].count()

In [None]:
# Find in which activities (spending categories) fraud occurs the most:
# mean transaction amount and mean of the `fraud` label per category,
# ordered by mean amount, highest first.
# Several columns must be selected from a groupby with a list (double
# brackets); the tuple form ['amount','fraud'] was deprecated in pandas 1.0
# and raises on pandas >= 2.0.
(
    data.groupby('category')[['amount', 'fraud']]
    .mean()
    .sort_values(by=['amount'], ascending=False)
)

In [None]:
# Find in which age groups fraud occurs the most (the original analysis noted
# it is mostly the youngest group, < 18): mean transaction amount and mean of
# the `fraud` label per age group, ordered by mean amount, highest first.
# Several columns must be selected from a groupby with a list (double
# brackets); the tuple form ['amount','fraud'] was deprecated in pandas 1.0
# and raises on pandas >= 2.0.
(
    data.groupby('age')[['amount', 'fraud']]
    .mean()
    .sort_values(by=['amount'], ascending=False)
)

In [None]:
# Each zip-code column contains only one unique value, so neither carries any
# information for the model; drop both.
data_reduced = data.drop(columns=['zipcodeOri', 'zipMerchant'])

In [None]:
# Display the frame that remains after the zip-code columns were dropped.
data_reduced

In [None]:
# List the remaining column names.
data_reduced.columns

In [None]:
# Names of the object-dtype columns; these are the ones encoded numerically below.
col_categorical = data_reduced.select_dtypes(include="object").columns

In [None]:
# Show which columns were detected as object dtype.
col_categorical

In [None]:
# Convert every object column to the pandas `category` dtype in one astype
# call; this makes the integer encoding in the next step straightforward.
data_reduced = data_reduced.astype({col: 'category' for col in col_categorical})

In [None]:
# Replace each categorical column with its integer category codes
# (.cat.codes maps every category to a number).
for name in col_categorical:
    data_reduced[name] = data_reduced[name].cat.codes

In [None]:
# Display the frame again to inspect the result of the encoding steps.
data_reduced

In [None]:
# Define the independent variables (X) and the dependent/target variable (y).
# X is every column except the `fraud` label; y is the `fraud` label itself.
# Both are read from data_reduced so that features and target are guaranteed
# to come from the same frame (same rows, same index).
X = data_reduced.drop(columns=['fraud'])
y = data_reduced['fraud']

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
# SMOTE oversampler used by the following cells to balance the classes.
# A fixed random_state (42) makes the resampling reproducible: no matter how
# many times the notebook is executed, the same samples are generated.
# The assignment must be active — later cells reference `sm` and fail with a
# NameError on a fresh kernel if it is commented out.
import imblearn.over_sampling

sm = imblearn.over_sampling.SMOTE(random_state=42)

In [None]:
# Display the sampler object to check its configuration.
sm

In [None]:
# Resample the features and target with the sampler `sm`.
# The resampled target is kept as a 1-D Series named 'fraud' (whether the
# sampler hands back a Series or a plain array): scikit-learn estimators
# expect a 1-D y and emit a DataConversionWarning when given a one-column
# DataFrame.
X_res, y_res = sm.fit_resample(X, y)
y_res = pd.Series(y_res, name='fraud')

In [None]:
# Count the non-null target values after resampling.
y_res.count()

In [None]:
# Class distribution of the target after resampling.
y_res.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the resampled data for testing. The split is shuffled with a
# fixed seed (reproducible) and stratified on the target so that the train and
# test sets keep the same class proportions.
# NOTE(review): the split is performed on data that has already been
# resampled, so resampled rows also end up in the test set — consider
# splitting first and resampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y_res,
)

In [None]:
# %% K-Neighbours Classifier
# sklearn.metrics is imported explicitly: it is used for the reports below and
# `import sklearn.neighbors` alone does not guarantee that it is loaded.
import sklearn.metrics
import sklearn.neighbors

# 5 nearest neighbours; p=1 selects the Manhattan (L1) form of the default
# Minkowski distance.
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, p=1)

# Fit on the training split and predict labels for the held-out test split.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Per-class precision/recall/F1 and the confusion matrix on the test split.
print("Classification Report for K-Nearest Neighbours: \n",
      sklearn.metrics.classification_report(y_test, y_pred))
print("Confusion Matrix of K-Nearest Neigbours: \n",
      sklearn.metrics.confusion_matrix(y_test, y_pred))
# TODO: plot the ROC curve from knn.predict_proba(X_test)[:, 1]; the
# plot_roc_auc helper this used to call is not defined in the visible notebook.

In [None]:
# %% Random Forest Classifier
# sklearn.metrics is imported explicitly: it is used for the reports below and
# `import sklearn.ensemble` alone does not guarantee that it is loaded.
import sklearn.ensemble
import sklearn.metrics

# 100 trees of depth <= 8 with a fixed seed for reproducibility.
# class_weight="balanced" weights classes inversely to their frequency;
# verbose=1 prints fitting/prediction progress.
rf_clf = sklearn.ensemble.RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    random_state=42,
    verbose=1,
    class_weight="balanced",
)

# Fit on the training split and predict labels for the held-out test split.
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

# Per-class precision/recall/F1 and the confusion matrix on the test split.
print("Classification Report for Random Forest Classifier: \n", sklearn.metrics.classification_report(y_test, y_pred))
print("Confusion Matrix of Random Forest Classifier: \n", sklearn.metrics.confusion_matrix(y_test, y_pred))
