In [4]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import pickle
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Dataset version tag; logged with every MLflow run as "data_version".
version = "v1.0"
# Relative path of the CSV that is loaded into `df`; logged with every MLflow run as "data_url".
data_url = "../data/fraud_v1.csv"

In [6]:
import sys  
sys.path.insert(0, '../backend/src')

import data_preprocessing

In [7]:
from data_preprocessing import transform_data

In [8]:
import os
from getpass import getpass

# Credentials for the MLflow tracking server are taken from the environment so that
# no secret is stored in the notebook (or in its version-control history).
# NOTE(review): an access token used to be hardcoded in this cell -- treat it as
# leaked and revoke it on the tracking server.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', "tarekbouzayani")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    # Interactive fallback: the token is typed once, is not echoed and is not saved.
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("MLflow tracking token: ")

In [9]:
# Point the MLflow client at the remote tracking server (hosted on dagshub.com).
mlflow.set_tracking_uri('https://dagshub.com/tarekbouzayani/DataScienceGInfo3.mlflow') # your MLflow tracking URI

In [10]:
# Load the raw dataset from the CSV path configured in `data_url`.
df = pd.read_csv(data_url)

In [11]:
# Clean / preprocess the raw frame and unpack the resulting train and test splits.
# NOTE(review): transform_data lives in ../backend/src/data_preprocessing and is not
# visible here -- presumably it encodes/scales the features and does the split; confirm there.
X_train,X_test,y_train,y_test = transform_data(df)

In [12]:
# Peek at the first three rows of the training features.
X_train.head(3)

Unnamed: 0,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,sec,age,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
78689,98.21,1,0.331882,26.529,-82.0916,26.755882,-82.439298,0.0,1.0,0.5,0.521739,0.338983,0.358025,0,0,0,0,0,1,0,0,0,0,0,0,0,0
76423,6.01,1,0.290309,33.6028,-81.9748,32.867139,-82.648669,0.0,1.0,0.433333,0.826087,1.0,0.777778,0,0,0,0,0,0,0,0,0,0,0,0,0,1
86945,2.62,1,0.457067,39.9347,-86.1633,38.937676,-85.222318,0.0,1.0,0.633333,0.73913,0.322034,0.222222,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# Resolve imbalanced data problem:

**Before moving on to machine learning modeling, we need to handle the problem of imbalanced data. This is very common with fraud data: actual fraud cases are far fewer than normal cases and make up only a very small part of the dataset.**

**For this purpose we will use SMOTE (Synthetic Minority Oversampling Technique), a statistical technique for increasing the number of minority-class cases in a dataset in a balanced way. It works by generating new instances from the existing minority cases supplied as input. SMOTE does not change the number of majority cases.**

**The new instances are not just copies of existing minority cases. Instead, the algorithm takes samples of the feature space for each target class and its nearest neighbors. The algorithm then generates new examples that combine features of the target case with features of its neighbors. This approach increases the features available to each class and makes the samples more general.**

In [None]:
from IPython.display import Image
# Display the SMOTE illustration stored at images/smote.png (path relative to the notebook).
Image(filename="images/smote.png")

In [None]:
# Oversample the TRAINING split only, so that the test split keeps its original
# class distribution.
# The fixed seed makes the synthetic samples -- and every metric computed from models
# trained on them -- reproducible across "Restart & Run All"; 5 matches the seed
# used for the Random Forest further down.
method = SMOTE(random_state=5)
X_resampled, y_resampled = method.fit_resample(X_train, y_train)

# 2. Machine Learning Modeling :

## 1. Logistic Regression :
Logistic Regression: it is used in classification use cases where we
want to predict a discrete target (for example, whether a transaction is
fraudulent or not). It captures the relationship between the target variable (the
fraud label in our case) and the independent features (transaction amount,
merchant category, hour, age, ...) by fitting our data with the sigmoid function.

Image(filename="images/lg.png")

In [None]:
#!pip install mlflow

In [None]:
# Select the MLflow experiment under which the runs below are recorded.
mlflow.set_experiment("my-experiment")

In [None]:
# Turn on scikit-learn autologging; logged models are registered under the name "LogisticRegression".
mlflow.sklearn.autolog(registered_model_name="LogisticRegression")

In [None]:
RUN_NAME = "LogisticRegression"
with mlflow.start_run(run_name=RUN_NAME):
    # Record where the training data came from and how large the raw frame was.
    run_params = (
        ("data_url", data_url),
        ("data_version", version),
        ("input_rows", df.shape[0]),
        ("input_cols", df.shape[1]),
    )
    for param_name, param_value in run_params:
        mlflow.log_param(param_name, param_value)

    # Fit on the SMOTE-resampled training data, then predict on the test split.
    model = LogisticRegression()
    model.fit(X_resampled, y_resampled)
    predicted = model.predict(X_test)

    # Macro-averaged scores; `support` is returned as well but is not logged.
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    test_metrics = (
        ("Precision_test", precision),
        ("Recall_test", recall),
        ("F1_score_test", fscore),
    )
    for metric_name, metric_value in test_metrics:
        mlflow.log_metric(metric_name, metric_value)

## Model evaluation :
### Evaluation metrics:
* **Precision:** 
Precision is one indicator of the quality of the positive predictions
made by the model. **Of all the positive predictions I made, how many of them are truly positive?** \
Precision is defined as follows:

In [None]:
# Display the precision formula image (images/Precision.png).
Image(filename="images/Precision.png")

**Remark:**
 * A true positive (TP) is an outcome where the model correctly
predicts the positive class. Similarly, a true negative (TN) is
an outcome where the model correctly predicts the negative class.
 * A false positive (FP) is an outcome where the model incorrectly
predicts the positive class. And a false negative (FN) is an
outcome where the model incorrectly predicts the negative class.

* **Recall:**
Recall, a commonly used performance metric for classification models,
is the fraction of actual positives that are correctly classified. **Of all the actual positive examples out there, how many of them did I correctly predict to be positive?**

In [None]:
# Display the recall formula image (images/Recall.png).
Image(filename="images/Recall.png")

* **F1-score:** To evaluate model performance comprehensively, we should examine both precision and recall. The F1 score serves as a helpful metric that considers both of them.

In [None]:
# Display the F1-score formula image (images/F1_score.png).
Image(filename="images/F1_score.png")

* **Accuracy:**
    Accuracy is the number of correctly predicted data points out of all the data points.

In [None]:
# Display the accuracy formula image (images/accuracy.png).
Image(filename="images/accuracy.png")

**To evaluate fraud detection models, the concepts of recall and precision are very important. Recall measures how many fraud cases were truly detected, while precision evaluates how good the model is at generating as few false alarms as possible. For fraud detection, we want to prioritize high recall to miss as few fraud cases as possible, while also keeping a relatively high precision, because too many false alarms can also be a problem!**

In [None]:
# Per-class precision / recall / F1 for the latest predictions on the test split.
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
# Baseline to compare accuracy against: the share of the non-fraud class.
# assumes labels are 0/1 with 1 = fraud, so sum/len is the fraud share -- TODO confirm.
fraud_share = y_test.sum()/len(y_test)
# Round AFTER subtracting: `1 - round(x, 4)` re-introduces binary floating-point
# residue, so the printed value may not be limited to four decimals.
print('Share of Non-Fraud in Test Data:', round(1 - fraud_share, 4))

**As we can see above, the recall is around 75% while the precision is just 6%, which means there are a lot of false positives to handle.**

## 2. Random forest:

In [None]:
# Display the Random Forest illustration (images/random.png).
Image(filename="images/random.png")

In [None]:
# Re-configure scikit-learn autologging so that models logged from here on are
# registered under the name "Random_Forest".
mlflow.sklearn.autolog(registered_model_name="Random_Forest")

In [None]:
RUN_NAME = "RandomForest"
with mlflow.start_run(run_name=RUN_NAME):
    # Record where the training data came from and how large the raw frame was.
    run_params = (
        ("data_url", data_url),
        ("data_version", version),
        ("input_rows", df.shape[0]),
        ("input_cols", df.shape[1]),
    )
    for param_name, param_value in run_params:
        mlflow.log_param(param_name, param_value)

    # Fit on the SMOTE-resampled training data (fixed seed), then predict on the test split.
    model2 = RandomForestClassifier(random_state=5)
    model2.fit(X_resampled, y_resampled)
    predicted = model2.predict(X_test)

    # Macro-averaged scores; `support` is returned as well but is not logged.
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    test_metrics = (
        ("Precision_test", precision),
        ("Recall_test", recall),
        ("F1_score_test", fscore),
    )
    for metric_name, metric_value in test_metrics:
        mlflow.log_metric(metric_name, metric_value)

In [None]:
# Per-class precision / recall / F1 for the latest predictions on the test split.
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
# Baseline to compare accuracy against: the share of the non-fraud class.
# assumes labels are 0/1 with 1 = fraud, so sum/len is the fraud share -- TODO confirm.
fraud_share = y_test.sum()/len(y_test)
# Round AFTER subtracting: `1 - round(x, 4)` re-introduces binary floating-point
# residue, so the printed value may not be limited to four decimals.
print('Share of Non-Fraud in Test Data:', round(1 - fraud_share, 4))

**The above metrics show that although the Random Forest model has a slightly lower recall, it has much better precision!**

# Save the best model:

In [None]:
#Reading Pandas Dataframe from mlflow
df=mlflow.search_runs(filter_string="metrics.F1_score_test < 1")

In [None]:
run_id = df.loc[df['metrics.F1_score_test'].idxmax()]['run_id']

In [None]:
model = mlflow.sklearn.load_model("runs:/" + run_id + "/model")

In [None]:
with open('best_model.pkl','wb') as f:
  pickle.dump(model,f)

In [None]:
df