### Setting up the environment:

In [22]:
#1# Importing libraries: 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, classification_report, recall_score

import warnings
warnings.filterwarnings("ignore")

RSEED=42

#2# Visualizing the dataset:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#3# Importing the original dataset with dummies:
df_fraud = pd.read_csv('./data/df_fraud_dummy.csv')

#### Creating the filtered data frame: 

In [23]:
#1# JUST FOR THE GAS TYPE:

# ELEC= 0 and Gas =1:
column_to_check_gas = 'counter_type'
df_eletric = df_fraud[df_fraud[column_to_check_gas] == 0]
#df_gas.nunique()

In [24]:
#2# JUST FOR FRAUD IN GAS TYPE: 

# Fraud = 1 , Non-fraud = 0
column_to_check_fraud = 'target'
df_eletric_fraud = df_eletric[df_eletric[column_to_check_fraud] == 1]
#df_gas_fraud.nunique()

#### Preparing the data: 

In [25]:
#1# Preparing the data:
# The data is too big for the machine to handle. 
# Therefore, We'll keep only the most important features,according to what was discussed within the group.

df_eletric.drop(['ID', 'invoice_date', 'consommation_level_1',
       'consommation_level_2', 'consommation_level_3', 'consommation_level_4',
       "client_catg_12", "client_catg_51", 'region',
       'creation_date','counter_statue_1', 'counter_statue_2', 'counter_statue_3',
       'counter_statue_4', 'counter_statue_5', 'reading_remarque_7',
       'reading_remarque_8', 'reading_remarque_9','counter_number'],axis=1,inplace=True)

df_eletric.columns

Index(['tarif_type', 'counter_coefficient', 'months_number', 'counter_type',
       'target', 'sum_consu'],
      dtype='object')

In [26]:
#2# Splitting the data:

y = df_eletric['target']
X = df_eletric.drop('target', axis=1)

X_train , X_test , y_train , y_test = train_test_split(X,y, stratify=y, test_size=0.3, random_state=RSEED)

In [27]:
#2# Scaling the data for a better performance:
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

In [72]:
#3# Dealing with the imbalance: 
# Synthetic Data Generation (e.g., SMOTE): Create synthetic samples for the minority class.

smote = SMOTE(random_state=RSEED)
X_train_smt, y_train_smt = smote.fit_resample(X_train_norm, y_train)


#### Logistc Regression analysis: 

In [73]:
#1# Training the model:
log_mod = LogisticRegression(class_weight='balanced', penalty='l2', C=1.0, solver='liblinear', random_state=RSEED)
log_mod.fit(X_train_smt, y_train_smt)

In [74]:
#2# Making a prediction:

#A# Train Data:
y_pred_train_lr = log_mod.predict(X_train_smt)
classif_train_lr = classification_report(y_train_smt, y_pred_train_lr)

#B# Test Data:
y_pred_test_lr = log_mod.predict(X_test)	
classif_test_lr = classification_report(y_test, y_pred_test_lr)

In [75]:
#3#Printing evaluation: 
print(f"Test Logistic Classification: \n {classif_test_lr}")

print(f"Train Logistic Classification: \n {classif_train_lr}")

Test Logistic Classification: 
               precision    recall  f1-score   support

           0       0.93      0.05      0.09    200148
           1       0.07      0.95      0.14     16157

    accuracy                           0.12    216305
   macro avg       0.50      0.50      0.12    216305
weighted avg       0.86      0.12      0.10    216305

Train Logistic Classification: 
               precision    recall  f1-score   support

           0       0.52      0.80      0.63    467008
           1       0.57      0.27      0.36    467008

    accuracy                           0.53    934016
   macro avg       0.54      0.53      0.50    934016
weighted avg       0.54      0.53      0.50    934016



#### Random Forest analysis:

In [87]:
#1# Training the module:
rf_mod = RandomForestClassifier(n_estimators=100, random_state=RSEED, min_weight_fraction_leaf=0.3, max_features=4)
rf_mod.fit(X_train_smt, y_train_smt)

In [88]:
#2# Checking the Hyperparameters: 

params = rf_mod.get_params()

for param, value in params.items():
    print(f"{param}: {value}")


bootstrap: True
ccp_alpha: 0.0
class_weight: None
criterion: gini
max_depth: None
max_features: 4
max_leaf_nodes: None
max_samples: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.3
n_estimators: 100
n_jobs: None
oob_score: False
random_state: 42
verbose: 0
warm_start: False


In [89]:
#3# Making a prediction:

#A# Train Data:
y_pred_train_rf = rf_mod.predict(X_train_smt)
classif_train_rf = classification_report(y_train_smt, y_pred_train_rf)

#B# Test Data:
y_pred_test_rf = rf_mod.predict(X_test)
classif_test_rf = classification_report(y_test, y_pred_test_rf)

In [90]:
#3#Printing evaluation:
 
print(f"Train Logistic Classification: \n {classif_train_rf}")

print(f"Test Logistic Classification: \n {classif_test_rf}")

Train Logistic Classification: 
               precision    recall  f1-score   support

           0       0.52      0.72      0.60    467008
           1       0.54      0.33      0.41    467008

    accuracy                           0.52    934016
   macro avg       0.53      0.52      0.51    934016
weighted avg       0.53      0.52      0.51    934016

Test Logistic Classification: 
               precision    recall  f1-score   support

           0       0.92      0.07      0.14    200148
           1       0.07      0.92      0.14     16157

    accuracy                           0.14    216305
   macro avg       0.50      0.50      0.14    216305
weighted avg       0.86      0.14      0.14    216305

