### Setting up the environment:

In [None]:
#1# Importing libraries: 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, classification_report, recall_score

import warnings
warnings.filterwarnings("ignore")

RSEED=42

#2# Visualizing the dataset:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Matplotlib defaults: figure size, white backgrounds and black axes edges.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
# Register the pandas formatters/converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Show floats with three decimals whenever pandas renders them.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

#3# Loading the original dataset (with dummy columns) from the local data folder:
df_fraud = pd.read_csv(filepath_or_buffer='./data/df_fraud_dummy.csv')

#### Creating the filtered data frame: 

In [None]:
#1# Keep only the rows for the GAS counter type:

# counter_type encoding: ELEC = 0, Gas = 1.
column_to_check_gas = 'counter_type'
is_gas_counter = df_fraud[column_to_check_gas].eq(1)
df_gas = df_fraud.loc[is_gas_counter]

In [None]:
#2# Keep only the FRAUD rows within the gas subset:

# target encoding: Non-fraud = 0, Fraud = 1.
column_to_check_fraud = 'target'
is_fraud = df_gas[column_to_check_fraud].eq(1)
df_gas_fraud = df_gas.loc[is_fraud]

#### Preparing the data: 

In [None]:
#1# Preparing the data:
# The data is too big for the machine to handle.
# Therefore, we keep only the most important features, according to what was
# discussed within the group; the columns listed below are removed.
columns_to_drop = [
    'ID', 'invoice_date', 'consommation_level_1',
    'consommation_level_2', 'consommation_level_3', 'consommation_level_4',
    "client_catg_12", "client_catg_51", 'region',
    'creation_date', 'counter_statue_1', 'counter_statue_2', 'counter_statue_3',
    'counter_statue_4', 'counter_statue_5', 'reading_remarque_7',
    'reading_remarque_8', 'reading_remarque_9', 'counter_number',
]

# Rebuild the gas subset from df_fraud and drop without `inplace=True`, so that
#  (a) a boolean-filtered slice is never mutated in place (which can raise
#      SettingWithCopyWarning), and
#  (b) re-running this cell yields the same frame instead of a KeyError for
#      columns that were already removed.
df_gas = df_fraud.loc[df_fraud[column_to_check_gas] == 1].drop(columns=columns_to_drop)

df_gas.columns

In [None]:
#2# Splitting the data:

# Label vector and feature matrix.
y = df_gas['target']
X = df_gas.drop(columns='target')

# 70/30 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RSEED,
)

In [None]:
#3# Scaling the data for a better performance:
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

#### Logistic Regression analysis:

In [None]:
#1# Dealing with the imbalance:
# Synthetic Data Generation (SMOTE): create synthetic samples for the minority
# class. Only the scaled training split is resampled.
smote = SMOTE(random_state=RSEED)
X_train_smt, y_train_smt = smote.fit_resample(X=X_train_norm, y=y_train)


In [None]:
#2# Training the model:
# Logistic regression fitted on the scaled, SMOTE-resampled training data.
log_reg_params = {
    "penalty": 'l2',
    "C": 1.0,
    "solver": 'liblinear',
    "class_weight": 'balanced',
    "random_state": RSEED,
}
log_mod = LogisticRegression(**log_reg_params)
log_mod.fit(X_train_smt, y_train_smt)

In [None]:
#3# Making a prediction on the (scaled, SMOTE-resampled) training data:
y_pred_train = log_mod.predict(X_train_smt)
classif_train = classification_report(y_train_smt, y_pred_train)

#Printing:
print(classif_train)

#4# Making a prediction on the test data:
# The model was fitted on MinMax-scaled features, so it has to be scored on the
# scaled test matrix (X_test_norm) and not on the raw X_test.
y_pred_test = log_mod.predict(X_test_norm)
classif_test = classification_report(y_test, y_pred_test)

#Printing:
print(classif_test)

#### Random Forest:

In [None]:
#1# Training the model:
# Random forest with 100 trees, fitted on the scaled, SMOTE-resampled training data.
rf_mod = RandomForestClassifier(random_state=RSEED, n_estimators=100)
rf_mod.fit(X=X_train_smt, y=y_train_smt)

In [None]:
#2# Making a prediction:

#A# Train Data (scaled, SMOTE-resampled):
y_pred_train_rf = rf_mod.predict(X_train_smt)
classif_train_rf = classification_report(y_train_smt, y_pred_train_rf)

#B# Test Data:
# The forest was fitted on MinMax-scaled features, so the scaled test matrix
# (X_test_norm) is used here instead of the raw X_test.
y_pred_test_rf = rf_mod.predict(X_test_norm)
classif_test_rf = classification_report(y_test, y_pred_test_rf)

In [None]:
#3# Printing evaluation:
# These are the random forest reports, so the labels name that model
# (they previously read "Logistic").

print(f"Train Random Forest Classification: \n {classif_train_rf}")

print(f"Test Random Forest Classification: \n {classif_test_rf}")