In [33]:
# import libraries
import pandas as pd
import dask.dataframe as dd
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import neighbors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [34]:
# Read the raw inputs: client tables are CSV, invoice tables are parquet.
# Paths are relative to the notebook's working directory.
client_train, client_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/client_train.csv', 'data/client_test.csv')
)

invoice_train, invoice_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/invoice_train_compressed.csv.parquet',
        'data/invoice_test_compressed.csv.parquet',
    )
)

In [35]:
# Attach the client attributes to every invoice row.
# Inner join on `client_id`: invoices without a matching client row (and
# clients without invoices) are dropped from the merged frames.
merged_df = invoice_train.merge(client_train, on='client_id', how='inner')

merged_test = invoice_test.merge(client_test, on='client_id', how='inner')

In [36]:
# Columns that are label-encoded to integer codes before modelling.
# ('disrict' is spelled this way in the merged frame's column names.)
categorical_columns = [
    'disrict',
    'client_catg',
    'region',
    'tarif_type',
    'counter_statue',
    'reading_remarque',
    'counter_type',
]

# One encoder per column, kept so the codes can be mapped back later with
# inverse_transform.
label_encoders = {col: LabelEncoder() for col in categorical_columns}

# NOTE(review): this overwrites the columns of `merged_df` in place, so
# re-running the cell without re-running the merge re-fits each encoder on the
# already-encoded values. Only `merged_df` is encoded here; `merged_test` is
# left untouched by this cell.
for col, encoder in label_encoders.items():
    # Values are cast to str first so columns with mixed types encode cleanly.
    merged_df[col] = encoder.fit_transform(merged_df[col].astype(str))

# Verify the encoding
print(merged_df[categorical_columns].head())

   disrict  client_catg  region  tarif_type  counter_statue  reading_remarque  \
0        0            0       0           1               0                 6   
1        0            0       0           1               0                 4   
2        0            0       0           1               0                 6   
3        0            0       0           1               0                 6   
4        0            0       0           1               0                 7   

   counter_type  
0             0  
1             0  
2             0  
3             0  
4             0  


In [37]:
# Columns passed to the model, in the order they appear in the design matrix.
features = [
    'tarif_type',
    'counter_number',
    'counter_code',
    'reading_remarque',
    'counter_coefficient',
    # consommation_level_1 .. consommation_level_4
    *[f'consommation_level_{level}' for level in range(1, 5)],
    'old_index',
    'new_index',
    'months_number',
    'disrict',
    'client_catg',
    'region',
    'counter_statue',
    'counter_type',
]

In [38]:
# Partition the features from the class to predict
df_X = merged_df[features]
df_y = merged_df['target'].astype(int)  # cast the target column to integer labels

# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is per row of the invoice/client merge. If `target`
# is a per-client label (presumably it comes from the client table -- confirm),
# rows of one client can land on both sides of the split; a group-aware split
# on `client_id` may be more appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.3,
    random_state=1,
)

print(
    "Number of training instances: ",
    len(X_train),
    "\nNumber of test instances: ",
    len(X_test),
)

Number of training instances:  3133724 
Number of test instances:  1343025


In [39]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to the held-out rows.
# NOTE(review): the MultinomialNB cell in this notebook fits on `X_train` /
# `X_test`, not on these scaled arrays. Standardised values can be negative,
# which MultinomialNB does not accept -- confirm whether these arrays are meant
# for a different model.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [40]:
# Take a 5% sample of the training data for tuning.
# The draw uses a seeded Generator so the same rows are selected on every
# Restart-and-Run-All; the unseeded global NumPy RNG used previously picked a
# different subset on each run, making tuning results non-reproducible.
sample_size = int(0.05 * len(X_train))  # 5% of the training data
rng = np.random.default_rng(1)  # same seed value as the train/test split's random_state
sample_indices = rng.choice(X_train.index.to_numpy(), size=sample_size, replace=False)

# The sampled values are row labels (not positions), so select with `.loc[]`.
# X_train and y_train come from the same split, so the labels line up.
X_sample = X_train.loc[sample_indices]
y_sample = y_train.loc[sample_indices]


In [41]:
# Train a multinomial Naive Bayes classifier on the unscaled training
# features (fit() returns the fitted estimator itself).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Report hold-out metrics.
# NOTE(review): the classes are heavily imbalanced (see the support column of
# the classification report), so overall accuracy says little about the
# minority class -- read the per-class precision/recall instead.
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_confusion = confusion_matrix(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

print("Accuracy:", holdout_accuracy)
print("Confusion Matrix:\n", holdout_confusion)
print("\nClassification Report:\n", holdout_report)

Accuracy: 0.9111029206455576
Confusion Matrix:
 [[1221053   16127]
 [ 103264    2581]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95   1237180
           1       0.14      0.02      0.04    105845

    accuracy                           0.91   1343025
   macro avg       0.53      0.51      0.50   1343025
weighted avg       0.86      0.91      0.88   1343025

