In [1]:
# import libraries
import pandas as pd
import dask.dataframe as dd
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import neighbors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# load client and invoice datasets
client_train = pd.read_csv('data/client_train.csv')
client_test = pd.read_csv('data/client_test.csv')

invoice_train = pd.read_parquet('data/invoice_train_compressed.csv.parquet')
invoice_test = pd.read_parquet('data/invoice_test_compressed.csv.parquet')

In [3]:
# merge client and invoice training and testing dataset
merged_df = pd.merge(invoice_train, client_train, on='client_id', how='inner')

merged_test = pd.merge(invoice_test, client_test, on='client_id', how='inner')

In [4]:
features = [
    'tarif_type', 'counter_number', 'counter_code', 'reading_remarque',
    'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
    'consommation_level_3', 'consommation_level_4', 'old_index', 'new_index',
    'months_number', 'disrict', 'client_catg', 'region'
]

In [5]:
from sklearn.model_selection import train_test_split

# Partion the features from the class to predict
df_X = merged_df[features]
df_y = merged_df['target'].astype(int)  # Convert 'target' to integer (1/0)  # Convert 'Fraud'/'Not Fraud' to 1/0

# Split the training data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=1)

print ("Number of training instances: ", len(X_train), "\nNumber of test instances: ", len(X_test))

Number of training instances:  3133724 
Number of test instances:  1343025


In [6]:
X_train.head()

Unnamed: 0,tarif_type,counter_number,counter_code,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,disrict,client_catg,region
1599848,40,4289195,5,6,1,139,0,0,0,5359,5498,4,60,11,101
668526,11,31660,413,9,1,800,117,0,0,43970,44887,4,62,11,309
3012854,10,924153,202,6,1,200,0,0,0,9200,9400,4,62,11,371
3550972,11,437741,203,6,1,370,0,0,0,24400,24770,4,69,11,107
3875595,11,443898,203,6,1,337,0,0,0,9433,9770,4,62,11,301


In [7]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_merged_test_scaled = scaler.transform(merged_test[features])

In [8]:
from sklearn.decomposition import PCA

# Apply PCA with a fixed number of components (e.g., 5)
pca = PCA(n_components=5, random_state=1)  # Reduce to 5 dimensions
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Number of PCA components: {X_train_pca.shape[1]}")

Number of PCA components: 5


In [20]:
from sklearn import neighbors
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Define an expanded parameter grid
param_grid = {
    'n_neighbors': range(1, 10),  # Expanding range of neighbors
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'weights': ['uniform', 'distance']  # Adding weight options for neighbors
}

knn = neighbors.KNeighborsClassifier(algorithm='ball_tree')

# Use StratifiedKFold to ensure balanced class distribution in cross-validation
stratified_kfold = StratifiedKFold(n_splits=3)

# Update RandomizedSearchCV with expanded parameter grid and stratified cross-validation
random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_grid,
    n_iter=10,  # Increase the number of iterations for more coverage
    cv=stratified_kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Take a 1% sample of the training data for tuning
sample_size = int(0.01 * len(X_train_pca))
sample_indices = np.random.choice(len(X_train_pca), sample_size, replace=False)
X_sample = X_train_pca[sample_indices]
y_sample = y_train.iloc[sample_indices]

# Run RandomizedSearchCV on this sample
random_search.fit(X_sample, y_sample)

# Output the best parameters and the best score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation accuracy:", random_search.best_score_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.5s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.6s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.7s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   2.9s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   3.1s
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time=   3.2s
[CV] END ...metric=minkowski, n_neighbors=6, weights=uniform; total time=   3.7s
[CV] END ...metric=minkowski, n_neighbors=6, weights=uniform; total time=   3.8s
[CV] END ..metric=minkowski, n_neighbors=2, weights=distance; total time=   2.0s
[CV] END ..metric=minkowski, n_neighbors=2, weights=distance; total time=   2.1s
[CV] END ..metric=minkowski, n_neighbors=2, weights=distance; total time=   2.2s
[CV] END ...metric=minkowski, n_neighbors=6, wei

In [21]:
# train the KNN model with the optimized hyperparameters

best_params = random_search.best_params_

knn_model = neighbors.KNeighborsClassifier(
    n_neighbors=best_params['n_neighbors'],
    metric=best_params['metric'],
    algorithm='ball_tree' 
)

knn_model.fit(X_train_pca, y_train)

In [22]:
sample_size_train = int(0.01 * len(X_train_pca))
sample_indices_train = np.random.choice(len(X_train_pca), sample_size_train, replace=False)
X_train_sample = X_train_pca[sample_indices_train]
y_train_sample = y_train.iloc[sample_indices_train]

train_accuracy = knn_model.score(X_train_sample, y_train_sample)
print(f'kNN accuracy for 1% of training set: {train_accuracy:.4f}')

kNN accuracy for 1% of training set: 0.9296


In [23]:
# Evaluate on a 1% sample of the test data
sample_size_test = int(0.01 * len(X_test_pca))
sample_indices_test = np.random.choice(len(X_test_pca), sample_size_test, replace=False)
X_test_sample = X_test_pca[sample_indices_test]
y_test_sample = y_test.iloc[sample_indices_test]

# Calculate the accuracy on the 1% test sample
test_accuracy = knn_model.score(X_test_sample, y_test_sample)
print(f'kNN accuracy for 1% of test set: {test_accuracy:.4f}')

kNN accuracy for 1% of test set: 0.9240


In [None]:
# Select a 10% sample of the merged_test dataset
sample_size_test = int(0.1 * len(merged_test))
sample_indices_test = np.random.choice(len(merged_test), sample_size_test, replace=False)
X_merged_test_sample = merged_test.iloc[sample_indices_test]

# Select features from the sample and preprocess with scaling and PCA
X_merged_test_sample_features = X_merged_test_sample[features]
X_merged_test_sample_scaled = scaler.transform(X_merged_test_sample_features)  # Scale using the scaler fitted on the training data
X_merged_test_sample_pca = pca.transform(X_merged_test_sample_scaled)  # Apply PCA transformation

# Make predictions on the sampled processed data
merged_test_sample_predictions = knn_model.predict(X_merged_test_sample_pca)

# Add predictions to the sampled DataFrame
merged_test_sample = X_merged_test_sample.copy()
merged_test_sample['predictions'] = merged_test_sample_predictions

In [25]:
# Display the sampled DataFrame with predictions
print(merged_test_sample[['client_id', 'predictions']])

                 client_id  predictions
930032   test_Client_35062            0
95266    test_Client_12618            0
1234944  test_Client_43344            0
3879     test_Client_10102            0
360901   test_Client_19818            0
...                    ...          ...
515849   test_Client_23981            0
557900   test_Client_25107            0
1614835  test_Client_53445            0
1618225  test_Client_53541            0
1003735  test_Client_37105            0

[193973 rows x 2 columns]
