Import all needed libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_selection import SelectKBest, f_regression


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

from tqdm import tqdm
import warnings
import pickle
import tkinter as tk
from tkinter import filedialog
import datetime
from sklearn.metrics import make_scorer


Picking the best model, resampling, number of features

In [10]:
warnings.filterwarnings("ignore")

# Initialize resampling techniques, models
resampling_techniques = [RandomOverSampler, SMOTE, SMOTETomek, RandomUnderSampler]
models = [RandomForestClassifier(class_weight='balanced'), xgb.XGBClassifier(), GradientBoostingClassifier(), DecisionTreeClassifier()]

#Importing Data
df = pd.read_csv('original_data.csv')
x = df.drop(['TARGET'], axis=1)
y = df['TARGET']

# Initialize results for storing the models, hyperparameters, and evaluation scores
results_modelPicking = []


In [5]:
#for loop to try all combinations of the best techniques used for unbalanced data, 15-20 min approx run time

# Initialize based on number of iteration expected
progress_bar = tqdm(total=224)

#p is the number of features to pick
for p in range(3, 17):
    select = SelectKBest(score_func=f_regression, k=p)
    z = select.fit_transform(x, y) 
    X_train, X_test, y_train, y_test = train_test_split(z, y, test_size=0.30, random_state=42, stratify=y)
    #loop through resampling techniques
    for resampling in resampling_techniques:
        resampler = resampling()
        X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)
        #loop through picked models
        for model in models:
            best_model = model
            best_model.fit(X_train_resampled, y_train_resampled)
            y_pred = best_model.predict(X_test)
            
            #update the progress bar
            progress_bar.update(1)

            # Calculate metrics for our needs
            f1 = f1_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred).mean()
            precision = precision_score(y_test, y_pred, average='weighted')
            accuracy = accuracy_score(y_test, y_pred)


            # Calculate accuracy for positive and negative classes
            accuracy_positive = accuracy_score(y_test[y_test == 1],y_pred[y_test == 1])
            accuracy_negative = accuracy_score(y_test[y_test == 0],y_pred[y_test == 0])
            avg_accPN = (accuracy_positive+accuracy_negative) / 2
            confusion = confusion_matrix(y_test,y_pred)
            report = classification_report(y_test,y_pred)


            result = {
                'Undersampling Technique': resampling.__name__,
                'Model': type(best_model).__name__,
                'F1 Score': f1,
                'Recall Score': recall,
                'Precision Score': precision,
                'Accuracy': accuracy,
                'Accuracy (Positive)': accuracy_positive,
                'Accuracy (Negative)': accuracy_negative,
                'Accuracy (Positive)+(Negative) AVG': avg_accPN,
                'SelectKBest N features': p,
                'confusion_matrix': confusion,
                'classification_report': report
            }
            results_modelPicking.append(result)

progress_bar.close()

  0%|          | 0/224 [01:26<?, ?it/s]
100%|██████████| 224/224 [27:36<00:00,  7.39s/it]


In [7]:
# Sort the results based on the average Precision
results_modelPicking = sorted(results_modelPicking, key=lambda x: x['Accuracy (Positive)+(Negative) AVG'],reverse=True)

# Print the sorted results
for result in results_modelPicking:
    print("Undersampling Technique:", result['Undersampling Technique'])
    print("Model:", result['Model'])
    print("F1 Score:", result['F1 Score'])
    print("Recall Score:", result['Recall Score'])
    print("Precision Score:", result['Precision Score'])
    print("Accuracy:", result['Accuracy'])
    print("Accuracy (Positive):", result['Accuracy (Positive)'])
    print("Accuracy (Negative):", result['Accuracy (Negative)'])
    print("Accuracy (Positive)+(Negative) AVG:", result['Accuracy (Positive)+(Negative) AVG'])
    print("SelectKBest N features:", result['SelectKBest N features'])
    print("confusion_matrix", result['confusion_matrix'])
    print("classification_report", result['classification_report'])
    print('__________________________________')


Undersampling Technique: SMOTETomek
Model: GradientBoostingClassifier
F1 Score: 0.8931194190695171
Recall Score: 0.8525641025641025
Precision Score: 0.9936241005558673
Accuracy: 0.8154330374509319
Accuracy (Positive): 0.8525641025641025
Accuracy (Negative): 0.8152270545144198
Accuracy (Positive)+(Negative) AVG: 0.8338955785392612
SelectKBest N features: 4
confusion_matrix [[22925  5196]
 [   23   133]]
classification_report               precision    recall  f1-score   support

           0       1.00      0.82      0.90     28121
           1       0.02      0.85      0.05       156

    accuracy                           0.82     28277
   macro avg       0.51      0.83      0.47     28277
weighted avg       0.99      0.82      0.89     28277

__________________________________
Undersampling Technique: SMOTE
Model: GradientBoostingClassifier
F1 Score: 0.898965226972332
Recall Score: 0.8397435897435898
Precision Score: 0.9935557148088151
Accuracy: 0.8250875269653782
Accuracy (Positive)

We conclude that the best model is **GradientBoostingClassifier** 

The best regenrative solution and to avoid overfit we will use **GradientBoostingClassifier model**, and **SMOTETomek** resampler

**Hyperparameters selection**

we avoided gridsearch to take a deeper understanding with the best parameters we should use

In [11]:
#Initialization
results_ParamsPicking = []
best_model = None
best_avg_accPN = 0.0

#intialize the progress bar
progress_bar = tqdm(total=54)

# Define the hyperparameters to search over
learning_rates = [0.1]
n_estimators = [50]
max_depths = [5]
min_samples_splits = [2]
min_samples_leafs = [10]
subsamples = [0.5, 0.8]
max_features = ['sqrt', 'log2', None]

#timestamp for exporting text file
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

for p in range(6, 15):
    select = SelectKBest(score_func=f_regression, k=p)
    z = select.fit_transform(x, y)
    X_train, X_test, y_train, y_test = train_test_split(z, y, test_size=0.30, random_state=42, stratify=y)

    resampler = SMOTETomek()
    X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

    for lr in learning_rates:
        for est in n_estimators:
            for depth in max_depths:
                for split in min_samples_splits:
                    for leaf in min_samples_leafs:
                        for subsample in subsamples:
                            for feature in max_features:
                                model = GradientBoostingClassifier(learning_rate=lr, n_estimators=est, max_depth=depth,
                                                                   min_samples_split=split, min_samples_leaf=leaf,
                                                                   subsample=subsample, max_features=feature)
                                selected_model = model.fit(X_train_resampled, y_train_resampled)
                                y_pred = selected_model.predict(X_test)

                                #calculate results we needed for better understanding
                                accuracy_positive = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])
                                accuracy_negative = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
                                avg_accPN = (accuracy_positive + accuracy_negative) / 2
                                f1 = f1_score(y_test, y_pred, average='weighted')
                                recall = recall_score(y_test, y_pred, average='weighted')
                                precision = precision_score(y_test, y_pred, average='weighted')
                                accuracy = accuracy_score(y_test, y_pred)

                                #update the progress bar
                                progress_bar.update(1)

                                # Calculate accuracy for positive and negative classes
                                confusion = confusion_matrix(y_test,y_pred)
                                report = classification_report(y_test,y_pred)

                                result = {
                                    'F1 Score': f1,
                                    'Recall Score': recall,
                                    'Precision Score': precision,
                                    'Accuracy': accuracy,
                                    'Accuracy (Positive)': accuracy_positive,
                                    'Accuracy (Negative)': accuracy_negative,
                                    'Accuracy (Positive)+(Negative) AVG': avg_accPN,
                                    'SelectKBest N features': p,
                                    'learning_rate': lr,
                                    'n_estimators': est,
                                    'max_depth': depth,
                                    'min_samples_split': split,
                                    'min_samples_leaf': leaf,
                                    'subsample': subsample,
                                    'max_features': feature,
                                    'confusion_matrix': confusion,
                                    'classification_report': report}
                                results_ParamsPicking.append(result)

                                # Save the results incrementally in a text file
                                filename = f'resultsresults_ParamsPicking_{timestamp}.txt'
                                with open(filename, 'a') as file:
                                    file.write(str(result))
                                    file.write('\n')
                                    file.write('___________________________')

progress_bar.close()

  0%|          | 0/54 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [5]:
# Sort the results based on the average precision, or change to any metric we want

results_ParamsPicking = sorted(results_ParamsPicking, key=lambda x: x['Accuracy (Positive)+(Negative) AVG'], reverse=True)

for result in results_ParamsPicking:
    print("F1 Score:", result['F1 Score'])
    print("Recall Score:", result['Recall Score'])
    print("Precision Score:", result['Precision Score'])
    print("Accuracy:", result['Accuracy'])
    print("Accuracy (Positive):", result['Accuracy (Positive)'])
    print("Accuracy (Negative):", result['Accuracy (Negative)'])
    print("Accuracy (Positive)+(Negative) AVG:", result['Accuracy (Positive)+(Negative) AVG'])
    print("SelectKBest N features:", result['SelectKBest N features'])
    print("Learning Rate:", result['learning_rate'])
    print("Number of Estimators:", result['n_estimators'])
    print("Max Depth:", result['max_depth'])
    print("Min Samples Split:", result['min_samples_split'])
    print("Min Samples Leaf:", result['min_samples_leaf'])
    print("Subsample:", result['subsample'])
    print("Max Features:", result['max_features'])
    print(result['confusion_matrix'])
    print(result['classification_report'])
    print('__________________________________')

F1 Score: 0.8987736090941361
Recall Score: 0.8247692470912756
Precision Score: 0.9935550454264443
Accuracy: 0.8247692470912756
Accuracy (Positive): 0.8397435897435898
Accuracy (Negative): 0.8246861775897016
Accuracy (Positive)+(Negative) AVG: 0.8322148836666456
SelectKBest N features: 7
Learning Rate: 0.1
Number of Estimators: 50
Max Depth: 5
Min Samples Split: 2
Min Samples Leaf: 10
Subsample: 0.8
Max Features: sqrt
[[23191  4930]
 [   25   131]]
              precision    recall  f1-score   support

           0       1.00      0.82      0.90     28121
           1       0.03      0.84      0.05       156

    accuracy                           0.82     28277
   macro avg       0.51      0.83      0.48     28277
weighted avg       0.99      0.82      0.90     28277

__________________________________
F1 Score: 0.8965759823511434
Recall Score: 0.8211267107543233
Precision Score: 0.9935474248469999
Accuracy: 0.8211267107543233
Accuracy (Positive): 0.8397435897435898
Accuracy (Negative)

Based on the above researching this code will run the model with SelectKBest for 7 features, SMOTETomek resampling technique, and with these Parameters:  
learning_rate=0.1  
n_estimators=50  
max_depth=5  
min_samples_split=2  
min_samples_leaf=10  
subsample=0.8  
max_features='sqrt'

In [12]:
#import Data

df = pd.read_csv('original_data.csv')
x = df.drop(['TARGET'], axis=1)
y = df['TARGET']

#select Best features by SelectKBest
select = SelectKBest(score_func=f_regression, k=7)
x_selected = select.fit_transform(x, y)

#get the names of features for later deployment
selected_indices = select.get_support(indices=True)
selected_columns = x.columns[selected_indices]

#split data to test/train samples
X_train, X_test, y_train, y_test = train_test_split(x_selected, y, test_size=0.30, random_state=42, stratify=y)

#SMOTETomek resampling
resampler = SMOTETomek()
X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

#GradientBoostingClassifier model implemntation
model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, max_depth=5,
                                   min_samples_split=2, min_samples_leaf=10,
                                   subsample=0.8, max_features='sqrt')
selected_model = model.fit(X_train_resampled, y_train_resampled)

# Convert selected_columns to a DataFrame
selected_columns_df = pd.DataFrame(selected_columns, columns=['Selected Columns'])

# Save the selected_columns DataFrame to a CSV file later deployment
selected_columns_df.to_csv('selected_columns.csv',header=False, index=False)

# Save the trained model later deployment
pickle.dump(selected_model, open('trained_model.pkl','wb'))

y_pred = selected_model.predict(X_test)

#metrics calculations
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)
print("\nClassification Report:")
print(report)
print("\nSelected Columns:")
print(selected_columns)
accuracy_positive = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])
accuracy_negative = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
avg_accPN = (accuracy_positive + accuracy_negative) / 2
print("\naccuracy_positive:",accuracy_positive)
print("\naccuracy_negative:",accuracy_negative)
print("\naccuracy_Avg:",avg_accPN)


Confusion Matrix:
[[23190  4931]
 [   27   129]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90     28121
           1       0.03      0.83      0.05       156

    accuracy                           0.82     28277
   macro avg       0.51      0.83      0.48     28277
weighted avg       0.99      0.82      0.90     28277


Selected Columns:
Index(['CUSTOMER_WITH_ORANGE_MONTHS', 'COMMITMENT', 'COMMITMENT_FG',
       'MIGRATION_FLAG', 'Disconnection_TOTAL_MIN_day', 'NORTH_JORDAN',
       'West_Amman'],
      dtype='object')

accuracy_positive: 0.8269230769230769

accuracy_negative: 0.8246506169766367

accuracy_Avg: 0.8257868469498568


Small deployment for best model and parameters we picked

In [60]:
selected_columns = [] 
selected_model = None 
input_data = pd.DataFrame()  
predicted_data = pd.DataFrame()  
feature_names = []

warnings.filterwarnings("ignore")

#Function to load the model
def load_model():
    model_file = filedialog.askopenfilename(filetypes=[("Model File", "*.pkl")])
    if model_file:
        global selected_model, selected_columns
        selected_model = joblib.load(model_file)
        status_label.config(text="Model loaded successfully!")
        #import selected features from csv file
        selected_columns = pd.read_csv('selected_columns.csv', header=None).values.flatten().tolist()

#Function to enter data manually and its 2nd window
def enter_data_manually():
    global input_data, selected_columns
    input_data = pd.DataFrame(columns=selected_columns)

    def submit_data():
        values = []
        for entry in data_entries:
            values.append(entry.get())
        input_data.loc[len(input_data)] = values
        status_label.config(text="Data entered successfully!")
        clear_entries()
    #Define finction to clear data
    def clear_entries():
        for entry in data_entries:
            entry.delete(0, tk.END)

    # Create a new window for entering data manually
    manual_entry_window = tk.Toplevel(window)
    manual_entry_window.title("Manual Data Entry")

    # Create data entry labels and entry fields for selected columns
    data_entries = []
    for i, column in enumerate(selected_columns):
        tk.Label(manual_entry_window, text=column).grid(row=i, column=0)
        entry = tk.Entry(manual_entry_window)
        entry.grid(row=i, column=1)
        data_entries.append(entry)

    # Create submit and clear buttons
    submit_button = tk.Button(manual_entry_window, text="Submit", command=submit_data)
    submit_button.grid(row=len(selected_columns), column=0, pady=10)
    clear_button = tk.Button(manual_entry_window, text="Clear", command=clear_entries)
    clear_button.grid(row=len(selected_columns), column=1, pady=10)


# Function to predict using the loaded model
def predict():
    global predicted_data
    try:
        input_data_selected = input_data[selected_columns]  # Select only the desired columns
        predicted_data = input_data.copy()
        predicted_data['Prediction'] = selected_model.predict(input_data_selected)
        status_label.config(text="Prediction complete!")
        prediction_label.config(text="Prediction: {}".format(predicted_data['Prediction'].values[0]))
    except Exception as e:
        status_label.config(text="Prediction error: {}".format(str(e)))
        prediction_label.config(text="")

# Function to export the predicted data
def export_data():
    export_file = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV File", "*.csv")])
    if export_file:
        predicted_data.to_csv(export_file, index=False)
        status_label.config(text="Data exported successfully!")

def import_data():
    data_file = filedialog.askopenfilename(filetypes=[("CSV Files", "*.csv")])
    if data_file:
        global input_data, selected_columns
        input_data = pd.read_csv(data_file)
        status_label.config(text="Data imported successfully!")

# Create the main window
window = tk.Tk()
window.title("Model Deployment")

# Create buttons and labels
load_model_button = tk.Button(window, text="Load Model", command=load_model)
load_model_button.pack()

import_data_button = tk.Button(window, text="Import Data", command=import_data)
import_data_button.pack()

enter_data_button = tk.Button(window, text="Enter Data Manually", command=enter_data_manually)
enter_data_button.pack()

predict_button = tk.Button(window, text="Predict", command=predict)
predict_button.pack()

export_data_button = tk.Button(window, text="Export Predicted Data", command=export_data)
export_data_button.pack()

status_label = tk.Label(window, text="")
status_label.pack()

prediction_label = tk.Label(window, text="")
prediction_label.pack()

# Run the main event loop
window.mainloop()