In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.feature_selection import f_regression
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings("ignore")
resampling_techniques = [RandomOverSampler, SMOTE, SMOTETomek, RandomUnderSampler]
models = [RandomForestClassifier(class_weight='balanced'), xgb.XGBClassifier(), GradientBoostingClassifier(), DecisionTreeClassifier()]
df = pd.read_csv('original_data.csv')
df['CUSTOMER_AGE'] = df['CUSTOMER_AGE'].apply(lambda age: age * 12)
x = df.drop(['TARGET'], axis=1)
y = df['TARGET']
progress_bar = tqdm(total=224)
# Initialize results for storing the models, hyperparameters, and evaluation scores
results = []
for p in range(3, 17):
    select = SelectKBest(score_func=f_regression, k=p)
    z = select.fit_transform(x, y) 
    X_train, X_test, y_train, y_test = train_test_split(z, y, test_size=0.30, random_state=42, stratify=y)

    for resampling in resampling_techniques:
        resampler = resampling()
        X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

        for model in models:
            best_model = model
            best_model.fit(X_train_resampled, y_train_resampled)
            y_pred = best_model.predict(X_test)
            f1 = f1_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred).mean()
            precision = precision_score(y_test, y_pred, average='weighted')
            accuracy = accuracy_score(y_test, y_pred)
            progress_bar.update(1)

            # Calculate accuracy for positive and negative classes
            accuracy_positive = accuracy_score(y_test[y_test == 1],y_pred[y_test == 1])
            accuracy_negative = accuracy_score(y_test[y_test == 0],y_pred[y_test == 0])
            avg_accPN = (accuracy_positive+accuracy_negative) / 2
            confusion = confusion_matrix(y_test,y_pred)
            report = classification_report(y_test,y_pred)


            result = {
                'Undersampling Technique': resampling.__name__,
                'Model': type(best_model).__name__,
                'F1 Score': f1,
                'Recall Score': recall,
                'Precision Score': precision,
                'Accuracy': accuracy,
                'Accuracy (Positive)': accuracy_positive,
                'Accuracy (Negative)': accuracy_negative,
                'Accuracy (Positive)+(Negative) AVG': avg_accPN,
                'SelectKBest N features': p,
                'confusion_matrix': confusion,
                'classification_report': report
            }
            results.append(result)

progress_bar.close()
# Sort the results based on the average accuracy
results = sorted(results, key=lambda x: x['Accuracy (Positive)+(Negative) AVG'],reverse=True)

# Print the sorted results
for result in results:
    print("Undersampling Technique:", result['Undersampling Technique'])
    print("Model:", result['Model'])
    print("F1 Score:", result['F1 Score'])
    print("Recall Score:", result['Recall Score'])
    print("Precision Score:", result['Precision Score'])
    print("Accuracy:", result['Accuracy'])
    print("Accuracy (Positive):", result['Accuracy (Positive)'])
    print("Accuracy (Negative):", result['Accuracy (Negative)'])
    print("Accuracy (Positive)+(Negative) AVG:", result['Accuracy (Positive)+(Negative) AVG'])
    print("SelectKBest N features:", result['SelectKBest N features'])
    print("confusion_matrix", result['confusion_matrix'])
    print("classification_report", result['classification_report'])
    print('__________________________________')


100%|██████████| 224/224 [14:56<00:00,  4.00s/it]

Undersampling Technique: RandomOverSampler
Model: GradientBoostingClassifier
F1 Score: 0.896270968193134
Recall Score: 0.8589743589743589
Precision Score: 0.9936783387366669
Accuracy: 0.8206316087279414
Accuracy (Positive): 0.8589743589743589
Accuracy (Negative): 0.8204189040219053
Accuracy (Positive)+(Negative) AVG: 0.8396966314981321
SelectKBest N features: 14
confusion_matrix [[23071  5050]
 [   22   134]]
classification_report               precision    recall  f1-score   support

           0       1.00      0.82      0.90     28121
           1       0.03      0.86      0.05       156

    accuracy                           0.82     28277
   macro avg       0.51      0.84      0.48     28277
weighted avg       0.99      0.82      0.90     28277

__________________________________
Undersampling Technique: SMOTE
Model: GradientBoostingClassifier
F1 Score: 0.8977469554257976
Recall Score: 0.8525641025641025
Precision Score: 0.9936392116196578
Accuracy: 0.823071754429395
Accuracy (Po




Learning Rate: 0.1
Number of Estimators: 50
Max Depth: 5
Min Samples Split: 2
Min Samples Leaf: 10
Subsample: 0.8
Max Features: sqrt

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import joblib
import pickle

df = pd.read_csv('original_data')

x = df.drop(['TARGET'], axis=1)
y = df['TARGET']

select = SelectKBest(score_func=f_regression, k=3)
x_selected = select.fit_transform(x, y)

selected_indices = select.get_support(indices=True)

selected_columns = x.columns[selected_indices]

X_train, X_test, y_train, y_test = train_test_split(x_selected, y, test_size=0.30, random_state=42, stratify=y)

resampler = SMOTETomek()
X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, max_depth=5,
                                   min_samples_split=2, min_samples_leaf=10,
                                   subsample=0.8, max_features='sqrt')
selected_model = model.fit(X_train_resampled, y_train_resampled)

# Convert selected_columns to a DataFrame 
selected_columns_df = pd.DataFrame(selected_columns, columns=['Selected Columns'])

# Save the selected_columns DataFrame to a CSV file
selected_columns_df.to_csv('selected_columns.csv',header=False, index=False)
# Save the trained model
joblib.dump(selected_model, 'trained_model.pkl')

y_pred = selected_model.predict(X_test)

confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)
print("\nClassification Report:")
print(report)
print("\nSelected Columns:")
print(selected_columns)
accuracy_positive = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])
accuracy_negative = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
avg_accPN = (accuracy_positive + accuracy_negative) / 2
print(avg_accPN)


Confusion Matrix:
[[23047  5074]
 [   24   132]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90     28121
           1       0.03      0.85      0.05       156

    accuracy                           0.82     28277
   macro avg       0.51      0.83      0.47     28277
weighted avg       0.99      0.82      0.90     28277


Selected Columns:
Index(['CUSTOMER_WITH_ORANGE_MONTHS', 'COMMITMENT_FG',
       'Disconnection_TOTAL_MIN_day'],
      dtype='object')
0.8328596477310961


In [8]:
import pickle
pickle.dump(model,open('churnmodel.pkl','wb'))

In [6]:
import tkinter as tk
from tkinter import filedialog
import pandas as pd
import joblib
import warnings

selected_columns = [] 
selected_model = None 
input_data = pd.DataFrame()  
predicted_data = pd.DataFrame()  
feature_names = []

warnings.filterwarnings("ignore")
# Function to load the model
def load_model():
    model_file = filedialog.askopenfilename(filetypes=[("Model File", "*.pkl")])
    if model_file:
        global selected_model, selected_columns
        selected_model = joblib.load(model_file)
        status_label.config(text="Model loaded successfully!")
        selected_columns = pd.read_csv('selected_columns.csv', header=None).values.flatten().tolist()

# Function to enter data manually
def enter_data_manually():
    global input_data, selected_columns
    input_data = pd.DataFrame(columns=selected_columns)

    def submit_data():
        values = []
        for entry in data_entries:
            values.append(entry.get())
        input_data.loc[len(input_data)] = values
        status_label.config(text="Data entered successfully!")
        clear_entries()

    def clear_entries():
        for entry in data_entries:
            entry.delete(0, tk.END)

    # Create a new window for entering data manually
    manual_entry_window = tk.Toplevel(window)
    manual_entry_window.title("Manual Data Entry")

    # Create data entry labels and entry fields for selected columns
    data_entries = []
    for i, column in enumerate(selected_columns):
        tk.Label(manual_entry_window, text=column).grid(row=i, column=0)
        entry = tk.Entry(manual_entry_window)
        entry.grid(row=i, column=1)
        data_entries.append(entry)

    # Create submit and clear buttons
    submit_button = tk.Button(manual_entry_window, text="Submit", command=submit_data)
    submit_button.grid(row=len(selected_columns), column=0, pady=10)
    clear_button = tk.Button(manual_entry_window, text="Clear", command=clear_entries)
    clear_button.grid(row=len(selected_columns), column=1, pady=10)


# Function to predict using the loaded model
def predict():
    global predicted_data
    try:
        input_data_selected = input_data[selected_columns]  # Select only the desired columns
        predicted_data = input_data.copy()
        predicted_data['Prediction'] = selected_model.predict(input_data_selected)
        status_label.config(text="Prediction complete!")
        prediction_label.config(text="Prediction: {}".format(predicted_data['Prediction'].values[0]))
    except Exception as e:
        status_label.config(text="Prediction error: {}".format(str(e)))
        prediction_label.config(text="")

# Function to export the predicted data
def export_data():
    export_file = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV File", "*.csv")])
    if export_file:
        predicted_data.to_csv(export_file, index=False)
        status_label.config(text="Data exported successfully!")

def import_data():
    data_file = filedialog.askopenfilename(filetypes=[("CSV Files", "*.csv")])
    if data_file:
        global input_data, selected_columns
        input_data = pd.read_csv(data_file)
        status_label.config(text="Data imported successfully!")

# Create the main window
window = tk.Tk()
window.title("Model Deployment")

# Create buttons and labels
load_model_button = tk.Button(window, text="Load Model", command=load_model)
load_model_button.pack()

import_data_button = tk.Button(window, text="Import Data", command=import_data)
import_data_button.pack()

enter_data_button = tk.Button(window, text="Enter Data Manually", command=enter_data_manually)
enter_data_button.pack()

predict_button = tk.Button(window, text="Predict", command=predict)
predict_button.pack()

export_data_button = tk.Button(window, text="Export Predicted Data", command=export_data)
export_data_button.pack()

status_label = tk.Label(window, text="")
status_label.pack()

prediction_label = tk.Label(window, text="")
prediction_label.pack()

# Run the main event loop
window.mainloop()


Hyperparameters

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingClassifier
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
import datetime
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
df = pd.read_csv('original_data')
from sklearn.feature_selection import SelectKBest
results = []
df = pd.read_csv('original_data')

x = df.drop(['TARGET'], axis=1)
y = df['TARGET']

best_model = None
best_avg_accPN = 0.0
progress_bar = tqdm(total=2916)
# Define the hyperparameters to search over
learning_rates = [0.1, 0.01, 0.001]
n_estimators = [50, 100, 200]
max_depths = [3, 5, 7]
min_samples_splits = [2, 5, 10]
min_samples_leafs = [1, 5, 10]
subsamples = [0.5, 0.8]
max_features = ['sqrt', 'log2', None]
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
for p in range(3, 5):
    select = SelectKBest(score_func=f_regression, k=p)
    z = select.fit_transform(x, y)
    X_train, X_test, y_train, y_test = train_test_split(z, y, test_size=0.30, random_state=42, stratify=y)

    resampler = SMOTETomek()
    X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

    for lr in learning_rates:
        for est in n_estimators:
            for depth in max_depths:
                for split in min_samples_splits:
                    for leaf in min_samples_leafs:
                        for subsample in subsamples:
                            for feature in max_features:
                                model = GradientBoostingClassifier(learning_rate=lr, n_estimators=est, max_depth=depth,
                                                                   min_samples_split=split, min_samples_leaf=leaf,
                                                                   subsample=subsample, max_features=feature)
                                selected_model = model.fit(X_train_resampled, y_train_resampled)
                                y_pred = selected_model.predict(X_test)
                                progress_bar.update(1)

                                accuracy_positive = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])
                                accuracy_negative = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
                                avg_accPN = (accuracy_positive + accuracy_negative) / 2

                                f1 = f1_score(y_test, y_pred, average='weighted')
                                recall = recall_score(y_test, y_pred, average='weighted')
                                precision = precision_score(y_test, y_pred, average='weighted')
                                accuracy = accuracy_score(y_test, y_pred)
                                progress_bar.update(1)
                                # Calculate accuracy for positive and negative classes
                                confusion = confusion_matrix(y_test,y_pred)
                                report = classification_report(y_test,y_pred)
                                result = {
                                    'F1 Score': f1,
                                    'Recall Score': recall,
                                    'Precision Score': precision,
                                    'Accuracy': accuracy,
                                    'Accuracy (Positive)': accuracy_positive,
                                    'Accuracy (Negative)': accuracy_negative,
                                    'Accuracy (Positive)+(Negative) AVG': avg_accPN,
                                    'SelectKBest N features': p,
                                    'learning_rate': lr,
                                    'n_estimators': est,
                                    'max_depth': depth,
                                    'min_samples_split': split,
                                    'min_samples_leaf': leaf,
                                    'subsample': subsample,
                                    'max_features': feature,
                                    'confusion_matrix': confusion,
                                    'classification_report': report}
          
                                results.append(result)
                                # Save the results incrementally in a text file
                                filename = f'results_{timestamp}.txt'
                                with open(filename, 'a') as file:
                                    file.write(str(result))
                                    file.write('\n')
# Sort the results based on the average accuracy
results = sorted(results, key=lambda x: x['Accuracy (Positive)+(Negative) AVG'], reverse=True)
progress_bar.close()




  7%|▋         | 205/2916 [03:11<49:43,  1.10s/it]

KeyboardInterrupt: 

In [35]:

for result in results:
    print("F1 Score:", result['F1 Score'])
    print("Recall Score:", result['Recall Score'])
    print("Precision Score:", result['Precision Score'])
    print("Accuracy:", result['Accuracy'])
    print("Accuracy (Positive):", result['Accuracy (Positive)'])
    print("Accuracy (Negative):", result['Accuracy (Negative)'])
    print("Accuracy (Positive)+(Negative) AVG:", result['Accuracy (Positive)+(Negative) AVG'])
    print("SelectKBest N features:", result['SelectKBest N features'])
    print("Learning Rate:", result['learning_rate'])
    print("Number of Estimators:", result['n_estimators'])
    print("Max Depth:", result['max_depth'])
    print("Min Samples Split:", result['min_samples_split'])
    print("Min Samples Leaf:", result['min_samples_leaf'])
    print("Subsample:", result['subsample'])
    print("Max Features:", result['max_features'])
    print(result['confusion_matrix'])
    print(result['classification_report'])
    print('__________________________________')

    

F1 Score: 0.8986406043360136
Recall Score: 0.8245570605085405
Precision Score: 0.9936859910892953
Accuracy: 0.8245570605085405
Accuracy (Positive): 0.8589743589743589
Accuracy (Negative): 0.824366132072117
Accuracy (Positive)+(Negative) AVG: 0.8416702455232379
SelectKBest N features: 3
Learning Rate: 0.1
Number of Estimators: 50
Max Depth: 5
Min Samples Split: 2
Min Samples Leaf: 10
Subsample: 0.8
Max Features: sqrt
[[23182  4939]
 [   22   134]]
              precision    recall  f1-score   support

           0       1.00      0.82      0.90     28121
           1       0.03      0.86      0.05       156

    accuracy                           0.82     28277
   macro avg       0.51      0.84      0.48     28277
weighted avg       0.99      0.82      0.90     28277

__________________________________
F1 Score: 0.8983849084785792
Recall Score: 0.8241326873430703
Precision Score: 0.9936851593320437
Accuracy: 0.8241326873430703
Accuracy (Positive): 0.8589743589743589
Accuracy (Negative):