In [174]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dython.nominal import associations
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

___
## Data Preparation
1. Data is filtered to only include records from after 2014 (`Year > 2014`)
2. Columns are split logically into categorical and numerical columns, for further processing and training
3. Since we are trying to predict `Deposit_Lost`, we remove the vote share and margin columns, because having these would make the problem redundant.

In [175]:
data_path = "../assets/data/cleaned_data.csv"

# Load the cleaned dataset, keep only rows strictly after 2014 (rows with
# Year == 2014 are excluded by the `>` comparison) and discard the
# "Unnamed: 0" column. The drop is chained instead of done in place so the
# filtered frame is never mutated after it has been created.
df = pd.read_csv(data_path)
df = df[df["Year"] > 2014].drop(columns="Unnamed: 0")

In [176]:
# Columns handled as categories. 'Deposit_Lost' is the prediction target; it
# is listed here so that it is integer-encoded along with the categorical
# features later on.
categorical_columns = [
    'Sex',
    'Party',
    'Candidate_Type',
    'Constituency_Type',
    'Party_Type_TCPD',
    'Same_Constituency',
    'Same_Party',
    'Turncoat',
    'Incumbent',
    'Recontest',
    'MyNeta_education',
    'TCPD_Prof_Main',
    'Deposit_Lost',
    'last_poll',
]

# Columns handled as continuous / count values.
numerical_columns = [
    'Electors',
    'N_Cand',
    'Turnout_Percentage',
    'Vote_Share_Percentage',
    'Margin_Percentage',
    'ENOP',
    'No_Terms',
    'Contested',
]

# Restrict the frame to the columns of interest, categorical ones first.
df = df[[*categorical_columns, *numerical_columns]]

In [177]:
# Vote share and margin are removed from the frame: with them present the
# classifier would be redundant. The numeric feature list is restated
# without those two columns.
df = df.drop(['Vote_Share_Percentage', 'Margin_Percentage'], axis=1)
numerical_columns = [
    'Electors',
    'N_Cand',
    'Turnout_Percentage',
    'ENOP',
    'No_Terms',
    'Contested',
]

___
## Data Preprocessing
1. The categorical columns are numerically encoded so that they can be passed into a classifier.
2. The numerical columns are normalized so that differences in absolute numbers won't affect the classification problem.
3. Training and testing data are prepared.

In [180]:
# Integer-encode every categorical column (the target 'Deposit_Lost' included).
# The same encoder object is refit for each column in turn, so after this
# line it only retains the classes of the last column it processed.
label_encoder = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(label_encoder.fit_transform)

X = df.drop(columns=['Deposit_Lost'])
y = df['Deposit_Lost']

# Split BEFORE scaling: fitting the scaler on the full frame would let the
# min/max of the test rows influence the training features (data leakage).
# Same test_size and random_state as before, so the row partition is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below do not write into
# views of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the min/max from the training rows only, then apply that same
# transformation to the held-out rows. Test values may therefore fall
# slightly outside [0, 1].
scaler = MinMaxScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# NOTE(review): this silences every warning for the rest of the session,
# including any emitted while fitting the models below.
warnings.filterwarnings("ignore")

___
## Model Fitting

1. Various classification algorithms were chosen from the scikit-learn library.
2. Models were trained using all these algorithms, and the necessary metrics are printed for evaluation.

In [181]:
# One row per model: name, hold-out metrics, CV accuracy, confusion matrix text.
results = []

# (display name, unfitted estimator) pairs. A fixed seed is passed to every
# estimator constructed with a `random_state` argument.
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42))
]

# The fold definition does not depend on the model, so it is built once
# instead of on every loop iteration.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models:

    # Hold-out evaluation: fit on the training split, score on the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)

    # 5-fold cross-validation on the training split only. cross_val_predict
    # returns one out-of-fold prediction per training row (not scores), which
    # is then compared with the true training labels.
    cv_predictions = cross_val_predict(model, X_train, y_train, cv=kfold)
    cv_accuracy = accuracy_score(y_train, cv_predictions)

    # Rows of the matrix are true labels, columns are predicted labels.
    (tn, fp), (fn, tp) = confusion_mat
    confusion_str = f"TN: {tn}  FP: {fp}\n" \
                    f"FN: {fn}  TP: {tp}"

    results.append([model_name, accuracy, precision, recall, f1, cv_accuracy, confusion_str])

headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "CV Accuracy", "Confusion Matrix"]
print(tabulate(results, headers=headers, tablefmt="grid"))


+---------------------+------------+-------------+----------+------------+---------------+--------------------+
| Model               |   Accuracy |   Precision |   Recall |   F1 Score |   CV Accuracy | Confusion Matrix   |
| Random Forest       |   0.967242 |    0.979972 | 0.982007 |   0.980989 |      0.96202  | TN: 205  FP: 29    |
|                     |            |             |          |            |               | FN: 26  TP: 1419   |
+---------------------+------------+-------------+----------+------------+---------------+--------------------+
| Gradient Boosting   |   0.96486  |    0.980583 | 0.978547 |   0.979564 |      0.96202  | TN: 206  FP: 28    |
|                     |            |             |          |            |               | FN: 31  TP: 1414   |
+---------------------+------------+-------------+----------+------------+---------------+--------------------+
| SVM                 |   0.860631 |    0.860631 | 1        |   0.925096 |      0.856122 | TN: 0  FP: 23

- Accuracy: The ratio of correctly predicted instances to the total instances in a dataset.
- Precision: The ratio of true positive predictions to the total positive predictions (true positive + false positive). Useful when false positives are crucial to avoid.
- Recall: The ratio of true positive predictions to the total actual positive instances (true positive + false negative). Useful when false negatives are crucial to avoid.
- F1 Score: The harmonic mean of precision and recall, providing a balanced measure of both metrics. It considers false positives and false negatives.

Based on the above results, we can see that the Random Forest and Gradient Boosting classifiers had the highest accuracy and CV accuracy, while also having confusion matrices that are heavy on the main diagonal. Hence, these can be used for an optimal classification.

### Inferences: 
- This classifier can be used to predict, without having the current voting data, whether a candidate will lose their deposit.
- The RandomForest classifier is the most accurate out of different popular models.

___
## Feature Importance

In [182]:

# One row per model: the same metrics as the comparison table plus the three
# most important features reported by the fitted model.
results = []

# Only the two tree ensembles are evaluated here; the loop reads
# `feature_importances_` from each fitted model.
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
]

# The fold definition does not depend on the model, so it is built once
# instead of on every loop iteration.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models:

    model.fit(X_train, y_train)

    # Indices of the three largest importances, most important first.
    importances = model.feature_importances_
    top_features_indices = importances.argsort()[::-1][:3]
    top_features = [X_train.columns[i] for i in top_features_indices]

    # Hold-out evaluation on the test split.
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)

    # 5-fold cross-validation on the training split only. cross_val_predict
    # returns one out-of-fold prediction per training row (not scores), which
    # is then compared with the true training labels.
    cv_predictions = cross_val_predict(model, X_train, y_train, cv=kfold)
    cv_accuracy = accuracy_score(y_train, cv_predictions)

    # Rows of the matrix are true labels, columns are predicted labels.
    (tn, fp), (fn, tp) = confusion_mat
    confusion_str = f"TN: {tn}  FP: {fp}\n" \
                    f"FN: {fn}  TP: {tp}"

    results.append([model_name, accuracy, precision, recall, f1, cv_accuracy, confusion_str, ', '.join(top_features)])

headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "CV Accuracy", "Confusion Matrix", "Top 3 Features"]
print(tabulate(results, headers=headers, tablefmt="grid"))


+-------------------+------------+-------------+----------+------------+---------------+--------------------+----------------------------------+
| Model             |   Accuracy |   Precision |   Recall |   F1 Score |   CV Accuracy | Confusion Matrix   | Top 3 Features                   |
| Random Forest     |   0.967242 |    0.979972 | 0.982007 |   0.980989 |       0.96202 | TN: 205  FP: 29    | No_Terms, Party_Type_TCPD, Party |
|                   |            |             |          |            |               | FN: 26  TP: 1419   |                                  |
+-------------------+------------+-------------+----------+------------+---------------+--------------------+----------------------------------+
| Gradient Boosting |   0.96486  |    0.980583 | 0.978547 |   0.979564 |       0.96202 | TN: 206  FP: 28    | No_Terms, Party_Type_TCPD, Party |
|                   |            |             |          |            |               | FN: 31  TP: 1414   |                     

___
Based on the above results, we can see that for both of the chosen classifiers, the top features for prediction are the same.
### Inferences: 
- No_Terms, which tells us the number of terms won by a candidate in the past, along with Party_Type_TCPD and Party, are the main indicators of whether a candidate will lose their deposit.

---
## Final Inferences: 

- This classifier can be used to predict, without having the current voting data, whether a candidate will lose their deposit.
- The RandomForest classifier is the most accurate out of different popular models.
- No_Terms, which tells us the number of terms won by a candidate in the past, along with Party_Type_TCPD and Party, are the main indicators of whether a candidate will lose their deposit.

