# **Classification Model BUilding**

---

## **Import necessary modules**

In [33]:
# basic library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from dotenv.main import load_dotenv

# sql connection library
import mysql.connector as mysql

# feature engineering library
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# model building library
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                            AdaBoostClassifier,
                            GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# model evaluation library
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# model tuning library
from sklearn.model_selection import GridSearchCV

# save model
import joblib

# load env
load_dotenv()

# ignore warning
warnings.filterwarnings('ignore')


## **Load the data from database**

In [34]:
def load_data():
    # connect to database
    conn = mysql.connect(user = os.getenv('MYSQL_USER'), password = os.getenv('MYSQL_PASSWORD'), host = os.getenv('MYSQL_HOST'), database = os.getenv('MYSQL_DATABASE'))
    # convert to dataframe
    df = pd.read_sql('SELECT * FROM admission', con = conn)
    # close connection
    conn.close()
    return df

df = load_data()

### Show the top 5 rows

In [35]:
df.head()

Unnamed: 0,application_id,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission
0,1,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,2,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,3,Female,True,3.3,Business,,710.0,5.0,Technology,Admit
3,4,Male,False,3.47,STEM,Black,690.0,6.0,Technology,
4,5,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,


---

## **Data Preprocessing**

### Fill the blank values with appropriate labels

In [36]:
df['race'] = df['race'].replace('', 'Unknown')
df['admission'] = df['admission'].replace('', 'Reject')

In [37]:
df.head()

Unnamed: 0,application_id,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission
0,1,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,2,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,Reject
2,3,Female,True,3.3,Business,Unknown,710.0,5.0,Technology,Admit
3,4,Male,False,3.47,STEM,Black,690.0,6.0,Technology,Reject
4,5,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,Reject


In [38]:
df.admission.value_counts()

admission
Reject      5194
Admit        900
Waitlist     100
Name: count, dtype: int64

### Prepare the features and target column

In [70]:
X = df.drop(columns=['admission', 'application_id'], axis = 1)
y = df['admission']

In [71]:
X.head()

Unnamed: 0,gender,international,gpa,major,race,gmat,work_exp,work_industry
0,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services
1,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management
2,Female,True,3.3,Business,Unknown,710.0,5.0,Technology
3,Male,False,3.47,STEM,Black,690.0,6.0,Technology
4,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting


In [72]:
y.value_counts()

admission
Reject      5194
Admit        900
Waitlist     100
Name: count, dtype: int64

### Unique categories in categorical features

In [44]:
categorical_features = X.select_dtypes(include=['object']).columns

for feature in categorical_features:
    print(f'{feature} : {df[feature].unique()}', end='\n\n')

gender : ['Female' 'Male']

international : ['False' 'True']

major : ['Business' 'Humanities' 'STEM']

race : ['Asian' 'Black' 'Unknown' 'Hispanic' 'White' 'Other']

work_industry : ['Financial Services' 'Investment Management' 'Technology' 'Consulting'
 'Nonprofit/Gov' 'PE/VC' 'Health Care' 'Investment Banking' 'Other'
 'Retail' 'Energy' 'CPG' 'Real Estate' 'Media/Entertainment']



> **There are fewer unique categories in each column except for work industry, so we'll try both one-hot and ordinal encoding**

### Handle numeric and categorical features

In [75]:
# create column transformer to preprocess data

# numerical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# categorical features
categorical_features = X.select_dtypes(include=['object']).columns

df['race'] = df['race'].replace('', 'Unknown')
df['admission'] = df['admission'].replace('', 'Reject')

# numerical pipeline
num_pipeline = Pipeline(
    steps =[

    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())

    ]
)

# categorical pipeline
cat_pipeline = Pipeline(
    steps= [

        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
        ('scaler', StandardScaler())

    ]
)

# full pipeline using column transformer
preprocessor = ColumnTransformer(
    [

    ('num', num_pipeline, numeric_features),
    ('cat', cat_pipeline, categorical_features)
    
    ]
)

In [76]:
# fit and transform preprocessor on input data
X = preprocessor.fit_transform(X)

# fit and transform label encoder on target data
le = LabelEncoder()
y = le.fit_transform(y)

In [77]:
y[:25]

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0])

In [78]:
# perform oversampling for training data
sm = SMOTE()

In [53]:
# Ensure categorical features are encoded before resampling
X_res, y_res = sm.fit_resample(X, y)

In [54]:
X_res.shape, y_res.shape

((15582, 25), (15582,))

In [55]:
# get back original data
X_res = pd.DataFrame(X_res, columns = numeric_features.tolist() + list(preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out()))

In [56]:
X_res.head()

Unnamed: 0,gpa,gmat,work_exp,x0_Male,x1_True,x2_Humanities,x2_STEM,x3_Black,x3_Hispanic,x3_Other,...,x4_Health Care,x4_Investment Banking,x4_Investment Management,x4_Media/Entertainment,x4_Nonprofit/Gov,x4_Other,x4_PE/VC,x4_Real Estate,x4_Retail,x4_Technology
0,0.325261,-0.630806,-1.95375,-1.323505,-0.650579,-0.817431,-0.658884,-0.416594,-0.326292,-0.199462,...,-0.23874,-0.321424,-0.165946,-0.098066,-0.342703,-0.270048,-0.414189,-0.135084,-0.073187,-0.361531
1,0.193273,0.586457,-0.016421,0.755569,-0.650579,1.223345,-0.658884,2.400418,-0.326292,-0.199462,...,-0.23874,-0.321424,6.026048,-0.098066,-0.342703,-0.270048,-0.414189,-0.135084,-0.073187,-0.361531
2,0.325261,1.195089,-0.016421,-1.323505,1.537091,-0.817431,-0.658884,-0.416594,-0.326292,-0.199462,...,-0.23874,-0.321424,-0.165946,-0.098066,-0.342703,-0.270048,-0.414189,-0.135084,-0.073187,2.766015
3,1.447159,0.789334,0.952244,0.755569,-0.650579,-0.817431,1.517718,2.400418,-0.326292,-0.199462,...,-0.23874,-0.321424,-0.165946,-0.098066,-0.342703,-0.270048,-0.414189,-0.135084,-0.073187,2.766015
4,0.655231,-1.239437,-0.016421,0.755569,-0.650579,-0.817431,1.517718,-0.416594,3.064738,-0.199462,...,-0.23874,-0.321424,-0.165946,-0.098066,-0.342703,-0.270048,-0.414189,-0.135084,-0.073187,-0.361531


In [57]:
y_res = pd.Series(y_res)

In [58]:
y_res.value_counts()

0    5194
1    5194
2    5194
Name: count, dtype: int64

In [61]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42, stratify=y_res)

In [62]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10907, 25), (4675, 25), (10907,), (4675,))

In [63]:
y_train.value_counts()

1    3636
0    3636
2    3635
Name: count, dtype: int64

---

## **Model Building**

In [65]:
# create a function to evaluate model
def evaluate_model(actual, pred):
    '''
    Function to evaluate model performance

    Args:
    actual : Actual/True label
    pred : Predicted label

    Returns:
    accuracy : accuracy score
    matrix : confusion matrix
    report : classification report
    '''
    accuracy = accuracy_score(actual, pred)
    matrix = confusion_matrix(actual, pred)
    report = classification_report(actual, pred)
    
    return accuracy, matrix, report

### **Before oversampling**

In [16]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'Gradient Boosting' : GradientBoostingClassifier(),
    'SVM' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(),
    'KNN' : KNeighborsClassifier(),
    'XGBoost' : XGBClassifier()
}

model_list = []
train_accuracy_list = []
test_accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values()) [i]
    model.fit(X_train, y_train)

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate train and test performance
    train_model_accuracy, train_model_matrix, train_model_report = evaluate_model(y_train, y_train_pred)
    test_model_accuracy, test_model_matrix, test_model_report = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print(f"- Accuracy: \n{train_model_accuracy}", end='\n\n')
    print(f"- Confusion matrix: \n{train_model_matrix}", end='\n\n')
    print(f"- Classification report: \n{train_model_report}", end='\n\n')
    train_accuracy_list.append(train_model_accuracy)

    print('----------------------------------')
    
    print('Model performance for Test set')
    print(f"- Accuracy: \n{test_model_accuracy}", end='\n\n')
    print(f"- Confusion matrix: \n {test_model_matrix}", end='\n\n')
    print(f"- Classification report: \n {test_model_report}", end='\n\n')
    test_accuracy_list.append(test_model_accuracy)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 
0.6192423003543199

- Confusion matrix: 
[[2010  555 1104]
 [ 609 2518  542]
 [ 875  506 2288]]

- Classification report: 
              precision    recall  f1-score   support

           0       0.58      0.55      0.56      3669
           1       0.70      0.69      0.69      3669
           2       0.58      0.62      0.60      3669

    accuracy                           0.62     11007
   macro avg       0.62      0.62      0.62     11007
weighted avg       0.62      0.62      0.62     11007


----------------------------------
Model performance for Test set
- Accuracy: 
0.6503496503496503

- Confusion matrix: 
 [[ 158   55   86]
 [ 241 1038  246]
 [   9   13   13]]

- Classification report: 
               precision    recall  f1-score   support

           0       0.39      0.53      0.45       299
           1       0.94      0.68      0.79      1525
           2       0.04      0.37      0.07        35

    a

---

### **After Oversamling**

In [68]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'Gradient Boosting' : GradientBoostingClassifier(),
    'SVM' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(),
    'KNN' : KNeighborsClassifier(),
    'XGBoost' : XGBClassifier()
}

model_list = []
train_accuracy_list = []
test_accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values()) [i]
    model.fit(X_train, y_train)

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate train and test performance
    train_model_accuracy, train_model_matrix, train_model_report = evaluate_model(y_train, y_train_pred)
    test_model_accuracy, test_model_matrix, test_model_report = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print(f"- Accuracy: \n{train_model_accuracy}", end='\n\n')
    print(f"- Confusion matrix: \n{train_model_matrix}", end='\n\n')
    print(f"- Classification report: \n{train_model_report}", end='\n\n')
    train_accuracy_list.append(train_model_accuracy)

    print('----------------------------------')
    
    print('Model performance for Test set')
    print(f"- Accuracy: \n{test_model_accuracy}", end='\n\n')
    print(f"- Confusion matrix: \n {test_model_matrix}", end='\n\n')
    print(f"- Classification report: \n {test_model_report}", end='\n\n')
    test_accuracy_list.append(test_model_accuracy)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 
0.5968643990098103

- Confusion matrix: 
[[2090  542 1004]
 [ 606 2421  609]
 [ 977  659 1999]]

- Classification report: 
              precision    recall  f1-score   support

           0       0.57      0.57      0.57      3636
           1       0.67      0.67      0.67      3636
           2       0.55      0.55      0.55      3635

    accuracy                           0.60     10907
   macro avg       0.60      0.60      0.60     10907
weighted avg       0.60      0.60      0.60     10907


----------------------------------
Model performance for Test set
- Accuracy: 
0.6027807486631016

- Confusion matrix: 
 [[ 893  223  442]
 [ 279 1038  241]
 [ 407  265  887]]

- Classification report: 
               precision    recall  f1-score   support

           0       0.57      0.57      0.57      1558
           1       0.68      0.67      0.67      1558
           2       0.56      0.57      0.57      1559

    a

---

## **Models Results**

In [17]:
results = pd.DataFrame(list(zip(model_list, test_accuracy_list, train_accuracy_list)), columns=['Model', 'Test Accuracy', 'Train Accuracy']).sort_values(by=['Test Accuracy', 'Train Accuracy'], ascending=False)

results

Unnamed: 0,Model,Test Accuracy,Train Accuracy
7,XGBoost,0.829478,0.997093
1,Random Forest,0.819258,1.0
3,Gradient Boosting,0.806885,0.90288
5,Decision Tree,0.797741,1.0
2,AdaBoost,0.77192,0.790406
4,SVM,0.714363,0.904606
6,KNN,0.669177,0.92423
0,Logistic Regression,0.65035,0.619242


---

> **Most of the models are overfitting and LR is underfitting, so we'll choose AdaBoost for this problem statement**

In [18]:
# prediction for actual data
ada_boost = AdaBoostClassifier()
ada_boost.fit(X_train, y_train)

y_pred = ada_boost.predict(X_test)

score = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy of AdaBoost is {score}', end='\n\n')
print(f'Confusion Matrix: \n {matrix}')

Accuracy of AdaBoost is 0.8187197417966648

Confusion Matrix: 
 [[  66  232    1]
 [  65 1456    4]
 [   6   29    0]]


In [19]:
# prediction for oversampled data
ada_boost = AdaBoostClassifier()
ada_boost.fit(X_train_oversampled, y_train_oversampled)

y_pred = ada_boost.predict(X_test)

score = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy of AdaBoost is {score}', end='\n\n')
print(f'Confusion Matrix: \n {matrix}')

Accuracy of AdaBoost is 0.7719203873050027

Confusion Matrix: 
 [[ 211   77   11]
 [ 271 1218   36]
 [  12   17    6]]


In [90]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [91]:
X_train, y_train = sm.fit_resample(X_train, y_train)

In [92]:
X_train = pd.DataFrame(X_train, columns = numeric_features.tolist() + list(preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out()))

y_train = pd.Series(y_train)

In [93]:
y_train.value_counts()

0    3669
1    3669
2    3669
Name: count, dtype: int64

In [94]:
y_test = pd.Series(y_test)

In [95]:
y_test.value_counts()

1    1525
0     299
2      35
Name: count, dtype: int64

In [96]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'Gradient Boosting' : GradientBoostingClassifier(),
    'SVM' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(),
    'KNN' : KNeighborsClassifier(),
    'XGBoost' : XGBClassifier()
}

model_list = []
train_accuracy_list = []
test_accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values()) [i]
    model.fit(X_train, y_train)

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate train and test performance
    train_model_accuracy, train_model_matrix, train_model_report = evaluate_model(y_train, y_train_pred)
    test_model_accuracy, test_model_matrix, test_model_report = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print(f"- Accuracy: \n{train_model_accuracy}", end='\n\n')
    print(f"- Confusion matrix: \n{train_model_matrix}", end='\n\n')
    print(f"- Classification report: \n{train_model_report}", end='\n\n')
    train_accuracy_list.append(train_model_accuracy)

    print('----------------------------------')
    
    print('Model performance for Test set')
    print(f"- Accuracy: \n{test_model_accuracy}", end='\n\n')
    print(f"- Confusion matrix: \n {test_model_matrix}", end='\n\n')
    print(f"- Classification report: \n {test_model_report}", end='\n\n')
    test_accuracy_list.append(test_model_accuracy)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 
0.6133369673843917

- Confusion matrix: 
[[1997  588 1084]
 [ 611 2427  631]
 [ 902  440 2327]]

- Classification report: 
              precision    recall  f1-score   support

           0       0.57      0.54      0.56      3669
           1       0.70      0.66      0.68      3669
           2       0.58      0.63      0.60      3669

    accuracy                           0.61     11007
   macro avg       0.62      0.61      0.61     11007
weighted avg       0.62      0.61      0.61     11007


----------------------------------
Model performance for Test set
- Accuracy: 
0.64228079612695

- Confusion matrix: 
 [[ 155   56   88]
 [ 236 1029  260]
 [  10   15   10]]

- Classification report: 
               precision    recall  f1-score   support

           0       0.39      0.52      0.44       299
           1       0.94      0.67      0.78      1525
           2       0.03      0.29      0.05        35

    acc