In [1]:
import numpy as np
import pandas as pd

In [2]:
# Importing the datasets for analysis
X_df = pd.read_csv('./DS Test/Training/X_train.csv')
y_df = pd.read_csv('./DS Test/Training/y_train.csv')
X_test_df = pd.read_csv('./DS Test/Test/X_test.csv')
print(X_df.shape)
print(X_test_df.shape)

(33050, 44)
(11017, 44)


In [3]:
X_df.drop(['Unique_ID'], axis=1, inplace=True)
X_test_df.drop(['Unique_ID'], axis=1, inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Select the categorical columns
categorical_columns = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']

# Apply label encoding to categorical columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    X_df[column] = label_encoder.fit_transform(X_df[column])
    X_test_df[column] = label_encoder.fit_transform(X_test_df[column])

In [5]:
# Concatenate X_df and X_test_df for imputation of missing values and feature scaling
concatenated_df = pd.concat([X_df, X_test_df], ignore_index=True)

In [6]:
# Columns for mean imputation
columns_to_impute = ['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10',
                     'N10.1', 'N11', 'N12', 'N14', 'N15', 'N16', 'N17', 'N18', 'N19',
                     'N20', 'N21', 'N22', 'N23', 'N24', 'N25', 'N26', 'N27', 'N28',
                     'N29', 'N30', 'N31', 'N32', 'N33', 'N34', 'N35']

# Apply mean imputation to the selected columns
concatenated_df[columns_to_impute] = concatenated_df[columns_to_impute].fillna(concatenated_df[columns_to_impute].mean())

In [7]:
# Count the number of rows with missing values
num_missing_rows = concatenated_df.isnull().any(axis=1).sum()
print("Number of rows with missing values in training dataset:", num_missing_rows)

Number of rows with missing values in training dataset: 0


In [8]:
from sklearn.preprocessing import StandardScaler

numerical_columns = concatenated_df.loc[:, 'N1':'N35']

# Apply scaling to the numerical columns
scaler = StandardScaler()
scaled_numerical_columns = scaler.fit_transform(numerical_columns)

# Create a new dataframe with the scaled numerical columns
df_scaled = pd.DataFrame(scaled_numerical_columns, columns=numerical_columns.columns)

# Concatenate the scaled numerical columns with the remaining columns
df_scaled = pd.concat([concatenated_df.drop(numerical_columns.columns, axis=1), df_scaled], axis=1)

# Print the scaled dataframe
display(df_scaled)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,N1,N2,...,N26,N27,N28,N29,N30,N31,N32,N33,N34,N35
0,0,0,11,31,0,0,0,1,0.461130,0.000000,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,-0.646694,-0.594962,-1.039876
1,0,4,2,66,2,0,1,1,-1.102818,-0.751973,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,0.330181,0.203522,-0.355033
2,0,0,19,2,0,0,0,1,1.107643,0.000000,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,-0.972319,-0.933772,-0.218065
3,0,1,16,47,1,0,4,1,-0.248188,-1.018150,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,-0.531768,-0.524462,1.014652
4,0,1,13,1,1,1,6,1,0.922925,1.580243,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,-0.244452,-0.100547,0.192841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44062,0,0,3,0,2,1,1,1,-0.616393,-0.397070,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,2.628709,2.518864,0.055872
44063,0,0,0,40,0,0,0,1,-0.246957,0.000000,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,-1.010628,-1.009695,-1.313813
44064,0,19,1,24,1,1,4,1,1.448756,-0.270320,...,-5.108466e-16,0.000000,1.184997e-16,5.850742e-17,-3.249780e-16,0.000000,0.000000,-0.435996,-0.271080,0.329809
44065,0,1,19,1,2,0,4,1,-0.246957,0.452160,...,-1.304050e+00,-1.251381,-5.757522e-01,-2.140791e-01,-2.123409e+00,-1.609617,-1.132562,-0.972319,-0.970874,0.877684


In [9]:
X_df = df_scaled.iloc[:33050].copy()

X_test_df = df_scaled.iloc[33050:].copy()
# Reset the index of the dataframe
X_test_df = X_test_df.reset_index(drop=True)

In [10]:
class_counts = y_df['Dependent_Variable'].value_counts()
print(class_counts)

0    22844
1    10206
Name: Dependent_Variable, dtype: int64


In [11]:
from imblearn.over_sampling import SMOTE

# Separate the features and target variable
y = y_df['Dependent_Variable']

# Instantiate the SMOTE object
smote = SMOTE(random_state=42)

# Apply SMOTE to the dataset
X, y = smote.fit_resample(X_df, y)

# Check the class distribution after applying SMOTE
class_counts_resampled = pd.Series(y).value_counts()
print(class_counts_resampled)

1    22844
0    22844
Name: Dependent_Variable, dtype: int64


In [12]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Define the RandomForestClassifier
lr_classifier = LogisticRegression()

# Define the hyperparameters for grid search
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 10],
              'solver': ['liblinear', 'saga']}

# Define the scoring metrics
scoring = {
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1-Score': make_scorer(f1_score),
    'AUC-ROC': make_scorer(roc_auc_score),
    'Accuracy': make_scorer(accuracy_score)
}

# Perform grid search with k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(lr_classifier, param_grid, cv=kfold, scoring=scoring, refit='AUC-ROC')

# Fit the model using grid search
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding scores
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best AUC-ROC Score: ", grid_search.best_score_)
print("Best Precision Score: ", grid_search.cv_results_['mean_test_Precision'][grid_search.best_index_])
print("Best Recall Score: ", grid_search.cv_results_['mean_test_Recall'][grid_search.best_index_])
print("Best F1-Score: ", grid_search.cv_results_['mean_test_F1-Score'][grid_search.best_index_])
print("Best Accuracy: ", grid_search.cv_results_['mean_test_Accuracy'][grid_search.best_index_])



Best Hyperparameters:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best AUC-ROC Score:  0.6874720310930547
Best Precision Score:  0.6824201680721755
Best Recall Score:  0.7014026252100748
Best F1-Score:  0.6917776948995223
Best Accuracy:  0.6874890845327083


In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Define the KNeighborsClassifier
knn_classifier = KNeighborsClassifier()

# Define the hyperparameters for grid search
param_grid = {'n_neighbors': [3, 5, 7, 10],
              'weights': ['uniform', 'distance']}

# Define the scoring metrics
scoring = {
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1-Score': make_scorer(f1_score),
    'AUC-ROC': make_scorer(roc_auc_score),
    'Accuracy': make_scorer(accuracy_score)
}

# Perform grid search with k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(knn_classifier, param_grid, cv=kfold, scoring=scoring, refit='AUC-ROC')

# Fit the model using grid search
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding scores
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best AUC-ROC Score: ", grid_search.best_score_)
print("Best Precision Score: ", grid_search.cv_results_['mean_test_Precision'][grid_search.best_index_])
print("Best Recall Score: ", grid_search.cv_results_['mean_test_Recall'][grid_search.best_index_])
print("Best F1-Score: ", grid_search.cv_results_['mean_test_F1-Score'][grid_search.best_index_])
print("Best Accuracy: ", grid_search.cv_results_['mean_test_Accuracy'][grid_search.best_index_])

Best Hyperparameters:  {'n_neighbors': 3, 'weights': 'distance'}
Best AUC-ROC Score:  0.765140012207626
Best Precision Score:  0.7039762463866049
Best Recall Score:  0.9151163078303884
Best F1-Score:  0.7957602083996563
Best Accuracy:  0.7651245493293847


In [16]:
from sklearn.ensemble import RandomForestClassifier

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier()

# Define the hyperparameters for grid search
param_grid = {'n_estimators': [100, 200, 300], 
              'max_depth': [None, 5, 10], 
              'min_samples_split': [2, 5, 10]}

# Define the scoring metrics
scoring = {
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1-Score': make_scorer(f1_score),
    'AUC-ROC': make_scorer(roc_auc_score),
    'Accuracy': make_scorer(accuracy_score)
}

# Perform grid search with k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=kfold, scoring=scoring, refit='AUC-ROC')

# Fit the model using grid search
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding scores
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best AUC-ROC Score: ", grid_search.best_score_)
print("Best Precision Score: ", grid_search.cv_results_['mean_test_Precision'][grid_search.best_index_])
print("Best Recall Score: ", grid_search.cv_results_['mean_test_Recall'][grid_search.best_index_])
print("Best F1-Score: ", grid_search.cv_results_['mean_test_F1-Score'][grid_search.best_index_])
print("Best Accuracy: ", grid_search.cv_results_['mean_test_Accuracy'][grid_search.best_index_])

Best Hyperparameters:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best AUC-ROC Score:  0.8212658756544957
Best Precision Score:  0.8477399475989287
Best Recall Score:  0.783277128497115
Best F1-Score:  0.8142155812022421
Best Accuracy:  0.8212659017294028


In [25]:
proba_estimates = grid_search.predict_proba(X_test_df)[:, 1]

In [26]:
X_t = pd.read_csv('./DS Test/Test/X_test.csv')
y_test_df = pd.DataFrame()
y_test_df['Unique_ID'] = X_t['Unique_ID']
y_test_df['Class_1_Probability'] = proba_estimates
y_test_df

Unnamed: 0,Unique_ID,Class_1_Probability
0,Candidate_1602,0.533333
1,Candidate_29650,0.390000
2,Candidate_31061,0.420000
3,Candidate_5768,0.220000
4,Candidate_27059,0.463333
...,...,...
11012,Candidate_7453,0.523333
11013,Candidate_38211,0.320000
11014,Candidate_25020,0.456667
11015,Candidate_44501,0.170000


In [32]:
y_test_df[y_test_df['Class_1_Probability'] > 0.5].count()

Unique_ID              2507
Class_1_Probability    2507
dtype: int64

In [27]:
#y_test_df.to_csv('./DS Test/Test/final_predictions_1.csv', index=False)