In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


In [6]:
data = pd.read_csv('/content/drive/MyDrive/ST4052/project1_Advanced/pls_df.csv')

In [5]:
data.shape


(58592, 46)

In [7]:
#feature set
X = data.drop('is_claim',axis = 1)

In [7]:
X.shape

(58592, 45)

In [8]:
#response
y = data['is_claim']

In [8]:
y.shape

(58592,)

In [9]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fitting Models

#### 1. Origrinal data set

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score


# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Fit each classifier and compute evaluation metrics
for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_precision = precision_score(y_train, y_train_pred)
    test_precision = precision_score(y_test, y_test_pred)

    train_recall = recall_score(y_train, y_train_pred)
    test_recall = recall_score(y_test, y_test_pred)

    train_f1 = f1_score(y_train, y_train_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    train_f2 = fbeta_score(y_train, y_train_pred, beta=2)
    test_f2 = fbeta_score(y_test, y_test_pred, beta=2)

    results[clf_name] = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train Precision': train_precision,
        'Test Precision': test_precision,
        'Train Recall': train_recall,
        'Test Recall': test_recall,
        'Train F1-Score': train_f1,
        'Test F1-Score': test_f1,
        'Train F2-Score': train_f2,
        'Test F2-Score': test_f2
    }

# Create a table for evaluation metrics
metrics_table = pd.DataFrame(results).T

print("Evaluation Metrics (Training and Test Data):")
print(metrics_table)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Metrics (Training and Test Data):
                     Train Accuracy  Test Accuracy  Train Precision  \
Logistic Regression        0.936168       0.935489         0.000000   
Naive Bayes                0.896358       0.899650         0.073193   
K-Nearest Neighbors        0.936979       0.932503         0.620253   
Random Forest              0.999893       0.931137         1.000000   
XGBoost                    0.937448       0.935319         1.000000   

                     Test Precision  Train Recall  Test Recall  \
Logistic Regression        0.000000      0.000000     0.000000   
Naive Bayes                0.073171      0.053476     0.047619   
K-Nearest Neighbors        0.111111      0.032754     0.006614   
Random Forest              0.067797      0.998329     0.005291   
XGBoost                    0.000000      0.020053     0.000000   

                     Train F1-Score  Test F1-Score  Train F2-Score  \
Logistic Regression        0.000000       0.000000        0.0

#### 2. Balancing with SMOTE

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Fit each classifier and compute evaluation metrics
for clf_name, clf in classifiers.items():
    clf.fit(X_train_resampled, y_train_resampled)
    y_train_pred = clf.predict(X_train_resampled)
    y_test_pred = clf.predict(X_test)
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_precision = precision_score(y_train_resampled, y_train_pred)
    test_precision = precision_score(y_test, y_test_pred)

    train_recall = recall_score(y_train_resampled, y_train_pred)
    test_recall = recall_score(y_test, y_test_pred)

    train_f1 = f1_score(y_train_resampled, y_train_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    train_f2 = fbeta_score(y_train_resampled, y_train_pred, beta=2)
    test_f2 = fbeta_score(y_test, y_test_pred, beta=2)

    results[clf_name] = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train Precision': train_precision,
        'Test Precision': test_precision,
        'Train Recall': train_recall,
        'Test Recall': test_recall,
        'Train F1-Score': train_f1,
        'Test F1-Score': test_f1,
        'Train F2-Score': train_f2,
        'Test F2-Score': test_f2
    }

# Create a table for evaluation metrics
metrics_table = pd.DataFrame(results).T

print("Evaluation Metrics (Training and Test Data):")
print(metrics_table)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation Metrics (Training and Test Data):
                     Train Accuracy  Test Accuracy  Train Precision  \
Logistic Regression        0.585800       0.572148         0.582392   
Naive Bayes                0.536269       0.401485         0.527603   
K-Nearest Neighbors        0.894294       0.709788         0.838445   
Random Forest              0.999989       0.898114         0.999977   
XGBoost                    0.962706       0.932844         0.998160   

                     Test Precision  Train Recall  Test Recall  \
Logistic Regression        0.082057      0.606481     0.552910   
Naive Bayes                0.069601      0.693239     0.669312   
K-Nearest Neighbors        0.080824      0.976801     0.337302   
Random Forest              0.103261      1.000000     0.075397   
XGBoost                    0.121951      0.927121     0.006614   

                     Train F1-Score  Test F1-Score  Train F2-Score  \
Logistic Regression        0.594193       0.142906        0.6

#### 2. Balancing with OverSampling

In [16]:
from imblearn.over_sampling import RandomOverSampler

# Apply Random Oversampling to balance the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Fit each classifier and compute evaluation metrics
for clf_name, clf in classifiers.items():
    clf.fit(X_train_resampled, y_train_resampled)
    y_train_pred = clf.predict(X_train_resampled)
    y_test_pred = clf.predict(X_test)
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_precision = precision_score(y_train_resampled, y_train_pred)
    test_precision = precision_score(y_test, y_test_pred)

    train_recall = recall_score(y_train_resampled, y_train_pred)
    test_recall = recall_score(y_test, y_test_pred)

    train_f1 = f1_score(y_train_resampled, y_train_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    train_f2 = fbeta_score(y_train_resampled, y_train_pred, beta=2)
    test_f2 = fbeta_score(y_test, y_test_pred, beta=2)

    results[clf_name] = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train Precision': train_precision,
        'Test Precision': test_precision,
        'Train Recall': train_recall,
        'Test Recall': test_recall,
        'Train F1-Score': train_f1,
        'Test F1-Score': test_f1,
        'Train F2-Score': train_f2,
        'Test F2-Score': test_f2
    }

# Create a table for evaluation metrics
metrics_table = pd.DataFrame(results).T

print("Evaluation Metrics (Training and Test Data):")
print(metrics_table)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation Metrics (Training and Test Data):
                     Train Accuracy  Test Accuracy  Train Precision  \
Logistic Regression        0.579100       0.566772         0.575968   
Naive Bayes                0.525945       0.364622         0.518909   
K-Nearest Neighbors        0.941056       0.785391         0.894545   
Random Forest              0.999989       0.920300         0.999977   
XGBoost                    0.810613       0.696134         0.770823   

                     Test Precision  Train Recall  Test Recall  \
Logistic Regression        0.082512      0.599713     0.564815   
Naive Bayes                0.068943      0.711994     0.707672   
K-Nearest Neighbors        0.080591      1.000000     0.223545   
Random Forest              0.087963      1.000000     0.025132   
XGBoost                    0.100086      0.884073     0.464286   

                     Train F1-Score  Test F1-Score  Train F2-Score  \
Logistic Regression        0.587601       0.143989        0.5

#### 3.Balancing With UnderSampling

In [20]:
from imblearn.under_sampling import RandomUnderSampler

# Apply Random Undersampling to the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Initialize a dictionary to store results
results = {}

# Fit each classifier and compute evaluation metrics
for clf_name, clf in classifiers.items():
    clf.fit(X_train_resampled, y_train_resampled)
    y_train_pred = clf.predict(X_train_resampled)
    y_test_pred = clf.predict(X_test)
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_precision = precision_score(y_train_resampled, y_train_pred)
    test_precision = precision_score(y_test, y_test_pred)

    train_recall = recall_score(y_train_resampled, y_train_pred)
    test_recall = recall_score(y_test, y_test_pred)

    train_f1 = f1_score(y_train_resampled, y_train_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    train_f2 = fbeta_score(y_train_resampled, y_train_pred, beta=2)
    test_f2 = fbeta_score(y_test, y_test_pred, beta=2)

    results[clf_name] = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train Precision': train_precision,
        'Test Precision': test_precision,
        'Train Recall': train_recall,
        'Test Recall': test_recall,
        'Train F1-Score': train_f1,
        'Test F1-Score': test_f1,
        'Train F2-Score': train_f2,
        'Test F2-Score': test_f2
    }

# Create a table for evaluation metrics
metrics_table = pd.DataFrame(results).T

print("Evaluation Metrics (Training and Test Data with Random Undersampling):")
print(metrics_table)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation Metrics (Training and Test Data with Random Undersampling):
                     Train Accuracy  Test Accuracy  Train Precision  \
Logistic Regression        0.576872       0.556447         0.573576   
Naive Bayes                0.535261       0.420599         0.528599   
K-Nearest Neighbors        0.709726       0.531018         0.704198   
Random Forest              1.000000       0.578462         1.000000   
XGBoost                    0.858122       0.556873         0.842004   

                     Test Precision  Train Recall  Test Recall  \
Logistic Regression        0.081101      0.599265     0.568783   
Naive Bayes                0.070839      0.651738     0.658730   
K-Nearest Neighbors        0.074201      0.723262     0.546296   
Random Forest              0.086070      1.000000     0.575397   
XGBoost                    0.083849      0.881684     0.591270   

                     Train F1-Score  Test F1-Score  Train F2-Score  \
Logistic Regression        0.586139

*  Now we selected Random forest with SMOTE balancing has some higher performance

### Now we do some cost sensitive learning to improve F1 score

In [23]:
# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create a Random Forest classifier with class weights
class_weights = {0: 1, 1: 1000}  # You can adjust the weight for each class as needed
rf_classifier = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Fit the model to your resampled training data
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on both train and test data
y_train_pred = rf_classifier.predict(X_train_resampled)
y_test_pred = rf_classifier.predict(X_test)

# Calculate accuracy, F1, and F2 scores for both train and test sets
accuracy_train = accuracy_score(y_train_resampled, y_train_pred)
f1_train = f1_score(y_train_resampled, y_train_pred)
f2_train = fbeta_score(y_train_resampled, y_train_pred, beta=2)

accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
f2_test = fbeta_score(y_test, y_test_pred, beta=2)

# Create a table for the results
results_table = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'F2-Score'],
    'Train': [accuracy_train, f1_train, f2_train],
    'Test': [accuracy_test, f1_test, f2_test]
})

print("Results:")
print(results_table)


Results:
     Metric  Train      Test
0  Accuracy    1.0  0.909293
1  F1-Score    1.0  0.068361
2  F2-Score    1.0  0.057202


### Fit SVM and for SMOTE Dataset and Evaluate

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create an SVM classifier
svm_classifier = SVC()

# Fit the SVM model to the resampled training data
svm_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on both train and test data
y_train_pred = svm_classifier.predict(X_train_resampled)
y_test_pred = svm_classifier.predict(X_test)

# Calculate accuracy, precision, recall, F1, and F2 scores for training data
accuracy_train = accuracy_score(y_train_resampled, y_train_pred)
precision_train = precision_score(y_train_resampled, y_train_pred)
recall_train = recall_score(y_train_resampled, y_train_pred)
f1_train = f1_score(y_train_resampled, y_train_pred)
f2_train = fbeta_score(y_train_resampled, y_train_pred, beta=2)

# Calculate accuracy, precision, recall, F1, and F2 scores for test data
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
f2_test = fbeta_score(y_test, y_test_pred, beta=2)

# Create a table for the results
results_table = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'F2-Score'],
    'Training Data': [accuracy_train, precision_train, recall_train, f1_train, f2_train],
    'Test Data': [accuracy_test, precision_test, recall_test, f1_test, f2_test]
})

print("Results:")
print(results_table)


### Fit RF model with hyper parameter tuning

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define hyperparameters for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    #'class_weight': [{0: 1, 1: 10}, {0: 1, 1: 15}]  # You can adjust class weights as needed
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best estimator from the hyperparameter tuning
best_rf_classifier = grid_search.best_estimator_

# Make predictions on both train and test data
y_train_pred = best_rf_classifier.predict(X_train_resampled)
y_test_pred = best_rf_classifier.predict(X_test)

# Calculate accuracy, precision, recall, F1, and F2 scores for training data
accuracy_train = accuracy_score(y_train_resampled, y_train_pred)
precision_train = precision_score(y_train_resampled, y_train_pred)
recall_train = recall_score(y_train_resampled, y_train_pred)
f1_train = f1_score(y_train_resampled, y_train_pred)
f2_train = fbeta_score(y_train_resampled, y_train_pred, beta=2)

# Calculate accuracy, precision, recall, F1, and F2 scores for test data
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
f2_test = fbeta_score(y_test, y_test_pred, beta=2)

# Create a table for the results
results_table = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'F2-Score'],
    'Training Data': [accuracy_train, precision_train, recall_train, f1_train, f2_train],
    'Test Data': [accuracy_test, precision_test, recall_test, f1_test, f2_test]
})

print("Results:")
print(results_table)


KeyboardInterrupt: ignored