<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">


# DSI-SG-42 Project 3: Web APIs & NLP
### Reddit Scams: Are We Vulnerable?
---

## 2. Data Cleaning

### 2.1 Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### 2.2 Import scraped dataset

In [None]:
# Read the project dataset from the working directory and preview the first rows.
DATASET_PATH = 'final_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,age,height,weight,bmi,yrssmok,packday,yrsquit,sleep_hours,health_status,phys_health_not_good,...,colon_sigmoidoscopy,asthma_status,race_ethnicity,sex,education,income,smoker_status,e_cig_smoker,binge_drinker,heavy_drinker
0,80.0,1.698,82.372,21.166,0.0,0.0,0.0,8.0,2.0,1.0,...,1.0,3.0,1.0,2.0,4.0,7.0,4.0,1.0,1.0,1.0
1,80.0,1.6,68.04,26.58,0.0,0.0,0.0,6.0,1.0,1.0,...,1.0,3.0,1.0,2.0,2.0,5.0,4.0,1.0,1.0,1.0
2,56.0,1.57,63.5,25.76,0.0,0.0,0.0,5.0,2.0,2.0,...,2.0,3.0,1.0,2.0,4.0,10.0,4.0,1.0,1.0,1.0
3,73.0,1.65,63.5,23.32,56.0,0.1,8.2,7.0,1.0,1.0,...,1.0,1.0,1.0,2.0,2.0,7.0,2.0,1.0,1.0,1.0
4,43.0,1.57,53.98,21.9,0.0,0.0,0.0,9.0,4.0,2.0,...,999.0,3.0,1.0,2.0,3.0,5.0,4.0,1.0,1.0,1.0


### 2.3 First look at data

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60821 entries, 0 to 60820
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        60821 non-null  float64
 1   height                     60821 non-null  float64
 2   weight                     60821 non-null  float64
 3   bmi                        60821 non-null  float64
 4   yrssmok                    60821 non-null  float64
 5   packday                    60821 non-null  float64
 6   yrsquit                    60821 non-null  float64
 7   sleep_hours                60821 non-null  float64
 8   health_status              60821 non-null  float64
 9   phys_health_not_good       60821 non-null  float64
 10  mental_health_not_good     60821 non-null  float64
 11  last_routine_checkup       60821 non-null  float64
 12  visit_dentist_past_year    60821 non-null  float64
 13  health_insurance           60821 non-null  flo

### 2.7 Check for null values

In [None]:
# Count the missing entries in every column.
df.isna().sum()

age                          0
height                       0
weight                       0
bmi                          0
yrssmok                      0
packday                      0
yrsquit                      0
sleep_hours                  0
health_status                0
phys_health_not_good         0
mental_health_not_good       0
last_routine_checkup         0
visit_dentist_past_year      0
health_insurance             0
phy_exercise_past_30_days    0
stroke                       0
cancer                       0
kidney_disease               0
colon_sigmoidoscopy          0
asthma_status                0
race_ethnicity               0
sex                          0
education                    0
income                       0
smoker_status                1
e_cig_smoker                 1
binge_drinker                1
heavy_drinker                1
dtype: int64

### 2.8 Initial Modelling (Baseline Scores only)

In [None]:
# Report whether at least one cell anywhere in the dataset is missing.
has_missing = df.isna().any().any()
print(
    "There are missing values in the dataset."
    if has_missing
    else "There are no missing values in the dataset."
)

There are missing values in the dataset.


#### 2.8.1 Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

In [None]:
# Keep only rows with a known target: casting a NaN target to int raises, and
# the XGBoost section further down drops the same rows before modelling.
# `df` itself is left untouched so this cell can be re-run safely.
labelled = df.dropna(subset=['chd_mi'])

# Separate features and target
X = labelled.drop(columns='chd_mi')
y = labelled['chd_mi'].astype(int)

# Map the values of y from [1, 2] to [0, 1]
y_mapped = y.map({1: 0, 2: 1})

# Any code other than 1 or 2 turns into NaN in the mapping; fail here with a
# clear message instead of deep inside the stratified split or the model fit.
assert y_mapped.notna().all(), "chd_mi contains values other than 1 and 2."

# Split data into training and testing sets (stratified to keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_mapped, test_size=0.2, stratify=y_mapped, random_state=42
)

In [None]:
# Column groups consumed by the preprocessing steps:
# object/bool columns are treated as categorical, int64/float64 as numerical.
categorical_cols = list(X_train.select_dtypes(include=['object', 'bool']).columns)
numerical_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
from sklearn.impute import SimpleImputer

# Preprocessing for numerical columns: fill gaps with the column median, then
# standardise. The null check above reports missing values and
# LogisticRegression does not accept NaN inputs, so imputation is done inside
# the pipeline (i.e. it is fitted on the training data only).
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical columns: one-hot encode, ignoring categories
# that were not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Chain preprocessing and the classifier so they are always fitted together.
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', logreg_model),
])

# Fit on the training split
logreg_pipeline.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted pipeline on the training split
y_train_pred = logreg_pipeline.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Accuracy of the fitted pipeline on the held-out split
y_test_pred = logreg_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 5-fold cross-validated accuracy over the full feature/target set
cv_scores = cross_val_score(
    estimator=logreg_pipeline,
    X=X,
    y=y_mapped,
    scoring='accuracy',
    cv=5,
)
print(f"CV Mean Score: {cv_scores.mean() * 100:.2f}%")
print(f"CV Scores per Fold: {[f'{score * 100:.2f}%' for score in cv_scores]}")

#### 2.8.1 XGBoost

In [None]:
pip install xgboost



In [None]:
# Discard the rows whose target ('chd_mi') is missing; other columns are not checked.
df = df[df['chd_mi'].notna()]

In [None]:
# Target: 'chd_mi' cast to int, then recoded from {1, 2} to {0, 1}
y = df['chd_mi'].astype(int)
y_mapped = y.map({1: 0, 2: 1})

# Features: every remaining column
X = df.drop(columns='chd_mi')

# Sanity check before splitting: one label per feature row
assert len(X) == len(y_mapped), "The feature set X and target variable y_mapped have inconsistent lengths."

# Stratified 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_mapped,
    test_size=0.2,
    stratify=y_mapped,
    random_state=42,
)

In [None]:
# Baseline XGBoost classifier; categorical support is switched on explicitly.
xgb_settings = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    enable_categorical=True,
)
model = xgb.XGBClassifier(**xgb_settings)

# Fit on the training split (labels already mapped to 0/1)
model.fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 91.09%


In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the fitted XGBoost model on the training and test splits
train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_pred)
test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Copy of X in which every object-typed column is cast to 'category'
# (X itself is left unchanged).
X_for_cv = X.astype(
    {col: 'category' for col in X.columns if X[col].dtype == 'object'}
)

# 5-fold cross-validated accuracy against the mapped target
cv_scores = cross_val_score(
    estimator=model,
    X=X_for_cv,
    y=y_mapped,
    scoring='accuracy',
    cv=5,
)

# Mean score followed by the individual fold scores
print(f"CV Mean Score: {cv_scores.mean() * 100:.2f}%")
print(f"CV Scores per Fold: {[f'{score * 100:.2f}%' for score in cv_scores]}")

Train Accuracy: 91.61%
Test Accuracy: 91.09%
CV Mean Score: 91.08%
CV Scores per Fold: ['91.05%', '91.09%', '91.05%', '91.08%', '91.11%']


#### 2.8.2 Random Forest

In [None]:
pip install scikit-learn



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Separate features and target
X = df.drop('chd_mi', axis=1)
y = df['chd_mi']

# One-hot encode any non-numeric columns. A DataFrame has no `fit_transform`
# method, so no transformer call is made on X itself here.
X_encoded = pd.get_dummies(X)

# Stratified 80/20 train-test split on the encoded features
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Initialize the Random Forest classifier
model = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions for evaluation
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Train and Test accuracy scores
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Perform 5-fold cross-validation to evaluate the model.
# CV runs on the one-hot encoded frame, so X needs no extra conversion here;
# mutating X in place would only change what later cells see.
cv_scores = cross_val_score(model, X_encoded, y, cv=5)

# Calculate the mean and standard deviation of the cross-validation scores
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print(f"CV Mean Score: {cv_mean * 100:.2f}%")
print(f"CV Standard Deviation: {cv_std * 100:.2f}%")

Train Accuracy: 99.97%
Test Accuracy: 91.02%
CV Mean Score: 90.99%
CV Standard Deviation: 0.02%


#### 2.8.3 Decision Tree

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

# Numeric columns need no transformation for a decision tree. An empty
# Pipeline(steps=[]) is not a valid estimator and fails when fitted, so the
# ColumnTransformer 'passthrough' option is used to forward them unchanged.
numeric_transformer = 'passthrough'

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Add OneHotEncoder for categorical data
])

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
# NOTE(review): the encoder is fitted on all rows before the split; consider
# fitting it inside a Pipeline on the training split only.
X_preprocessed = preprocessor.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, stratify=y, random_state=42)


In [None]:
# Train the Decision Tree baseline on the preprocessed training split
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 5-fold cross-validation on the already preprocessed matrix.
# X is not used by the CV call, so it is not converted or mutated here.
cv_scores = cross_val_score(model, X_preprocessed, y, cv=5)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print(f"CV Mean Score: {cv_mean * 100:.2f}%")
print(f"CV Standard Deviation: {cv_std * 100:.2f}%")


Train Accuracy: 99.98%
Test Accuracy: 85.04%
CV Mean Score: 85.18%
CV Standard Deviation: 0.16%


### 2.10 Modelling (with parameters)

We only run the models on `df2`. After dropping missing values, `df1` had 0 rows, so there is no point running a model on an empty DataFrame.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

#### Instantiating Pipeline

In [None]:
# Each pipeline oversamples the training data with SMOTE before fitting its
# classifier; only the logistic model gets a scaling step in front.

# Logistic Regression: scale -> SMOTE -> model
logistic_steps = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('model', LogisticRegression(max_iter=1000, random_state=42)),
]
imb_logistic_pipeline = ImbPipeline(steps=logistic_steps)

# XGBoost: SMOTE -> model
xgb_steps = [
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
]
imb_xgb_pipeline = ImbPipeline(steps=xgb_steps)

# Random Forest: SMOTE -> model
rf_steps = [
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42)),
]
imb_rf_pipeline = ImbPipeline(steps=rf_steps)

#### Parameter Grids

In [None]:
# Search space for the 'model' step of the Logistic Regression pipeline.
logistic_param_grid = dict(
    model__C=[0.01, 0.1, 1, 10, 100],
    model__penalty=['l2'],
    model__solver=['saga'],
    model__class_weight=[None, 'balanced'],  # with and without class re-weighting
    model__max_iter=[100, 1000, 5000],
)

In [None]:
# Ratio of negative (label 0) to positive (label 1) training samples.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

# If y_train is not 0/1 encoded (e.g. it still holds the raw 1/2 codes), one of
# the counts is zero and the ratio would silently be 0 or infinite.
if n_negative == 0 or n_positive == 0:
    raise ValueError(
        "y_train must contain both 0 and 1 labels to compute scale_pos_weight; "
        f"found {n_negative} zeros and {n_positive} ones."
    )
scale_pos_weight = n_negative / n_positive

# NOTE(review): the XGBoost pipeline already oversamples with SMOTE, so the
# classifier sees balanced data; confirm the extra class weight is wanted.

# Define the parameter grid
xgb_param_grid = {
    'model__max_depth': [5, 7],
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1],
    'model__scale_pos_weight': [1, scale_pos_weight],  # Use the ratio
    'model__subsample': [0.7, 1.0],
    'model__colsample_bytree': [0.7],
    'model__gamma': [0, 0.1]
}

In [None]:
# Search space for the 'model' step of the Random Forest pipeline.
rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__class_weight': [None, 'balanced', 'balanced_subsample'],
    # 'auto' was removed from RandomForestClassifier in scikit-learn 1.3;
    # 'sqrt' is what 'auto' meant for classifiers.
    'model__max_features': ['sqrt']
}

#### GridSearchCV Execution

In [None]:
%%time

# Logistic Regression GridSearchCV
logistic_grid_search = GridSearchCV(imb_logistic_pipeline, logistic_param_grid, cv=5, scoring='accuracy')
logistic_grid_search.fit(X_train, y_train)

# Best parameters and CV score
print("Best parameters for Logistic Regression:", logistic_grid_search.best_params_)
print("Best CV score for Logistic Regression:", logistic_grid_search.best_score_)

# Evaluate on training data using the best estimator found by GridSearchCV
y_train_pred = logistic_grid_search.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train accuracy for Logistic Regression:", train_accuracy)

# Evaluate on test data using the best estimator found by GridSearchCV
y_test_pred = logistic_grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy for Logistic Regression:", test_accuracy)

# If you want to report the CV scores detail for the best model, you can do it by accessing cv_results_
# Here's how to get the mean CV score for the best estimator across folds
best_index1 = logistic_grid_search.best_index_
mean_cv_score = logistic_grid_search.cv_results_['mean_test_score'][best_index1]
std_cv_score = logistic_grid_search.cv_results_['std_test_score'][best_index1]
print(f"Mean CV score for the best Logistic Regression model: {mean_cv_score:.4f} ± {std_cv_score:.4f}")


Best parameters for Logistic Regression: {'model__C': 0.1, 'model__class_weight': None, 'model__max_iter': 100, 'model__penalty': 'l2', 'model__solver': 'saga'}
Best CV score for Logistic Regression: 0.7433510919429596
Train accuracy for Logistic Regression: 0.7434675422053577
Test accuracy for Logistic Regression: 0.7445213182918101
Mean CV score for the best Logistic Regression model: 0.7434 ± 0.0017


In [None]:
%%time

# XGBoost GridSearchCV
xgb_grid_search = GridSearchCV(imb_xgb_pipeline, xgb_param_grid, cv=5, scoring='accuracy')
xgb_grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)
print("Best score for XGBoost:", xgb_grid_search.best_score_)

# Evaluate on training data using the best estimator found by GridSearchCV
y_train_pred = xgb_grid_search.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train accuracy for XGBoost:", train_accuracy)

# Evaluate on test data using the best estimator found by GridSearchCV
y_test_pred = xgb_grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy for XGBoost:", test_accuracy)

# If you want to report the CV scores detail for the best model, you can do it by accessing cv_results_
best_index = xgb_grid_search.best_index_
mean_cv_score = xgb_grid_search.cv_results_['mean_test_score'][best_index]
std_cv_score = xgb_grid_search.cv_results_['std_test_score'][best_index]
print(f"Mean CV score for the best XGBoost model: {mean_cv_score:.4f} ± {std_cv_score:.4f}")

In [None]:
%%time

# Random Forest GridSearchCV
rf_grid_search = GridSearchCV(imb_rf_pipeline, rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters for Random Forest:", rf_grid_search.best_params_)
print("Best score for Random Forest:", rf_grid_search.best_score_)

# Evaluate on training data using the best estimator found by GridSearchCV
y_train_pred = rf_grid_search.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train accuracy for Random Forest:", train_accuracy)

# Evaluate on test data using the best estimator found by GridSearchCV
y_test_pred = rf_grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy for Random Forest:", test_accuracy)

# If you want to report the CV scores detail for the best model, you can do it by accessing cv_results_
# Here's how to get the mean CV score for the best estimator across folds
best_index = rf_grid_search.best_index_
mean_cv_score = rf_grid_search.cv_results_['mean_test_score'][best_index]
std_cv_score = rf_grid_search.cv_results_['std_test_score'][best_index]
print(f"Mean CV score for the best Random Forest model: {mean_cv_score:.4f} ± {std_cv_score:.4f}")


**Analysis:**

Regarding the tuned hyperparameters, please consider the following factors:

1) Parameter Grids:

* The parameter grids for each model are relatively comprehensive but focused. This approach is a balance between exploring a range of parameter values and computational efficiency.
* For Logistic Regression and Random Forest, the parameters explored offer a good variety without being overly exhaustive. The choices for model__C, model__max_iter, and other parameters are within common ranges that typically yield good results.
* The SVM grid is more constrained, focusing on the RBF kernel and a narrower range of model__C and model__gamma values. This limitation is likely a practical decision to manage computational demand, as SVMs can become quite resource-intensive with larger parameter spaces and datasets.
* XGBoost's parameter grid is targeted, with a selection of parameters that are most influential on model performance (model__max_depth, model__n_estimators, etc.). The choices indicate an attempt to fine-tune the model around commonly effective values.

2) Performance Metrics:

* The performance metrics across models are very close, indicating that all four models perform similarly on this dataset under the chosen hyperparameters.
* The slight differences in CV scores, train accuracy, and test accuracy among the models could be due to the nature of the data and the models' inherent characteristics.
* The consistency in train and test accuracy suggests that the models are not overfitting significantly, which is positive.

3) Hyperparameter Tuning:

* The parameters do not appear to be "over-tuned." The parameter ranges and values selected are reasonable and reflect common practice for balancing model complexity and performance.
* The models' good performance across both training and testing phases, with closely matched CV scores, suggests that the hyperparameter tuning has been effective in identifying robust configurations.
* One key aspect of hyperparameter tuning is ensuring that the model generalizes well to unseen data. The similarity between training and testing accuracy across models indicates successful hyperparameter tuning without overcomplicating the model.

4) Model Selection:
* Given the similar performance of the models, the selection among them might come down to other factors such as interpretability, prediction speed, or specific use-case requirements.
* For instance, Logistic Regression offers good interpretability, SVMs can be effective for higher-dimensional data, XGBoost is known for its performance on structured/tabular data, and Random Forests are useful for their robustness and ease of use.

5) Conclusion:

* The hyperparameter tuning has been conducted thoughtfully to explore a meaningful range of configurations without overburdening computational resources.
* The models have achieved commendable accuracy, with slight variances that could guide model selection based on context-specific priorities.
* Further experiments might include exploring more nuanced hyperparameter spaces, incorporating feature selection or engineering, or applying different evaluation metrics relevant to the task at hand.

##### Determining the Confusion Matrix (using LogReg & SVM)

Rationale: these two models have the closest train and test scores.

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Test-set predictions from the tuned Logistic Regression search
y_test_pred = logistic_grid_search.predict(X_test)

# Rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
class_names = ['No Heart Disease', 'Heart Disease']
tick_positions = [0.5, 1.5]  # centres of the two heatmap cells

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix (Based on Logistic Regression Model)')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_names, rotation=0)
plt.show()

In [None]:
# NOTE(review): `svm_grid_search` is not defined in any cell visible in this
# notebook (only logistic, XGBoost and Random Forest searches are run), so this
# cell presumably raises NameError on a fresh Restart & Run All. Confirm whether
# the SVM search was removed or lives elsewhere.
# Predict the target values based on the best estimator found by GridSearchCV
y_test_pred = svm_grid_search.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display the confusion matrix using Seaborn's heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False)
plt.title('Confusion Matrix (Based on SVM Model)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Tick positions 0.5 / 1.5 are the centres of the two heatmap cells.
plt.xticks(ticks=[0.5, 1.5], labels=['No Heart Disease', 'Heart Disease'])
plt.yticks(ticks=[0.5, 1.5], labels=['No Heart Disease', 'Heart Disease'], rotation=0)
plt.show()

#### Creating Polynomial Features (Feature Engineering)

In [None]:
#from sklearn.preprocessing import PolynomialFeatures
#from sklearn.feature_selection import SelectFromModel

# Create interaction / polynomial features
#poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
#X_poly = poly.fit_transform(X_train)

In [None]:
# Logistic Regression Pipeline with SelectFromModel
#logistic_pipeline = Pipeline(steps=[
    #('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    #('feature_selection', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))),
    #('imputer', SimpleImputer(strategy='mean')),
    #('scaler', StandardScaler()),
    #('model', LogisticRegression(max_iter=1000, random_state=42))
#])

# SVM Pipeline with SelectFromModel
#svm_pipeline = Pipeline([
    #('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    #('feature_selection', SelectFromModel(SVC(kernel='linear', C=0.01))),  # Linear kernel and a small C for feature selection
    #('imputer', SimpleImputer(strategy='mean')),
    #('scaler', StandardScaler()),
    #('model', SVC(kernel='rbf', random_state=42))  # Final SVM model
#])

# XGBoost Pipeline with SelectFromModel
#xgb_pipeline = Pipeline([
    #('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    #('feature_selection', SelectFromModel(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))),
    #('imputer', SimpleImputer(strategy='mean')),
    #('model', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))  # Final XGBoost model
#])


# Random Forest Pipeline with SelectFromModel
#rf_pipeline = Pipeline([
    #('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    #('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    #('imputer', SimpleImputer(strategy='mean')),
    #('model', RandomForestClassifier(n_estimators=100, random_state=42))  # Final Random Forest model
#])

In [None]:
#logistic_param_grid = {
    #'feature_selection__estimator__C': [0.01, 0.1, 1, 10, 100], # hyperparameters for the logistic regression model used in SelectFromModel
    #'model__C': [0.01, 0.1, 1, 10, 100],
    #'model__penalty': ['l2'],
    #'model__solver': ['saga'],
    #'model__max_iter': [100, 1000, 5000]
#}

#svm_param_grid = {
    #'model__C': [1, 10],  # Reduced range of C values
    #'model__gamma': ['scale', 0.01],  # Limited gamma to 'scale' and a representative value
    #'model__kernel': ['rbf'],  # Focus on the RBF kernel, often the best choice for SVM
    # Removed 'model__degree': Typically, 'poly' kernel requires more processing time
#}

#xgb_param_grid = {
    #'model__max_depth': [5, 7],  # Focused on mid-range depths
    #'model__n_estimators': [100, 200],  # Reduced the upper range
    #'model__learning_rate': [0.1],  # Chose a commonly effective rate
    #'model__subsample': [0.7, 1.0],  # Limited to higher subsampling for variance reduction
    #'model__colsample_bytree': [0.7],  # Chose a moderate value for feature sampling
    #'model__gamma': [0, 0.1]  # Simplified to two options to evaluate regularization benefit
#}

#rf_param_grid = {
    #'model__n_estimators': [100, 200],  # Reduced number of trees options
    #'model__max_depth': [10, 20],  # Focus on moderate to high depth to control complexity
    #'model__min_samples_split': [2, 5],  # Simplified range for minimum number of samples required to split
    #'model__min_samples_leaf': [1, 2],  # Reduced range for the minimum number of samples required at a leaf node
    #'model__max_features': ['auto'],  # Use the default option for the number of features to consider when looking for the best split
    #'model__bootstrap': [True]  # Keep bootstrapping enabled for better generalization
    # Removed 'model__criterion': Simplification, sticking with the default 'gini' criterion
#}

In [None]:
# Logistic Regression GridSearchCV
#logistic_grid_search = GridSearchCV(logistic_pipeline, logistic_param_grid, cv=5, scoring='accuracy')
#logistic_grid_search.fit(X_train, y_train) # No need to transform X_train as pipeline will handle it

# Best parameters and CV score
#print("Best parameters for Logistic Regression:", logistic_grid_search.best_params_)
#print("Best CV score for Logistic Regression:", logistic_grid_search.best_score_)

# Evaluate on training data using the best estimator found by GridSearchCV
#y_train_pred = logistic_grid_search.predict(poly.transform(X_train))
#train_accuracy = accuracy_score(y_train, y_train_pred)
#print("Train accuracy for Logistic Regression:", train_accuracy)

#y_test_pred = logistic_grid_search.predict(poly.transform(X_test))
# Evaluate on test data using the best estimator found by GridSearchCV
#y_test_pred = logistic_grid_search.predict(poly.transform(X_test))
#test_accuracy = accuracy_score(y_test, y_test_pred)
#print("Test accuracy for Logistic Regression:", test_accuracy)

# If you want to report the CV scores detail for the best model, you can do it by accessing cv_results_
# Here's how to get the mean CV score for the best estimator across folds
#best_index1 = logistic_grid_search.best_index_
#mean_cv_score = logistic_grid_search.cv_results_['mean_test_score'][best_index1]
#std_cv_score = logistic_grid_search.cv_results_['std_test_score'][best_index1]
#print(f"Mean CV score for the best Logistic Regression model: {mean_cv_score:.4f} ± {std_cv_score:.4f}")


In [None]:
# XGBoost GridSearchCV
#xgb_grid_search = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=5, scoring='accuracy')
#xgb_grid_search.fit(X_train, y_train)

# Best parameters and score
#print("Best parameters for XGBoost:", xgb_grid_search.best_params_)
#print("Best score for XGBoost:", xgb_grid_search.best_score_)

# Evaluate on training data using the best estimator found by GridSearchCV
#y_train_pred = xgb_grid_search.predict(X_train)
#train_accuracy = accuracy_score(y_train, y_train_pred)
#print("Train accuracy for XGBoost:", train_accuracy)

# Evaluate on test data using the best estimator found by GridSearchCV
#y_test_pred = xgb_grid_search.predict(X_test)
#test_accuracy = accuracy_score(y_test, y_test_pred)
#print("Test accuracy for XGBoost:", test_accuracy)

# If you want to report the CV scores detail for the best model, you can do it by accessing cv_results_
# Here's how to get the mean CV score for the best estimator across folds
#best_index = xgb_grid_search.best_index_
#mean_cv_score = xgb_grid_search.cv_results_['mean_test_score'][best_index]
#std_cv_score = xgb_grid_search.cv_results_['std_test_score'][best_index]
#print(f"Mean CV score for the best XGBoost model: {mean_cv_score:.4f} ± {std_cv_score:.4f}")


In [None]:
# Random Forest GridSearchCV
#rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='accuracy')
#rf_grid_search.fit(X_train, y_train)

# Best parameters and score
#print("Best parameters for Random Forest:", rf_grid_search.best_params_)
#print("Best score for Random Forest:", rf_grid_search.best_score_)

# Evaluate on training data using the best estimator found by GridSearchCV
#y_train_pred = rf_grid_search.predict(X_train)
#train_accuracy = accuracy_score(y_train, y_train_pred)
#print("Train accuracy for Random Forest:", train_accuracy)

# Evaluate on test data using the best estimator found by GridSearchCV
#y_test_pred = rf_grid_search.predict(X_test)
#test_accuracy = accuracy_score(y_test, y_test_pred)
#print("Test accuracy for Random Forest:", test_accuracy)

# If you want to report the CV scores detail for the best model, you can do it by accessing cv_results_
# Here's how to get the mean CV score for the best estimator across folds
#best_index = rf_grid_search.best_index_
#mean_cv_score = rf_grid_search.cv_results_['mean_test_score'][best_index]
#std_cv_score = rf_grid_search.cv_results_['std_test_score'][best_index]
#print(f"Mean CV score for the best Random Forest model: {mean_cv_score:.4f} ± {std_cv_score:.4f}")


In [None]:
# Confusion matrix cell counts (entered by hand)
TN, FP, FN, TP = 4116, 0, 510, 0

# Total number of predictions across the four cells
total_predictions = np.sum([TN, FP, FN, TP])

# Share of each cell as a percentage of all predictions, rounded to 2 decimals
cell_counts = np.array([TP, TN, FP, FN])
TP_percentage, TN_percentage, FP_percentage, FN_percentage = np.round(
    (cell_counts / total_predictions) * 100, 2
)

# Report the four shares
print(f"True Positives Percentage: {TP_percentage}%")
print(f"True Negatives Percentage: {TN_percentage}%")
print(f"False Positives Percentage: {FP_percentage}%")
print(f"False Negatives Percentage: {FN_percentage}%")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Score the tuned Logistic Regression pipeline (best estimator of
# logistic_grid_search) on the held-out test split.
best_logreg = logistic_grid_search.best_estimator_
y_pred = best_logreg.predict(X_test)

# Precision, recall and F1 are computed from the hard class predictions;
# zero_division=0 returns 0 instead of warning when a denominator is empty.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# ROC AUC ranks samples by score, so it is computed from the predicted
# probability of the second class rather than from the 0/1 predictions
# (hard labels collapse the ROC curve to a single threshold).
roc_auc = roc_auc_score(y_test, best_logreg.predict_proba(X_test)[:, 1])

# Print the results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

In [None]:
# Class probabilities from the tuned Logistic Regression pipeline; the second
# column is used as the score for the positive class.
class_probabilities = logistic_grid_search.best_estimator_.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

# ROC AUC from the predicted probabilities
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f"ROC AUC Score: {roc_auc:.4f}")