In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, log_loss # Log_loss is the cost function

In [3]:
# Load the data
dataset = pd.read_csv('/content/sample_data/Lab 2 Dataset.csv')

In [4]:
# Data Preprocessing
print(dataset.head()) # this is will out put the first 6 data row so that we can observe what the dataset looks like
print(dataset.shape[1])  # this will tell the number of columns I have in my dataset
print(dataset.shape[0])  # this will tell the number of rows I have in my dataset
print(dataset.shape)     # this will give us the shape of our data (rows by columns)


    # Normalization vs. Standardization:
    #     Normalization: Rescales the features to a range between 0 and 1. It's suitable when the features have different scales and you want to preserve the distribution of the data.
    #     Standardization: Centers the feature columns at mean 0 with standard deviation 1. It's suitable when the features have different units of measurement and you want to remove the mean and scale the data to unit variance.

    # Feature Selection:
    #     Analyze the importance of features using techniques like correlation analysis, feature importance from tree-based models, or recursive feature elimination (RFE).
    #     Consider dropping irrelevant or redundant features that don't contribute significantly to the predictive power of the model.

    # Handling Missing Values:
    #     Decide on strategies to handle missing values, such as imputation, removal of rows or columns with missing values, or using models that can handle missing values internally.

    # Outlier Detection and Treatment:
    #     Identify outliers using visualization techniques like box plots or scatter plots.
    #     Decide whether to remove outliers, transform them, or leave them as is based on domain knowledge and the impact on model performance.

    # Dimensionality Reduction:
    #     Apply dimensionality reduction techniques like Principal Component Analysis (PCA) or t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the number of features while preserving the most important information.
    #     Dimensionality reduction can help mitigate the curse of dimensionality and improve model efficiency.

    # Other Preprocessing Steps:
    #     Consider other preprocessing steps such as handling skewness in numerical features (e.g., log transformation), encoding datetime features, or creating interaction terms for feature engineering.

    # Validation and Iteration:
    #     Validate the impact of preprocessing steps on model performance using cross-validation or holdout validation.
    #     Iterate on preprocessing choices based on the results of model evaluation and fine-tune preprocessing parameters if necessary.

  Customer       State  Customer Lifetime Value Response  Coverage Education  \
0  FZ30935      Oregon              6066.115969       No  Extended   College   
1  UG93476  California              8002.308333      Yes     Basic   College   
2  AB96670  California              2393.915369      Yes     Basic   College   
3  XK64261      Oregon              4762.817900       No     Basic  Bachelor   
4  EV68375  California              4330.386020      Yes   Premium   College   

  Effective To Date EmploymentStatus Gender  Income  ...  \
0           2/25/11         Employed      M   32627  ...   
1           1/11/11       Unemployed      F       0  ...   
2           1/31/11       Unemployed      M       0  ...   
3           1/12/11         Employed      M   65795  ...   
4           2/16/11         Employed      M   60475  ...   

  Months Since Policy Inception Number of Open Complaints  Number of Policies  \
0                            60                         0                   7

In [5]:
# How many of my columns are categorical?
print(dataset.select_dtypes(include=['object']).columns.size)

16


In [6]:
# How many of my columns are none-categorical?
print(dataset.select_dtypes(exclude=['object']).columns.size)

8


In [7]:
# Convert categorical variables to numeric variables
label_encode = LabelEncoder()
categorical_cols = dataset.select_dtypes(include=['object']).columns
dataset[categorical_cols] = dataset[categorical_cols].apply(lambda col: label_encode.fit_transform(col))

In [8]:
# Separate features and target
X = dataset.drop('Response', axis=1)
y = dataset['Response']

In [9]:
# Split data into training, validation, and test sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

In [None]:
# Initialize and fit the model on the training set
# Naive Bayes - GaussianNB
# model = GaussianNB()
# Logistic Regression
# model = LogisticRegressionCV()
# generalized_linear_model
# model = SGDClassifier(loss='log', random_state=42)
# Deep Learning ?
# KNeighborsClassifier
# model = KNeighborsClassifier(n_neighbors=4)
# LinearDiscriminantAnalysis
# model = LinearDiscriminantAnalysis()
# QuadraticDiscriminantAnalysis
# model = QuadraticDiscriminantAnalysis()
# GaussianProcessClassifier
# Initialize and train the GaussianProcessClassifier model
kernel = 1.0 * RBF(1.0)
model = GaussianProcessClassifier(kernel=kernel, random_state=42)
# AdaBoostClassifier
# model = AdaBoostClassifier(n_estimators=50, random_state=42)
# DecisionTreeClassifier
# model = DecisionTreeClassifier(random_state=42)
# XGBClassifier
# model = XGBClassifier(random_state=42)

model.fit(X_train, y_train)




In [51]:
# Make predictions on the training set and calculate loss
y_pred_train = model.predict(X_train)
y_pred_train_proba = model.predict_proba(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_loss = log_loss(y_train, y_pred_train_proba)

In [52]:
# Make predictions on the validation set and calculate loss
y_pred_val = model.predict(X_val)
y_pred_val_proba = model.predict_proba(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_val_proba)

In [53]:
# Print training and validation loss and accuracy
print("Training Loss: %.2f" % train_loss)
print("Training Accuracy: %.2f%%" % (train_accuracy * 100.0))
print("Validation Loss: %.2f" % val_loss)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))

Training Loss: 0.31
Training Accuracy: 86.75%
Validation Loss: 0.37
Validation Accuracy: 83.67%


In [54]:
# Print precision, recall, and F1 score for the validation set
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       0.87      0.94      0.91      1007
           1       0.49      0.28      0.36       193

    accuracy                           0.84      1200
   macro avg       0.68      0.61      0.63      1200
weighted avg       0.81      0.84      0.82      1200



In [100]:
# Perform Preprocessing and Data Cleaning operations then run the models and compare
# the performance before and after cleaning.

# How do we do the data preprocessing