# Midterm

## Data

In [43]:
#| echo: False
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, matthews_corrcoef, r2_score
from sklearn.ensemble import RandomForestClassifier

In [15]:
data1 = pd.read_csv("/Users/ben/Documents/GitHub/AdvancedMachineLearning/Data/MidtermData/data1.csv")
data1 = data1.drop("Unnamed: 0", axis = 1)
data2 = pd.read_csv("/Users/ben/Documents/GitHub/AdvancedMachineLearning/Data/MidtermData/data2.csv")
data2 = data2.drop("Unnamed: 0", axis = 1)
merged_data = pd.concat([data1, data2])

In [22]:
merged_data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [33]:
merged_data.describe()

Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129487.0
mean,64940.5,39.427957,1190.316392,2.728696,3.057599,2.756876,2.976925,3.204774,3.252633,3.441361,3.358077,3.383023,3.350878,3.632114,3.306267,3.642193,3.286326,14.713713,15.091129
std,37493.270818,15.11936,997.452477,1.32934,1.526741,1.40174,1.27852,1.329933,1.350719,1.319289,1.334049,1.287099,1.316252,1.180025,1.266185,1.176669,1.313682,38.071126,38.46565
min,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,32470.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,64940.5,40.0,844.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,97410.25,51.0,1744.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [34]:
merged_data["Gender"].value_counts()

Female    65899
Male      63981
Name: Gender, dtype: int64

In [35]:
merged_data["Customer Type"].value_counts()

Loyal Customer       106100
disloyal Customer     23780
Name: Customer Type, dtype: int64

In [36]:
merged_data["Type of Travel"].value_counts()

Business travel    89693
Personal Travel    40187
Name: Type of Travel, dtype: int64

In [37]:
merged_data["Class"].value_counts()

Business    62160
Eco         58309
Eco Plus     9411
Name: Class, dtype: int64

In [38]:
merged_data["satisfaction"].value_counts()

neutral or dissatisfied    73452
satisfied                  56428
Name: satisfaction, dtype: int64

After examining the data, there are no missing data and the classes are not heavily imbalanced. This is good clean data to work with and it will not require a large amount of preprocessing for training. 

In [47]:
X = merged_data.drop(["id", "satisfaction"], axis = 1)
y = merged_data["satisfaction"]
y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Modeling

To begin our modeling efforts, we will start with a relatively simple model and use a random forest to set a baseline value.

In [50]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown="error", drop = "first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

In [51]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier())
    ]
)

parameters = {
    "forest__min_samples_leaf": [1, 5, 7, 10, 50, 100],
    "forest__min_samples_split": [2, 5, 7, 10, 50, 100],
    "forest__ccp_alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0],
    "forest__n_estimators": [10, 50, 100, 250]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs = -1, verbose = 1)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits



KeyboardInterrupt

