## Use pipeline to find optimal model

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Filter warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.pipeline import Pipeline
from tempfile import mkdtemp
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             plot_roc_curve, roc_auc_score)

In [30]:
df = pd.read_csv('../data/very_train_cleaned_airlines.csv')
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('arrival_delay_in_minutes', axis=1)

# Below I am setting our train dataset target y to 'rating'
y = df['satisfaction_target_Satisfied']

# Below I am setting our train dataset data to every column besides 'rating'
X = df.drop('satisfaction_target_Satisfied', axis = 1)

estimators = [('normalise', StandardScaler()),
              ('reduce_dim', PCA()),
              ('log', LogisticRegression())]

cachedir = mkdtemp()
pipe = Pipeline(estimators, memory = cachedir)

params = {'log__C': [0.1, 1, 3, 5, 7, 10], 
          'reduce_dim__n_components': [None, 2, 5, 10]}

grid_search = GridSearchCV(pipe, param_grid=params)

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, stratify=y, random_state=0)

fitted_search = grid_search.fit(X_train, y_train)

print(f"The fitted search score is {fitted_search.score(X_test, y_test)}")
print(f"The best parameters for the model are {fitted_search.best_params_}")

The fitted search score is 0.8715737603942101
The best parameters for the model are {'log__C': 1, 'reduce_dim__n_components': None}


## Run code seperately

In [35]:
# scale the data
scaler = StandardScaler()

# fit on train set
X_train_scaled = scaler.fit_transform(X_train)

# use it to transform test set
X_test_scaled = scaler.transform(X_test)

print(f"Scaled train shape is {X_train_scaled.shape} and scaled test shape is {X_test_scaled.shape}")

Scaled train shape is (90916, 22) and scaled test shape is (38964, 22)


In [39]:
# Below I am instantiating the model
LogReg = LogisticRegression(C=1,random_state=0)

# Below I am fitting the logistic regression model to the training data
LogReg.fit(X_train_scaled, y_train)

print(f"The train score is {LogReg.score(X_train_scaled, y_train)} and the test score is {LogReg.score(X_test_scaled, y_test)}.")

The train score is 0.8754344669805095 and the test score is 0.8715737603942101.


There is a marginal amount of overfiting but both scores are similar.

## Classification report & Confusion matrix

In [45]:
y_pred. = LogReg.predict(X_test_scaled)

SyntaxError: invalid syntax (106013668.py, line 1)

In [43]:
print(classification_report(y_test, X_test))

ValueError: Classification metrics can't handle a mix of binary and multiclass-multioutput targets