# Case study on flight cancellations

The data source and description: http://stat-computing.org/dataexpo/2009/the-data.html

In [None]:
from datetime import datetime
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers.core import Dense, Activation
from keras.models import Sequential
from numpy import arange
from sklearn import model_selection
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.linear_model import LogisticRegression, Ridge, RidgeCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import RMSprop, Adam, SGD

In [None]:
df = pd.read_csv('flights_2008.csv')
df.head()

Drop columns with no variability

In [None]:
df = df.drop(['Unnamed: 0'], axis=1)

In [None]:
df = df.drop(['Year'], axis=1)

#### Will only be focusing on the carrier with most flights, so narrowing the dataset.

In [None]:
# Keep only Southwest (WN) flights. The explicit copy makes the subset an
# independent frame, so the in-place column edits further down modify this
# data rather than a temporary slice of the full dataset.
df = df.loc[df['UniqueCarrier'] == 'WN'].copy()

In [None]:
df['Diverted'].value_counts()

In [None]:
df['Cancelled'].value_counts()

The number of diverted flights for this airline is extremely imbalanced. Undersampling, oversampling and SMOTE would help to counteract this; they were not performed here due to a lack of computational power, but it would be beneficial to do so.

## Data preprocessing

Missing values:

In [None]:
def missing_val_count_by_column(df):
    """Print the number of null entries for every column that has at least one.

    Columns without missing values are left out of the printed Series.
    Nothing is returned.
    """
    nulls_per_column = df.isnull().sum()
    columns_with_gaps = nulls_per_column[nulls_per_column > 0]
    print(columns_with_gaps)
missing_val_count_by_column(df)

#### Missing values, imputation and data transformation

We can see that anytime CarrierDelay is null so are all the other delays, implying no delay was recorded. We can change these from NA's to 0.

In [None]:
# Confirm that every other delay column is missing on exactly the same rows as
# CarrierDelay. Written as four separate bare expressions, a notebook cell only
# displays the last comparison; folding them into one expression reports a
# single verdict that covers all four columns.
carrier_missing = df['CarrierDelay'].isnull()
all(carrier_missing.equals(df[delay].isnull())
    for delay in ['WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'])

In [None]:
# A missing delay component means no delay of that kind was recorded, so it is
# replaced with 0. Assigning the filled columns back to the frame (instead of
# calling fillna(inplace=True) on a selected column) guarantees the change
# lands in `df` itself and not in a temporary object.
delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
df[delay_columns] = df[delay_columns].fillna(0)

Ensure it worked:

In [None]:
missing_val_count_by_column(df)

Should arrival Time be imputed?

In [None]:
df_no_arrivals = df.loc[df['ArrTime'].isnull()]
df_no_arrivals.iloc[:15,3:18]

In [None]:
df_no_arrivals['Diverted'].value_counts()

In [None]:
df_no_arrivals['Cancelled'].value_counts()

In [None]:
df_no_arrivals.loc[(df_no_arrivals['Diverted']==1) & (df_no_arrivals['Cancelled'] == 0), ['Diverted', 'Cancelled']]

In [None]:
# Diverted flights that nevertheless have an arrival time recorded.
# 'WeatherDelay' is listed once only; repeating a label in the selection
# yields a duplicated column in the displayed frame.
df.loc[(df['Diverted']==1) & (df['ArrTime'] > 0), ['FlightNum','Diverted', 'ArrTime', 'CRSArrTime', 'Origin','Dest','UniqueCarrier', 'ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay','NASDelay', 'SecurityDelay', 'LateAircraftDelay' ]]

Formulas are: 
<ul>
    <li><b>ArrDelay</b> = ArrTime - CRSArrTime</li>
    <li><b>ActualElapsedTime</b> = ArrTime - DepTime</li>
    <li><b>AirTime</b> = ActualElapsedTime - (TaxiIn + TaxiOut)</li>
</ul>
To get the ArrDelay I will first make two date columns which hold the arrival and departure date.

New dataframe for diverted flights with an arrival time

In [None]:
# Working frame for diverted flights that have an arrival time. It is copied
# explicitly because the helper functions below overwrite its columns in place;
# without the copy those writes target a slice of `df`.
df_arrdelay = df.loc[(df['Diverted']==1) & (df['ArrTime'] > 0), [ 'ArrTime', 'CRSArrTime', 'ArrDelay', 'DepTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'TaxiOut']].copy()

In [None]:
missing_val_count_by_column(df_arrdelay)

Will impute 190 values.

In [None]:
def makeFloatToStr(df, column):
    """Overwrite ``df[column]`` with its values rendered as integer strings.

    The values are cast to int first, so 930.0 becomes '930' rather than
    '930.0'. The frame is modified in place and nothing is returned.
    """
    # NOTE(review): the int cast presumably fails on NaN - confirm callers
    # only pass columns without missing values.
    df[column] = df[column].astype(int).astype(str)

In [None]:
makeFloatToStr(df_arrdelay,'CRSArrTime')
makeFloatToStr(df_arrdelay,'ArrTime')
makeFloatToStr(df_arrdelay,'DepTime')

Make strings have 4 characters, such that any time value with 3 gets an extra zero in front to become 4.

In [None]:
def standariseTime(df, column):
    """Left-pad 1-, 2- and 3-character strings in ``df[column]`` to 4 characters.

    Entries of any other length are kept exactly as they are. The frame is
    modified in place and nothing is returned.
    """
    values = df[column]
    # Only strings of length 1..3 are rewritten.
    too_short = values.str.len().between(1, 3)
    padded = values.astype(str).str.zfill(4)
    df[column] = values.where(~too_short, padded)

In [None]:
standariseTime(df_arrdelay, 'ArrTime')
standariseTime(df_arrdelay, 'CRSArrTime')
standariseTime(df_arrdelay, 'DepTime')

#### We now change the ArrTime and CRSArrTime to times, then make the arrival time one day later if its lower than expected arrival time.

In [None]:
def makeTimeToDatetime(df, column):
    """Parse the 'HHMM' strings in ``df[column]`` into datetimes, in place.

    Strings that do not match the '%H%M' format are coerced to NaT instead of
    raising. Nothing is returned.
    """
    parsed = pd.to_datetime(df[column], errors='coerce', format='%H%M')
    df[column] = parsed

In [None]:
makeTimeToDatetime(df_arrdelay,'ArrTime')
makeTimeToDatetime(df_arrdelay,'CRSArrTime')

If a flight arrived at a time that is before the expected arrival time, it implies the flight arrived the following day; hence add a day to the Arrival Time.

In [None]:
def compareDates(df, actual, expected):
    """Push ``actual`` forward by one day wherever it is earlier than ``expected``.

    An actual time that precedes the expected time is taken to mean the event
    happened on the following day. Rows where either value is missing compare
    as False and are left untouched.

    Parameters:
        df: frame holding both datetime columns; it is modified in place.
        actual: name of the column to adjust.
        expected: name of the column to compare against.

    Returns:
        The same ``df`` object, for convenience.
    """
    rolled_over = df[actual] < df[expected]
    # One label-based assignment on the frame itself; the former per-row
    # df[actual].iloc[i] = ... wrote through a temporary Series, which does not
    # reliably update the frame.
    df.loc[rolled_over, actual] = df.loc[rolled_over, actual] + pd.Timedelta(days=1)
    return df

In [None]:
compareDates(df_arrdelay,'ArrTime','CRSArrTime')

Now I get the difference between ArrTime and the expected arrival time and put it into the new column 'string'.

In [None]:
def timeDifference(df, column1, column2, diffTime, diffString):
    """Store ``column1 - column2`` and its length in minutes as two columns.

    ``df[diffTime]`` receives the raw timedelta. ``df[diffString]`` receives
    the timedelta's ``seconds`` component divided by 60; that component ignores
    whole days, so a difference of -22 hours is reported as 120 minutes.
    The frame is modified in place and nothing is returned.
    """
    elapsed = df[column1] - df[column2]
    df[diffTime] = elapsed
    df[diffString] = df[diffTime].dt.seconds / 60

In [None]:
timeDifference(df_arrdelay, 'ArrTime', 'CRSArrTime','ArrDelay', 'string')

#### Now I will repeat the process but to find ActualElapsedTime (arrtime - deptime)

<ol>
    <li>Make dep time into a timestamp </li>
    <li>Subtract DepTime from ArrTime.</li>
    <li>Make a new column with the string value in minutes.</li>
</ol>

In [None]:
makeTimeToDatetime(df_arrdelay,'DepTime')
timeDifference(df_arrdelay, 'ArrTime', 'DepTime','ActualElapsedTime', 'string2')

Calculate AirTime

In [None]:
df_arrdelay['AirTime2'] = df_arrdelay['string2'] - (df_arrdelay['TaxiIn']+df_arrdelay['TaxiOut'])

Add Arrival Delay and Actual Elapsed Time to the main dataset - Expected to decrease missing values for ArrDelay and ActualElapsed time from 1401 to 1211

In [None]:
df = df.join(df_arrdelay['string'])
df = df.join(df_arrdelay['string2'])
df = df.join(df_arrdelay['AirTime2'])

In [None]:
print(df['ArrDelay'].isnull().sum())
print(df['ActualElapsedTime'].isnull().sum())
print(df['AirTime'].isnull().sum())

In [None]:
def replaceNewValues(df, replaceTo, replaceFrom):
    """Fill the nulls of ``df[replaceTo]`` with the values of ``df[replaceFrom]``.

    Existing non-null values in ``replaceTo`` are kept. The frame is modified
    in place and nothing is returned.
    """
    current = df[replaceTo]
    is_missing = current.isnull()
    df[replaceTo] = np.where(is_missing, df[replaceFrom], current)

In [None]:
# Copy each imputed value into its original column where that column is null,
# then discard the three temporary helper columns.
for target_column, helper_column in (('ArrDelay', 'string'),
                                     ('ActualElapsedTime', 'string2'),
                                     ('AirTime', 'AirTime2')):
    replaceNewValues(df, target_column, helper_column)

df = df.drop(['string', 'string2', 'AirTime2'], axis=1)

In [None]:
df['ArrDelay'].describe()

#### Reassess arrival delay variable

In [None]:
df['ArrDelay'].isnull().sum()

In [None]:
missing_val_count_by_column(df)

In [None]:
df.head()

#### Save dataframe to use for binary classification of cancelled flights

In [None]:
# Persist the cleaned frame: the cancellation classification section further
# down reloads it from this exact file name, so the export must actually run.
df.to_csv('Flight_Data_Cancellation.csv')

Flights that don't have a time of arrival are either cancelled or diverted, so it is safe to assume that they did not make it to the final destination and hence cannot be used for delay. (Perhaps they should count as cancelled.) For this reason these rows will be removed.

In [None]:
to_keep = pd.notnull(df["ArrDelay"])
df = df.loc[to_keep,:].copy()

In [None]:
df['ArrDelay'].isnull().sum()

No more missing values are in the data.
As we are interested in forecasting future delays, we must remove all the columns that realistically are impossible to know prior to a flight taking off. We will assume that the information the airline has is the day before the flight departs implying ex-ante forecast.

In [None]:
def remove_columns(dataset, to_remove):
    """Return a new DataFrame without the columns listed in ``to_remove``.

    ``DataFrame.drop`` already returns a new object and leaves ``dataset``
    unchanged, so no separate copy of the whole frame is made first.

    Parameters:
        dataset: the source DataFrame (not modified).
        to_remove: list of column labels to drop.

    Returns:
        A DataFrame holding every column of ``dataset`` except ``to_remove``.

    Raises:
        KeyError: if a label in ``to_remove`` is not a column of ``dataset``.
    """
    return dataset.drop(to_remove, axis=1)

to_remove = ['UniqueCarrier', 'DepTime','DepDelay','TaxiOut', 'ActualElapsedTime', 'AirTime', 'ArrTime', 'TaxiIn', 'Cancelled', 'CancellationCode', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Diverted']
df = remove_columns(df, to_remove)

Convert CRSDepTime and CRSArrTime to Hour and make into bins

In [None]:
def convert_to_hour(df, column_list):
    """Convert HHMM-style numbers to hours, rounding up, for each listed column.

    The modulo uses a negative divisor, so its result lies in (-100, 0] and
    subtracting it lifts the value to the next multiple of 100 (1430 -> 1500,
    1400 stays 1400). Dividing by 100 then gives the hour as a float
    (2359 -> 24.0). The frame is modified in place and also returned.
    """
    for name in column_list:
        remainder = df[name] % -100
        rounded_up = df[name] - remainder
        df[name] = rounded_up / 100
    return df

In [None]:
time_to_hours = ['CRSDepTime', 'CRSArrTime']
convert_to_hour(df, time_to_hours)

In [None]:
df['CRSDepTime'] = pd.qcut(df['CRSDepTime'], 8)
df['CRSDepTime'].value_counts()

In [None]:
df['CRSArrTime'] = pd.qcut(df['CRSArrTime'], 8)
df['CRSArrTime'].value_counts()

Convert TailNum into FleetNum - as per https://www.planespotters.net/airline/Southwest-Airlines, fleets are numbered, so the higher the fleet number, the younger the aircraft.

In [None]:
# Strip every non-digit character from the tail number and keep the remaining
# digits as an integer fleet number. regex=True is required: without it the
# pattern is treated as a literal string on current pandas, nothing is
# removed, and the int cast fails.
df['FleetNum'] = df['TailNum'].str.replace(r'\D', '', regex=True).astype(int)

Will drop FlightNum because it won't help generalise results. Will use FleetNum instead of TailNum as a signal of the age of the fleet/plane. Drop DayofMonth as flights repeat weekly.

In [None]:
df = df.drop(['FlightNum', 'TailNum', 'DayofMonth'], axis=1)

Make Dummies

In [None]:
def make_dummies(df, column, prefix):
    """Return ``df`` with ``column`` replaced by its one-hot indicator columns.

    The indicator columns are named by ``pd.get_dummies`` from ``prefix`` and
    are appended after the existing columns. The input frame is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=prefix)
    return df.join(indicators).drop(columns=[column])

In [None]:
df = make_dummies(df,'Origin','Origin_')

In [None]:
df = make_dummies(df,'Month','Month_')

In [None]:
df = make_dummies(df,'Dest','Dest_')

In [None]:
df = make_dummies(df,'DayOfWeek','Day_')

In [None]:
df = make_dummies(df,'CRSDepTime','CRSDepTime_')

In [None]:
df = make_dummies(df,'CRSArrTime','CRSArrTime_')

In [None]:
df['Delayed'] = np.where(df['ArrDelay']>15, 1,0)
df.to_csv('Flight_Data_Delay_Classification.csv')
df = df.drop(['Delayed'], axis=1)

Remove outliers by interquartile method - done to generalise results 

In [None]:
Q1 = df['ArrDelay'].quantile(0.25)
Q3 = df['ArrDelay'].quantile(0.75)
iqr = Q3 - Q1

In [None]:
upper_limit = Q3 + 1.5 * iqr
lower_limit = Q1 - 1.5 * iqr

In [None]:
df_clean = df[(df['ArrDelay'] < upper_limit) & (df['ArrDelay'] > lower_limit)]
print(df_clean.shape)
print(df.shape)

# Split the data + Normalisation

In [None]:
from sklearn.model_selection import train_test_split
y = df_clean['ArrDelay']
X = df_clean.drop('ArrDelay',axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
def standarise_variable(dataset, variables):
    """Return a copy of ``dataset`` with ``variables`` standard-scaled.

    A new ``StandardScaler`` is fitted on the given frame on every call, so
    the scaling statistics always come from ``dataset`` itself. The input
    frame is left unchanged.
    """
    from sklearn.preprocessing import StandardScaler

    scaled = dataset.copy()
    scaled[variables] = StandardScaler().fit_transform(scaled[variables].values)
    return scaled

In [None]:
to_normalise_x = ['CRSElapsedTime', 'Distance', 'FleetNum',]   

In [None]:
# Scale the test features with the mean and variance learned from the training
# split. Fitting a separate scaler on the test split would put the two sets on
# different scales and let test-set statistics leak into preprocessing.
from sklearn.preprocessing import StandardScaler
train_scaler = StandardScaler().fit(X_train[to_normalise_x].values)
X_test_standarized = X_test.copy()
X_test_standarized[to_normalise_x] = train_scaler.transform(X_test[to_normalise_x].values)

In [None]:
X_train_standarized = standarise_variable(X_train, to_normalise_x)

# Modelling

Lasso Regression with Cross Validation

In [None]:
ols = LinearRegression().fit(X_train_standarized, y_train)
predictions_ols = ols.predict(X_test_standarized)
rmse_ols = str(sqrt(mse(y_test,predictions_ols)))
lasso = Lasso().fit(X_train_standarized,y_train)
prediction_lasso = lasso.predict(X_test_standarized)
rmse_lasso = str(sqrt(mse(y_test,prediction_lasso)))
print('RMSE OLS : ',rmse_ols)
print('RMSE Lasso : ', rmse_lasso)

Ridge Regression

In [None]:
# Candidate penalties, log-spaced from 0.5e10 down to 0.5e-10.
alphas = 10**np.linspace(10,-10,10)*0.5
# The `normalize` keyword has been removed from scikit-learn's linear models
# and passing it raises a TypeError; omitting it matches normalize=False.
ridge_cv = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error')
ridge_cv.fit(X_train_standarized, y_train)
ridge_cv.alpha_

In [None]:
# Refit a plain Ridge model with the penalty selected by cross-validation.
# The `normalize` keyword has been removed from scikit-learn's linear models;
# omitting it matches normalize=False.
ridge4 = Ridge(alpha = ridge_cv.alpha_)
ridge4.fit(X_train_standarized, y_train)

In [None]:
print('RMSE:', np.sqrt(mse(y_test, (ridge4.predict(X_test_standarized)))))

Principal Component Regression

In [None]:
# Principal component regression: 3-fold cross-validated MSE of a linear model
# on the first i principal components, for i = 1..135, preceded by a baseline
# fitted on a constant column only.
# NOTE(review): the results list is bound to the name `mse`, which rebinds the
# `mean_squared_error as mse` alias from the imports for the rest of the run.
pca = PCA()
X_reduced = pca.fit_transform(X_train)
cv = KFold(3)
regr = LinearRegression()
mse = []

# Baseline: a single column of ones as the only feature.
constant_feature = np.ones((len(X_reduced), 1))
score = -1 * model_selection.cross_val_score(
    regr, constant_feature, y_train, cv=cv,
    scoring='neg_mean_squared_error').mean()
mse.append(score)

for i in np.arange(1, 136, 1):
    leading_components = X_reduced[:, :i]
    fold_scores = model_selection.cross_val_score(
        regr, leading_components, y_train, cv=cv,
        scoring='neg_mean_squared_error')
    score = -1 * fold_scores.mean()
    mse.append(score)

In [None]:
plt.plot(mse)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE')
plt.title('Cross-validation MSE')

In [None]:
# Linear regression on the first 50 principal components.
pca = PCA()
X_reduced_train = pca.fit_transform(X_train)
X_reduced_test = pca.transform(X_test)[:,:50]
regr = LinearRegression()
regr.fit(X_reduced_train[:,:50], y_train)
pred = regr.predict(X_reduced_test)
# Call mean_squared_error by its full name: by this point `mse` refers to the
# list of cross-validation scores built earlier, which is not callable.
print(np.sqrt(mean_squared_error(y_test, pred)))
print(pca.singular_values_)

In [None]:
X_reduced_train = pca.fit_transform(X_train)
X_reduced_test = pca.transform(X_test)[:,:10]
regr = Ridge()
regr.fit(X_reduced_train[:,:10], y_train)
pred = regr.predict(X_reduced_test)
np.sqrt(mean_squared_error(y_test, pred))

In [None]:
X_reduced_train = pca.fit_transform(X_train)
X_reduced_test = pca.transform(X_test)[:,:10]
regr = Lasso()
regr.fit(X_reduced_train[:,:10], y_train)
pred = regr.predict(X_reduced_test)
np.sqrt(mean_squared_error(y_test, pred))

Random Forest Regression

In [None]:
parameters = {'min_samples_leaf':[1,20,50],'max_depth':[None,10,100]}
grid_search = GridSearchCV(RandomForestRegressor(n_estimators=10), parameters, cv=3)
grid_search.fit(X_train_standarized, y_train.values.ravel())
prediction = grid_search.predict(X_test_standarized)
best_classifier = grid_search.best_estimator_
print('Best classifier:',best_classifier)
np.sqrt(mean_squared_error(y_test, prediction))

In [None]:
rf = RandomForestRegressor(n_estimators=50)
rf.fit(X_train_standarized, y_train)
prediction = rf.predict(X_test_standarized)
np.sqrt(mean_squared_error(y_test, prediction))

In [None]:
# Small feed-forward network: one hidden Dense layer of 50 units followed by a
# single output unit, trained on mean squared error with Adam.
input_dim = X_train_standarized.shape[1]
output_dim = 1
model = Sequential()
model.add(Dense(50,input_dim=input_dim))
model.add(Dense(output_dim))
model.compile(optimizer=Adam(),loss='mean_squared_error',metrics=['mean_squared_error'])
model.summary()
model.fit(X_train_standarized,y_train,epochs=10)
prediction = model.predict(X_test_standarized)
# Call mean_squared_error by its full name: `mse` was rebound to a list of
# cross-validation scores in the PCA section and is no longer callable.
print('RMSE:', np.sqrt(mean_squared_error(y_test,prediction)))

### Delay Classification

In [None]:
df = pd.read_csv('Flight_Data_Delay_Classification.csv')
df = remove_columns(df,['ArrDelay', 'Unnamed: 0'])
y = df['Delayed']
X = df.drop('Delayed',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [None]:
to_normalise_x = ['CRSElapsedTime', 'Distance', 'FleetNum'] 
# Fit the scaler on the training split only and apply the same transformation
# to both splits, so train and test share one scale and no test-set statistics
# enter preprocessing.
from sklearn.preprocessing import StandardScaler
train_scaler = StandardScaler().fit(X_train[to_normalise_x].values)
X_test_standarized = X_test.copy()
X_test_standarized[to_normalise_x] = train_scaler.transform(X_test[to_normalise_x].values)
X_train_standarized = X_train.copy()
X_train_standarized[to_normalise_x] = train_scaler.transform(X_train[to_normalise_x].values)

Logistic Model

In [None]:
logictic_model = LogisticRegression(solver='liblinear').fit(X_train_standarized,y_train)
predictions_non_tuned_model = logictic_model.predict(X_test_standarized)

In [None]:
print("Accuracy traditional model: "+str(accuracy(y_test,predictions_non_tuned_model)))
# ROC AUC is a ranking metric and needs continuous scores; feeding it hard 0/1
# predictions collapses the curve to a single threshold. Use the predicted
# probability of the positive class instead.
probabilities_non_tuned_model = logictic_model.predict_proba(X_test_standarized)[:, 1]
print('AUC:',roc_auc_score(y_test,probabilities_non_tuned_model))
print('Recall Score:',recall_score(y_test,predictions_non_tuned_model))

cm_logi = confusion_matrix(y_test, predictions_non_tuned_model)
ConfusionMatrixDisplay(cm_logi).plot()

AdaBoost

In [None]:
# AdaBoost with default settings and a fixed seed.
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train_standarized,y_train.values.ravel())
prediction_ada = ada.predict(X_test_standarized)
prediction_prob = ada.predict_proba(X_test_standarized)
print('Accuracy:', accuracy_score(y_test,prediction_ada))
# AUC is computed from the positive-class probabilities (second column of
# predict_proba) rather than from the hard labels, which would reduce the ROC
# curve to a single threshold.
print('AUC:',roc_auc_score(y_test,prediction_prob[:, 1]))
print('Recall Score:',recall_score(y_test,prediction_ada))

cm_ada = confusion_matrix(y_test, prediction_ada)
ConfusionMatrixDisplay(cm_ada).plot()

In [None]:
print(ada.get_params())

In [None]:
# AdaBoost with 100 boosting rounds.
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier( n_estimators=100)
ada.fit(X_train_standarized,y_train.values.ravel())
prediction_ada = ada.predict(X_test_standarized)
prediction_prob = ada.predict_proba(X_test_standarized)
print('Accuracy:', accuracy_score(y_test,prediction_ada))
# AUC is computed from the positive-class probabilities (second column of
# predict_proba) rather than from the hard labels, which would reduce the ROC
# curve to a single threshold.
print('AUC:',roc_auc_score(y_test,prediction_prob[:, 1]))
print('Recall Score:',recall_score(y_test,prediction_ada))

cm_ada = confusion_matrix(y_test, prediction_ada)
ConfusionMatrixDisplay(cm_ada).plot()

In [None]:
print(ada.get_params())

In [None]:
# Print the five most important features, in column order. The positions of
# the five largest importances are computed once up front; testing each value
# for membership in a freshly sorted list re-sorts the array on every
# iteration and prints every column that ties with a top-five value.
ada_importances = ada.feature_importances_
ada_top_positions = set(np.argsort(ada_importances)[-5:])
for c, column in enumerate(X_test.columns):
    if c in ada_top_positions:
        print('Variable',column,ada_importances[c])

AdaBoost Hyperparameter tuning

In [None]:
# define the model with default hyperparameters
# Tune the number of AdaBoost estimators by 3-fold cross-validated recall.
model = AdaBoostClassifier()
grid = {'n_estimators': [50, 75, 100, 150]}
cv = KFold(3)
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cv, scoring='recall')

# Run the search, then predict on the test split with the fitted search object.
grid_result = grid_search.fit(X_train_standarized, y_train)
prediction = grid_result.predict(X_test_standarized)
best_classifier = grid_search.best_estimator_
print('Best classifier:',best_classifier)
print(np.sqrt(mean_squared_error(y_test, prediction)))

# Best configuration first, then the mean and spread of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
search_results = grid_result.cv_results_
means = search_results['mean_test_score']
stds = search_results['std_test_score']
params = search_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# AdaBoost refitted with 75 boosting rounds.
from sklearn.ensemble import AdaBoostClassifier
ada_postcv = AdaBoostClassifier(n_estimators=75)
ada_postcv.fit(X_train_standarized,y_train.values.ravel())
prediction_adapostcv = ada_postcv.predict(X_test_standarized)
prediction_probpostcv = ada_postcv.predict_proba(X_test_standarized)
print('Accuracy:', accuracy_score(y_test,prediction_adapostcv))
# AUC is computed from the positive-class probabilities (second column of
# predict_proba) rather than from the hard labels, which would reduce the ROC
# curve to a single threshold.
print('AUC:',roc_auc_score(y_test,prediction_probpostcv[:, 1]))
print('Recall Score:',recall_score(y_test,prediction_adapostcv))

cm_adapostcv = confusion_matrix(y_test, prediction_adapostcv)
ConfusionMatrixDisplay(cm_adapostcv).plot()

Random Forest

In [None]:
# Random forest with 100 trees.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_standarized,y_train.values.ravel())
prediction_random_forest = rf.predict(X_test_standarized)
# Positive-class probabilities, used for AUC: hard labels would reduce the ROC
# curve to a single threshold.
probability_random_forest = rf.predict_proba(X_test_standarized)[:, 1]
print('Accuracy:', accuracy_score(y_test,prediction_random_forest))
print('AUC:',roc_auc_score(y_test,probability_random_forest))
print('Recall Score:',recall_score(y_test,prediction_random_forest))

In [None]:
cm_rf = confusion_matrix(y_test, prediction_random_forest)
ConfusionMatrixDisplay(cm_rf).plot()

In [None]:
# Print the five most important features, in column order. The positions of
# the five largest importances are computed once up front; testing each value
# for membership in a freshly sorted list re-sorts the array on every
# iteration and prints every column that ties with a top-five value.
rf_importances = rf.feature_importances_
rf_top_positions = set(np.argsort(rf_importances)[-5:])
for c, column in enumerate(X_test.columns):
    if c in rf_top_positions:
        print('Variable',column,rf_importances[c])

### Cancellation Classification

In [None]:
df = pd.read_csv('Flight_Data_Cancellation.csv')

In [None]:
#performed same data cleaning and transformations as above.
# Same cleaning and transformations as in the delay section, except that
# 'Cancelled' is kept because it is the target here.
to_remove = ['Unnamed: 0','UniqueCarrier', 'ArrDelay', 'DepTime','DepDelay','TaxiOut', 'ActualElapsedTime', 'AirTime', 'ArrTime', 'TaxiIn', 'CancellationCode', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Diverted']
df = remove_columns(df, to_remove)
convert_to_hour(df, time_to_hours)
df['CRSDepTime'] = pd.qcut(df['CRSDepTime'], 8)
df['CRSArrTime'] = pd.qcut(df['CRSArrTime'], 8)
# regex=True is required: without it the pattern is treated as a literal
# string on current pandas, nothing is removed, and the int cast fails.
# NOTE(review): the int cast presumably also fails if any TailNum is missing
# or has no digits - confirm for cancelled flights.
df['FleetNum'] = df['TailNum'].str.replace(r'\D', '', regex=True).astype(int)
df = df.drop(['FlightNum', 'TailNum', 'DayofMonth'], axis=1)
df = make_dummies(df,'Origin','Origin_')
df = make_dummies(df,'Month','Month_')
df = make_dummies(df,'Dest','Dest_')
df = make_dummies(df,'DayOfWeek','Day_')
df = make_dummies(df,'CRSDepTime','CRSDepTime_')
df = make_dummies(df,'CRSArrTime','CRSArrTime_')

In [None]:
y = df['Cancelled']
X = df.drop('Cancelled',axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [None]:
to_normalise_x = ['CRSElapsedTime', 'Distance', 'FleetNum'] 
# Fit the scaler on the training split only and apply the same transformation
# to both splits, so train and test share one scale and no test-set statistics
# enter preprocessing.
from sklearn.preprocessing import StandardScaler
train_scaler = StandardScaler().fit(X_train[to_normalise_x].values)
X_test_standarized = X_test.copy()
X_test_standarized[to_normalise_x] = train_scaler.transform(X_test[to_normalise_x].values)
X_train_standarized = X_train.copy()
X_train_standarized[to_normalise_x] = train_scaler.transform(X_train[to_normalise_x].values)

Logistic Model

In [None]:
logictic_model = LogisticRegression(solver='liblinear').fit(X_train_standarized,y_train)
predictions_non_tuned_model = logictic_model.predict(X_test_standarized)

In [None]:
print("Accuracy traditional model: "+str(accuracy(y_test,predictions_non_tuned_model)))
# ROC AUC is a ranking metric and needs continuous scores; feeding it hard 0/1
# predictions collapses the curve to a single threshold. Use the predicted
# probability of the positive class instead.
probabilities_non_tuned_model = logictic_model.predict_proba(X_test_standarized)[:, 1]
print('AUC:',roc_auc_score(y_test,probabilities_non_tuned_model))
print('Recall Score:',recall_score(y_test,predictions_non_tuned_model))

cm_logi = confusion_matrix(y_test, predictions_non_tuned_model)
ConfusionMatrixDisplay(cm_logi).plot()

AdaBoost

In [None]:
# AdaBoost with default settings on the cancellation target.
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
ada.fit(X_train_standarized,y_train.values.ravel())
prediction_ada = ada.predict(X_test_standarized)
prediction_prob = ada.predict_proba(X_test_standarized)
print('Accuracy:', accuracy_score(y_test,prediction_ada))
# AUC is computed from the positive-class probabilities (second column of
# predict_proba) rather than from the hard labels, which would reduce the ROC
# curve to a single threshold.
print('AUC:',roc_auc_score(y_test,prediction_prob[:, 1]))
print('Recall Score:',recall_score(y_test,prediction_ada))

In [None]:
cm_ada = confusion_matrix(y_test, prediction_ada)
ConfusionMatrixDisplay(cm_ada).plot()

Random Forest

In [None]:
# Random forest with 100 trees on the cancellation target.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_standarized,y_train.values.ravel())
prediction_random_forest = rf.predict(X_test_standarized)
# Positive-class probabilities, used for AUC: hard labels would reduce the ROC
# curve to a single threshold.
probability_random_forest = rf.predict_proba(X_test_standarized)[:, 1]
print('Accuracy:', accuracy_score(y_test,prediction_random_forest))
print('AUC:',roc_auc_score(y_test,probability_random_forest))
print('Recall Score:',recall_score(y_test,prediction_random_forest))

In [None]:
cm_rf = confusion_matrix(y_test, prediction_random_forest)
ConfusionMatrixDisplay(cm_rf).plot()