In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Logit
from sklearn.model_selection import train_test_split, cross_validate


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, roc_curve, roc_auc_score
from scipy.stats import chisquare, ttest_ind, pearsonr

from sklearn.ensemble import GradientBoostingClassifier


  from numpy.core.umath_tests import inner1d


In [2]:
# Load the satisfaction survey from the Excel workbook; the path is relative to
# the notebook's working directory.
data= pd.read_excel("satisfaction_2015.xlsx")

In [None]:
# First look at the raw frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Heatmap of absolute pairwise correlations between the numeric columns.
# The frame still contains string-typed columns at this point; DataFrame.corr()
# raises on those under pandas >= 2.0 (older versions silently skipped them),
# so the numeric columns are selected explicitly before correlating.
numeric_corr = data.select_dtypes(include="number").corr().abs().round(2)

fig, ax = plt.subplots(figsize=(9, 7))  # larger canvas so the annotations stay readable
sns.heatmap(numeric_corr, annot=True, ax=ax)
ax.set_title("Absolute correlation between numeric columns")
plt.show()

In [None]:
# Class balance of the target column.
# The series is passed as `x=`: seaborn >= 0.12 made everything after `data`
# keyword-only, so a positional Series is read as `data` rather than as the
# variable to count.
sns.countplot(x=data.satisfaction_v2)
plt.show()

# Share of each class, in percent of all rows.
print((data.satisfaction_v2.value_counts() / len(data) * 100).round(2))

In [None]:
# Arrival_Delay_in_Minutes is removed because it is highly correlated with
# another column (presumably the departure delay - confirm against the heatmap).
data = data.drop(labels=['Arrival_Delay_in_Minutes'], axis='columns')

In [None]:
# Re-draw the correlation heatmap now that Arrival_Delay_in_Minutes is gone.
# Only numeric columns are correlated: DataFrame.corr() raises on the
# string-typed columns under pandas >= 2.0 (older versions silently skipped them).
numeric_corr = data.select_dtypes(include="number").corr().abs().round(2)

fig, ax = plt.subplots(figsize=(9, 7))  # larger canvas so the annotations stay readable
sns.heatmap(numeric_corr, annot=True, ax=ax)
ax.set_title("Absolute correlation between numeric columns")
plt.show()

In [None]:
# Per-column flag: True where the column contains at least one missing value.
data.isnull().any()

In [None]:
# Same missing-value check, narrowed to the departure-delay column.
data['Departure_Delay_in_Minutes'].isnull().any()

In [None]:
# Frequency of each distinct departure-delay value, most common first.
print(data['Departure_Delay_in_Minutes'].value_counts())  # raw counts, not percentages

In [None]:
# Re-check columns and dtypes after Arrival_Delay_in_Minutes was dropped.
data.info()

In [None]:
# Partition the columns by dtype: the string-typed ("object") columns are
# one-hot encoded later, everything else is used as-is.
data_object = data.select_dtypes(include=["object"])
data_numeric = data.select_dtypes(exclude=["object"])

In [None]:
# Remove the `id` column (presumably a row identifier with no predictive
# meaning - confirm) from the numeric features.
data_numeric = data_numeric.drop(labels=["id"], axis="columns")

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric features.
data_numeric.describe()

In [None]:
# True for any numeric column with zero variance, i.e. a constant column that
# carries no information for a model.
data_numeric.var()==0

In [None]:
# List the distinct values of every categorical column before encoding them.
for column_name, column_values in data_object.items():
    print(column_values.unique())

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
# The result is cast to int because pandas >= 2.0 returns boolean dummies, and
# the formula-based Logit fit later in the notebook needs a 0/1 response:
# patsy treats a boolean response as categorical and expands it into two columns.
data_dummy = pd.get_dummies(
    data_object,
    prefix=data_object.columns,
    drop_first=True,
).astype(int)

In [None]:
# Preview the encoded dummy columns.
data_dummy.head()

In [None]:
# Modelling frame: numeric features plus the dummy columns, joined on the row index.
data_final=data_numeric.join(data_dummy)

In [None]:
# Box plot of the departure delay to inspect its spread and outliers.
# plt.boxplot is called for its drawing side effect only: its return value is a
# dict of matplotlib artists, so printing it only clutters the cell output.
plt.boxplot(data.Departure_Delay_in_Minutes)
plt.title("Departure_Delay_in_Minutes")
plt.show()

In [None]:
# Draw one histogram per column of the modelling frame, titled with the column name.
for column_name in data_final.columns:
    column_values = data_final[column_name]
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Independent two-sample t-test on the means of the two columns.
# NOTE(review): both columns come from the same rows (data_final is derived
# from data), so the samples are paired; a paired test may be the better
# choice here - confirm the intent.
ttest_ind(
    data["ease_of_online_booking"],
    data_final["inflight_wifi_service"],
)

## Logistic Regression

In [None]:
# Preview the modelling frame before fitting.
data_final.head()

In [None]:
# Percentage of rows in each class of the encoded target.
(
    data_final["satisfaction_v2_satisfied"]
    .value_counts()
    .div(len(data_final))
    .mul(100)
)

56% of people were satisfied; if we randomly chose a passenger, we could state with 56% confidence that they would be satisfied. This is the benchmark any model has to beat.

In [None]:
# Exclude the departure delay from the model inputs.
# NOTE(review): the notebook does not state why this column is removed - confirm.
data_final = data_final.drop(labels="Departure_Delay_in_Minutes", axis="columns")

In [None]:
# Target is the "satisfied" dummy; the predictors are every remaining column.
Y = data_final["satisfaction_v2_satisfied"]
X = data_final.drop(labels="satisfaction_v2_satisfied", axis="columns")

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Patsy formula for the statsmodels fit: response ~ sum of all predictors.
# The response is named by its column in data_final rather than by the notebook
# variable `Y`. `Y` is not a column of the frame, so patsy would resolve it from
# the notebook namespace and could pick up a stale or reassigned value.
# NOTE(review): the plain join assumes every column name is a valid Python
# identifier (no spaces or symbols); otherwise the names need patsy's Q("...").
formula = 'satisfaction_v2_satisfied ~ ' + ' + '.join(X.columns.tolist())
formula

In [None]:
# Fit a statsmodels logit from the formula. Note that it is fitted on the whole
# data_final frame, not only on the training split.
logit=Logit.from_formula(formula=formula, data=data_final).fit()

In [None]:
# Summary table of the fitted statsmodels logit.
logit.summary()

Some significant variables are:
Age, inflight wifi service, departure/arrival time convenience, and ease of online booking.

In [None]:
# NOTE(review): GridSearchCV is already imported in the first cell; this
# re-import is redundant and can be removed.
from sklearn.model_selection import GridSearchCV

In [None]:

# Tune the regularisation strength C and the penalty type with 5-fold
# cross-validation, scored by ROC AUC.
#
# solver="liblinear" is set explicitly because it supports both "l1" and "l2".
# The default solver since scikit-learn 0.22 ("lbfgs") rejects "l1", so half of
# the grid would fail. liblinear was the default before 0.22, so older
# installations behave as before.
#
# The search is fitted on the training split only, so the held-out test rows
# cannot influence the chosen hyper-parameters.
param_grid = {
    "C": np.linspace(0.0001, 1, 25),
    "penalty": ["l1", "l2"],
}
grid_search = GridSearchCV(
    LogisticRegression(solver="liblinear", random_state=42, class_weight="balanced"),
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

In [None]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC.
best_params = grid_search.best_params_
best_auc = grid_search.best_score_
print(best_params)
print("Average roc_auc", best_auc)

In [None]:
# Logistic regression with an L2 penalty at C=1 and balanced class weights,
# fitted on the training split.
# NOTE(review): C and penalty are hard-coded, presumably copied from the grid
# search result - confirm they match grid_search.best_params_.
logit1 = LogisticRegression(
    penalty="l2",
    C=1,
    class_weight="balanced",
    random_state=42,
)
logit1.fit(x_train, y_train)

In [None]:
# Accuracy, recall and ROC AUC of the tuned logistic regression on both splits.
def as_percent(value):
    """Convert a 0-1 metric to a whole-number percentage.

    Uses the builtin round() on a plain float, so it works whether the metric
    is returned as a NumPy scalar or a Python float, and avoids artefacts such
    as 56.99999999999999 from rounding before multiplying.
    """
    return round(100 * float(value))


# Hard class predictions for accuracy and recall.
ypred5 = logit1.predict(x_train)
ypred6 = logit1.predict(x_test)

# ROC AUC is a ranking metric: it must be computed from the predicted
# probability of the positive class, not from thresholded 0/1 labels.
train_score = logit1.predict_proba(x_train)[:, 1]
test_score = logit1.predict_proba(x_test)[:, 1]

# Labels name the model (logistic regression, not a random forest) and the
# split, so the train and test lines can be told apart.
print("Accuracy (train):", as_percent(logit1.score(x_train, y_train)))
print("Accuracy (test):", as_percent(logit1.score(x_test, y_test)))
print("Recall for logit1 (train):", as_percent(recall_score(y_train, ypred5)))
print("Recall for logit1 (test):", as_percent(recall_score(y_test, ypred6)))
print("ROC AUC for logit1 (train):", as_percent(roc_auc_score(y_train, train_score)))
print("ROC AUC for logit1 (test):", as_percent(roc_auc_score(y_test, test_score)))

In [None]:
# Baseline scikit-learn logistic regression with all default parameters.
logit_sk=LogisticRegression()

In [None]:
# Fit the baseline model on the training split.
logit_sk=logit_sk.fit(x_train,y_train)

In [None]:
# Accuracy on the held-out test split.
logit_sk.score(x_test,y_test)



In [None]:
# Accuracy on the training split, for comparison with the test score.
logit_sk.score(x_train,y_train)

The model didn't overfit: the train and test scores are close.

In [None]:
# Full evaluation report for the default-parameter logistic regression.
y_pred = logit_sk.predict(x_test)
# Predicted probability of the positive class. ROC AUC is a ranking metric and
# must be computed from continuous scores; feeding it the thresholded 0/1
# labels collapses the curve to a single point and under-reports the AUC.
y_score = logit_sk.predict_proba(x_test)[:, 1]
cm = confusion_matrix(y_test, y_pred)

print('LOGISTIC MODEL:')
print('Train Score')
print(logit_sk.score(x_train, y_train))
print('Test Score')
print(logit_sk.score(x_test, y_test))
print('Confusion Matrix')
print(cm)
print('-------------------')
print('Precision Recall F1 Support')
print(classification_report(y_test, y_pred))
print('-------------------')
print('ROC AUC Score')
print(roc_auc_score(y_test, y_score))

In [None]:
# ROC curve of the default-parameter logistic regression on the test split.
# The curve and the AUC in the legend are both derived from the same predicted
# probabilities. Computing the AUC from hard 0/1 predictions would report a
# different, lower number than the area under the plotted curve, and would
# make this cell depend on `y_pred` from another cell.
y_score = logit_sk.predict_proba(x_test)[:, 1]
FPR, TPR, tresholds = roc_curve(y_test, y_score)

plt.plot(FPR, TPR, label="ROC AUC=%f" % roc_auc_score(y_test, y_score))
plt.plot([0, 1], [0, 1])  # diagonal = performance of a random classifier
plt.legend(loc="lower right")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()


Logistic regression is significantly better than our benchmark of 56%.
Logistic regression has an accuracy of 84%.