<a href="https://colab.research.google.com/github/nadairshaid/big-pandas-MIT/blob/decision_tree/Decision_Tree_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing and Mounting**

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# Setting the precision of floating numbers to 5 decimal points
pd.set_option("display.float_format", lambda x: "%.5f" % x)

# To tune model, get different metric scores, and split data
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
    classification_report,
    precision_recall_curve
)
from sklearn import metrics

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To impute missing values
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression

# To build classification models 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# For tuning the model
from sklearn.model_selection import GridSearchCV

# To supress warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
#Importing data
survey_train = pd.read_csv('/content/drive/MyDrive/MITHackathon/Surveydata_train.csv') 
survey_test = pd.read_csv('/content/drive/MyDrive/MITHackathon/Surveydata_test.csv') 
travel_train = pd.read_csv('/content/drive/MyDrive/MITHackathon/Traveldata_train.csv') 
travel_test = pd.read_csv('/content/drive/MyDrive/MITHackathon/Traveldata_test.csv') 


**Getting Info**

In [4]:
survey_train.shape


(94379, 17)

In [5]:
survey_train.head()

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,0,Needs Improvement,Green Car,Excellent,Excellent,Very Convenient,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor
1,98800002,0,Poor,Ordinary,Excellent,Poor,Needs Improvement,Good,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good
2,98800003,1,Needs Improvement,Green Car,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent
3,98800004,0,Acceptable,Ordinary,Needs Improvement,,Needs Improvement,Acceptable,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable
4,98800005,1,Acceptable,Ordinary,Acceptable,Acceptable,Manageable,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Good


In [6]:
survey_train.tail()

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
94374,98894375,0,Poor,Ordinary,Good,Good,Convenient,Poor,Poor,Poor,Poor,Good,Good,Good,Needs Improvement,Good,Poor
94375,98894376,1,Good,Ordinary,Good,Good,Convenient,Needs Improvement,Excellent,Excellent,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Good
94376,98894377,1,Needs Improvement,Green Car,Needs Improvement,Needs Improvement,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Acceptable,Good,Acceptable
94377,98894378,0,Needs Improvement,Ordinary,,Needs Improvement,Convenient,Good,Needs Improvement,Good,Good,Acceptable,Good,Good,Good,Excellent,Good
94378,98894379,0,Acceptable,Ordinary,Poor,Acceptable,Manageable,Acceptable,Acceptable,Acceptable,Acceptable,Poor,Good,Good,Poor,Good,Acceptable


In [7]:
survey_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       94379 non-null  int64 
 1   Overall_Experience       94379 non-null  int64 
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-nul

In [8]:
travel_train.shape

(94379, 9)

In [9]:
travel_train.head()

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0


In [10]:
travel_train.tail()

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
94374,98894375,Male,Loyal Customer,32.0,Business Travel,Business,1357,83.0,125.0
94375,98894376,Male,Loyal Customer,44.0,Business Travel,Business,592,5.0,11.0
94376,98894377,Male,,63.0,Business Travel,Business,2794,0.0,0.0
94377,98894378,Male,Loyal Customer,16.0,Personal Travel,Eco,2744,0.0,0.0
94378,98894379,Male,Loyal Customer,54.0,,Eco,2107,28.0,28.0


In [11]:
travel_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB


**Merging Data Frames**

In [12]:
df = travel_train.merge(survey_train, how='outer', on= 'ID', indicator= True)

In [13]:
df

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,_merge
0,98800001,Female,Loyal Customer,52.00000,,Business,272,0.00000,5.00000,0,Needs Improvement,Green Car,Excellent,Excellent,Very Convenient,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor,both
1,98800002,Male,Loyal Customer,48.00000,Personal Travel,Eco,2200,9.00000,0.00000,0,Poor,Ordinary,Excellent,Poor,Needs Improvement,Good,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good,both
2,98800003,Female,Loyal Customer,43.00000,Business Travel,Business,1061,77.00000,119.00000,1,Needs Improvement,Green Car,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent,both
3,98800004,Female,Loyal Customer,44.00000,Business Travel,Business,780,13.00000,18.00000,0,Acceptable,Ordinary,Needs Improvement,,Needs Improvement,Acceptable,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable,both
4,98800005,Female,Loyal Customer,50.00000,Business Travel,Business,1981,0.00000,0.00000,1,Acceptable,Ordinary,Acceptable,Acceptable,Manageable,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Good,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,98894375,Male,Loyal Customer,32.00000,Business Travel,Business,1357,83.00000,125.00000,0,Poor,Ordinary,Good,Good,Convenient,Poor,Poor,Poor,Poor,Good,Good,Good,Needs Improvement,Good,Poor,both
94375,98894376,Male,Loyal Customer,44.00000,Business Travel,Business,592,5.00000,11.00000,1,Good,Ordinary,Good,Good,Convenient,Needs Improvement,Excellent,Excellent,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Good,both
94376,98894377,Male,,63.00000,Business Travel,Business,2794,0.00000,0.00000,1,Needs Improvement,Green Car,Needs Improvement,Needs Improvement,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Acceptable,Good,Acceptable,both
94377,98894378,Male,Loyal Customer,16.00000,Personal Travel,Eco,2744,0.00000,0.00000,0,Needs Improvement,Ordinary,,Needs Improvement,Convenient,Good,Needs Improvement,Good,Good,Acceptable,Good,Good,Good,Excellent,Good,both


In [14]:
df.replace(to_replace = "Poor", value = 1, inplace = True, regex = True, )
df.replace(to_replace = "Needs Improvement", value = 2, inplace = True, regex = True, )
df.replace(to_replace = "Acceptable", value = 3, inplace = True, regex = True, )
df.replace(to_replace = "Good", value = 4, inplace = True, regex = True, )
df.replace(to_replace = "Excellent", value = 5, inplace = True, regex = True, )


df.replace(to_replace = "Poor", value = 1, inplace = True, regex = True, )
df.replace(to_replace = "Inconvenient", value = 2, inplace = True, regex = True, )
df.replace(to_replace = "Manageable", value = 3, inplace = True, regex = True, )
df.replace(to_replace = "Convenient", value = 4, inplace = True, regex = True, )
df.replace(to_replace = "Very Convenient", value = 5, inplace = True, regex = True, )

In [15]:
df.head()

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,_merge
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,2,Green Car,5,5.0,4,4,2,3,2,2,3,2,4,2,1,both
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,0,1,Ordinary,5,1.0,2,4,1,4,4,5,2,1,2,4,4,both
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,1,2,Green Car,2,2.0,2,2,4,5,5,5,5,5,4,5,5,both
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,0,3,Ordinary,2,,2,3,2,3,3,3,3,3,4,3,3,both
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,1,3,Ordinary,3,3.0,3,2,4,5,4,4,4,4,4,4,4,both


In [16]:
df.index = [np.arange(98800001,98894380)]
df.head(15)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,_merge
98800001,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,2,Green Car,5.0,5.0,4,4,2,3.0,2.0,2,3,2.0,4.0,2,1,both
98800002,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,0,1,Ordinary,5.0,1.0,2,4,1,4.0,4.0,5,2,1.0,2.0,4,4,both
98800003,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,1,2,Green Car,2.0,2.0,2,2,4,5.0,5.0,5,5,5.0,4.0,5,5,both
98800004,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,0,3,Ordinary,2.0,,2,3,2,3.0,3.0,3,3,3.0,4.0,3,3,both
98800005,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,1,3,Ordinary,3.0,3.0,3,2,4,5.0,4.0,4,4,4.0,4.0,4,4,both
98800006,98800006,Male,Loyal Customer,44.0,Business Travel,Business,2810,0.0,0.0,1,2,Ordinary,2.0,3.0,2,4,5,4.0,3.0,3,4,3.0,3.0,3,3,both
98800007,98800007,Male,Loyal Customer,56.0,Personal Travel,Eco,2029,0.0,0.0,0,4,Green Car,5.0,4.0,2,4,4,4.0,4.0,4,5,2.0,5.0,3,4,both
98800008,98800008,Male,Loyal Customer,65.0,Personal Travel,Business,853,0.0,3.0,0,3,Green Car,,3.0,2,5,5,4.0,4.0,4,3,4.0,4.0,4,5,both
98800009,98800009,Male,Loyal Customer,22.0,Personal Travel,Eco,1636,1.0,0.0,0,2,Green Car,2.0,2.0,2,3,2,3.0,3.0,3,3,2.0,3.0,2,3,both
98800010,98800010,Male,Loyal Customer,57.0,Business Travel,Business,306,0.0,0.0,0,2,Ordinary,4.0,4.0,4,3,4,4.0,2.0,2,2,2.0,3.0,2,3,both


In [17]:
df=df.drop(['_merge','ID'],axis=1)

In [18]:
df.head()

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,2,Green Car,5,5.0,4,4,2,3,2,2,3,2,4,2,1
98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,0,1,Ordinary,5,1.0,2,4,1,4,4,5,2,1,2,4,4
98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,1,2,Green Car,2,2.0,2,2,4,5,5,5,5,5,4,5,5
98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,0,3,Ordinary,2,,2,3,2,3,3,3,3,3,4,3,3
98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,1,3,Ordinary,3,3.0,3,2,4,5,4,4,4,4,4,4,4


In [19]:
# Creating list of dummy columns
to_get_dummies_for = ['Gender', 'Customer_Type','Type_Travel', 'Travel_Class', ]

# Creating dummy variables
pd.get_dummies(data = df, columns = to_get_dummies_for, drop_first = True)      

# Mapping overtime and attrition
dict_Gender = {'Male': 1, 'Female':0}
dict_Customer_Type = {'Loyal Customer': 1, 'Disloyal Customer': 0}
dict_Type_Travel = {'Business Travel': 1, 'Personal Travel': 0}
dict_Travel_Class = {'Business': 1, 'Eco': 0}


df['Gender'] = df.Gender.map(dict_Gender)
df['Customer_Type'] = df.Customer_Type.map(dict_Customer_Type)
df['Type_Travel'] = df.Type_Travel.map(dict_Type_Travel)
df['Travel_Class'] = df.Travel_Class.map(dict_Travel_Class)
df['Seat_Class'] = df.Seat_Class.map(dict_Travel_Class)

In [20]:
Y= df.Overall_Experience
X= df.drop(columns = ['Overall_Experience'])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1, stratify = Y)

In [22]:
df.fillna(df.median(),inplace = True)

In [23]:
# Creating metric function 
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8,5))
    
    sns.heatmap(cm, annot=True,  fmt='.2f', xticklabels=['Not Attrite', 'Attrite'], yticklabels=['Not Attrite', 'Attrite'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

In [24]:
df

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
98800001,0.00000,1.00000,52.00000,1.00000,1,272,0.00000,5.00000,0,2.00000,,5.00000,5.00000,4.00000,4.00000,2.00000,3.00000,2.00000,2.00000,3.00000,2.00000,4.00000,2.00000,1.00000
98800002,1.00000,1.00000,48.00000,0.00000,0,2200,9.00000,0.00000,0,1.00000,,5.00000,1.00000,2.00000,4.00000,1.00000,4.00000,4.00000,5.00000,2.00000,1.00000,2.00000,4.00000,4.00000
98800003,0.00000,1.00000,43.00000,1.00000,1,1061,77.00000,119.00000,1,2.00000,,2.00000,2.00000,2.00000,2.00000,4.00000,5.00000,5.00000,5.00000,5.00000,5.00000,4.00000,5.00000,5.00000
98800004,0.00000,1.00000,44.00000,1.00000,1,780,13.00000,18.00000,0,3.00000,,2.00000,3.00000,2.00000,3.00000,2.00000,3.00000,3.00000,3.00000,3.00000,3.00000,4.00000,3.00000,3.00000
98800005,0.00000,1.00000,50.00000,1.00000,1,1981,0.00000,0.00000,1,3.00000,,3.00000,3.00000,3.00000,2.00000,4.00000,5.00000,4.00000,4.00000,4.00000,4.00000,4.00000,4.00000,4.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98894375,1.00000,1.00000,32.00000,1.00000,1,1357,83.00000,125.00000,0,1.00000,,4.00000,4.00000,4.00000,1.00000,1.00000,1.00000,1.00000,4.00000,4.00000,4.00000,2.00000,4.00000,1.00000
98894376,1.00000,1.00000,44.00000,1.00000,1,592,5.00000,11.00000,1,4.00000,,4.00000,4.00000,4.00000,2.00000,5.00000,5.00000,3.00000,3.00000,3.00000,3.00000,4.00000,3.00000,4.00000
98894377,1.00000,1.00000,63.00000,1.00000,1,2794,0.00000,0.00000,1,2.00000,,2.00000,2.00000,2.00000,4.00000,5.00000,4.00000,4.00000,4.00000,4.00000,4.00000,3.00000,4.00000,3.00000
98894378,1.00000,1.00000,16.00000,0.00000,0,2744,0.00000,0.00000,0,2.00000,,3.00000,2.00000,4.00000,4.00000,2.00000,4.00000,4.00000,3.00000,4.00000,4.00000,4.00000,5.00000,4.00000


In [25]:
round(df.isnull().sum() / df.isnull().count() * 100, 2)


Gender                      0.00000
Customer_Type               0.00000
Age                         0.00000
Type_Travel                 0.00000
Travel_Class                0.00000
Travel_Distance             0.00000
Departure_Delay_in_Mins     0.00000
Arrival_Delay_in_Mins       0.00000
Overall_Experience          0.00000
Seat_Comfort                0.00000
Seat_Class                100.00000
Arrival_Time_Convenient     0.00000
Catering                    0.00000
Platform_Location           0.00000
Onboard_Wifi_Service        0.00000
Onboard_Entertainment       0.00000
Online_Support              0.00000
Ease_of_Online_Booking      0.00000
Onboard_Service             0.00000
Legroom                     0.00000
Baggage_Handling            0.00000
CheckIn_Service             0.00000
Cleanliness                 0.00000
Online_Boarding             0.00000
dtype: float64

**Decision Tree**

In [26]:
X_train

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
98890113,1.00000,1.00000,49.00000,1.00000,1,2023,64.00000,56.00000,1,,1,2,2,4,4,5,5,5,5,5,5,5,5
98854259,0.00000,1.00000,45.00000,1.00000,1,4879,160.00000,146.00000,3,,5,5,4,3,3,3,1,1,3,3,3,4,3
98858137,1.00000,1.00000,25.00000,,1,3779,0.00000,0.00000,3,,1,1,2,3,3,3,3,4,5,3,2,3,3
98823289,0.00000,0.00000,21.00000,1.00000,0,1928,0.00000,20.00000,5,,5,5,4,2,5,2,2,3,1,4,3,1,2
98831835,1.00000,0.00000,35.00000,,1,2331,2.00000,1.00000,2,,2,2,3,5,2,5,5,3,3,3,3,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98807581,1.00000,1.00000,57.00000,1.00000,1,741,12.00000,,4,,4,,4,3,4,5,4,4,4,4,3,4,5
98886703,1.00000,1.00000,34.00000,1.00000,1,87,0.00000,0.00000,2,,2,2,2,4,5,3,4,4,4,3,5,4,5
98836614,0.00000,1.00000,54.00000,1.00000,0,62,0.00000,0.00000,3,,3,,3,5,3,5,2,2,3,2,1,2,1
98800530,0.00000,1.00000,35.00000,1.00000,1,1227,14.00000,14.00000,1,,3,,3,5,4,4,1,1,1,1,4,1,3


In [27]:
y_train

98890113    1
98854259    0
98858137    0
98823289    1
98831835    0
           ..
98807581    1
98886703    1
98836614    1
98800530    0
98868265    0
Name: Overall_Experience, Length: 66065, dtype: int64

In [29]:
X_train

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
98890113,1.00000,1.00000,49.00000,1.00000,1,2023,64.00000,56.00000,1,,1,2,2,4,4,5,5,5,5,5,5,5,5
98854259,0.00000,1.00000,45.00000,1.00000,1,4879,160.00000,146.00000,3,,5,5,4,3,3,3,1,1,3,3,3,4,3
98858137,1.00000,1.00000,25.00000,,1,3779,0.00000,0.00000,3,,1,1,2,3,3,3,3,4,5,3,2,3,3
98823289,0.00000,0.00000,21.00000,1.00000,0,1928,0.00000,20.00000,5,,5,5,4,2,5,2,2,3,1,4,3,1,2
98831835,1.00000,0.00000,35.00000,,1,2331,2.00000,1.00000,2,,2,2,3,5,2,5,5,3,3,3,3,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98807581,1.00000,1.00000,57.00000,1.00000,1,741,12.00000,,4,,4,,4,3,4,5,4,4,4,4,3,4,5
98886703,1.00000,1.00000,34.00000,1.00000,1,87,0.00000,0.00000,2,,2,2,2,4,5,3,4,4,4,3,5,4,5
98836614,0.00000,1.00000,54.00000,1.00000,0,62,0.00000,0.00000,3,,3,,3,5,3,5,2,2,3,2,1,2,1
98800530,0.00000,1.00000,35.00000,1.00000,1,1227,14.00000,14.00000,1,,3,,3,5,4,4,1,1,1,1,4,1,3


In [30]:
dt = DecisionTreeClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)


In [31]:
dt.fit(X_train, y_train)

ValueError: ignored

In [None]:
np.any(np.isnan(df))


In [None]:
np.all(np.isfinite(df))


In [None]:
# Checking performance on the training dataset
y_train_pred_dt = dt.predict(X_train)

metrics_score(y_train, y_train_pred_dt)


In [None]:
# Checking performance on the test dataset
y_test_pred_dt = dt.predict(X_test)

metrics_score(y_test, y_test_pred_dt)


In [None]:
importances = dt.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance,importance_df.index)


In [None]:
features = list(X.columns)

plt.figure(figsize = (30, 20))

tree.plot_tree(dt, max_depth = 4, feature_names = features, filled = True, fontsize = 12, node_ids = True, class_names = True)

plt.show()


In [None]:
# Fitting the Random Forest classifier on the training data
rf_estimator = RandomForestClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)

rf_estimator.fit(X_train, y_train)


In [None]:
# Checking performance on the training data
y_pred_train_rf = rf_estimator.predict(X_train)

metrics_score(y_train, y_pred_train_rf)

In [None]:
# Checking performance on the testing data
y_pred_test_rf = rf_estimator.predict(X_test)

metrics_score(y_test, y_pred_test_rf)


In [None]:
importances = rf_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance, importance_df.index)
