Importing libraries

In [24]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier as RF

  Importing data

In [25]:
survey_data = pd.read_csv("survey_data.csv")

Printing the columns in the dataset

In [26]:
survey_data.columns

Index(['seat_comfort', 'seat_cleanliness', 'flight_attendant_courtesy',
       'flight_attendant_attentiveness', 'food_quality', 'was_flight_delayed',
       'delay_minutes', 'delay_handling', 'overall_customer_satisfaction'],
      dtype='object')

Checking whether there is any null values in any column of dataset

In [27]:
survey_data.isnull().sum()

seat_comfort                        0
seat_cleanliness                    0
flight_attendant_courtesy           0
flight_attendant_attentiveness      0
food_quality                        0
was_flight_delayed                  0
delay_minutes                     304
delay_handling                    304
overall_customer_satisfaction       0
dtype: int64

Slicing only the missing data to check whether there is any common pattern

In [28]:
missing_data = survey_data[survey_data.isnull().any(axis=1)]

Displaying few records of the missing data

In [29]:
missing_data.head()

Unnamed: 0,seat_comfort,seat_cleanliness,flight_attendant_courtesy,flight_attendant_attentiveness,food_quality,was_flight_delayed,delay_minutes,delay_handling,overall_customer_satisfaction
11,4,1,1,4,3,NO,,,4
13,5,1,2,4,2,NO,,,6
15,2,2,1,3,3,NO,,,2
20,4,4,2,5,3,NO,,,6
22,5,3,5,3,1,NO,,,8


Getting unique values of 'was_flight_delayed' field as I see that all rows displayed has values 'NO' 

In [30]:
len(missing_data['was_flight_delayed'].unique())

1

Replacing null values with 0 for column 'delay_minutes' and null value with 6 for column 'delay_handling'

In [31]:
survey_data.fillna(value={"delay_minutes":0, "delay_handling":6}, inplace=True)

Unnamed: 0,seat_comfort,seat_cleanliness,flight_attendant_courtesy,flight_attendant_attentiveness,food_quality,was_flight_delayed,delay_minutes,delay_handling,overall_customer_satisfaction
0,3,3,3,3,3,YES,12.0,4.0,6
1,2,5,1,4,2,YES,102.0,4.0,9
2,1,5,1,3,5,YES,117.0,4.0,8
3,5,5,4,5,4,YES,53.0,4.0,8
4,3,2,3,4,3,YES,20.0,5.0,6
5,4,1,2,1,2,YES,40.0,4.0,8
6,5,4,4,4,1,YES,11.0,2.0,6
7,5,1,5,3,3,YES,120.0,4.0,9
8,2,2,1,3,2,YES,49.0,5.0,8
9,5,5,5,3,1,YES,68.0,5.0,9


Changing datatype of column 'was_flight_delayed' to categorical variable

In [32]:
survey_data["was_flight_delayed"] = survey_data["was_flight_delayed"].astype("category").cat.codes

Splitting the target attribute from reamining attributes

In [33]:
X = survey_data.iloc[:,:-1]
Y = survey_data.iloc[:,-1]

Fitting random forest

In [34]:
model = RF(n_estimators=200, min_samples_leaf=5, max_depth=15, max_leaf_nodes=100)
model.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=100,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Displaying the attributes based on their importance in descending order

In [35]:
sorted(list(zip(X.columns, model.feature_importances_ )), key=lambda x:x[1], reverse=True)

[('seat_comfort', 0.21249507793907699),
 ('flight_attendant_courtesy', 0.19043280787152705),
 ('delay_handling', 0.18477550751434005),
 ('delay_minutes', 0.17653534627696527),
 ('food_quality', 0.10264542063195503),
 ('flight_attendant_attentiveness', 0.055549515895906516),
 ('seat_cleanliness', 0.050001289087814607),
 ('was_flight_delayed', 0.027565034782414375)]