## 1. Import Libraries
#### Let us import the required libraries and functions

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import scipy.stats

import warnings
warnings.filterwarnings('ignore')

## 2. Read Data
### Read and display data to gey insights from the data

In [2]:
data  = pd.read_csv('airline_passenger_satisfaction.csv') # read csv file using pandas
data.head() # display first 5 records of the dataframe

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied


In [3]:
data.shape

(129880, 24)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ID                                      129880 non-null  int64  
 1   Gender                                  129880 non-null  object 
 2   Age                                     129880 non-null  int64  
 3   Customer Type                           129880 non-null  object 
 4   Type of Travel                          129880 non-null  object 
 5   Class                                   129880 non-null  object 
 6   Flight Distance                         129880 non-null  int64  
 7   Departure Delay                         129880 non-null  int64  
 8   Arrival Delay                           129487 non-null  float64
 9   Departure and Arrival Time Convenience  129880 non-null  int64  
 10  Ease of Online Booking                  1298

In [5]:
data.drop('ID',axis=1,inplace=True)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 23 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Gender                                  129880 non-null  object 
 1   Age                                     129880 non-null  int64  
 2   Customer Type                           129880 non-null  object 
 3   Type of Travel                          129880 non-null  object 
 4   Class                                   129880 non-null  object 
 5   Flight Distance                         129880 non-null  int64  
 6   Departure Delay                         129880 non-null  int64  
 7   Arrival Delay                           129487 non-null  float64
 8   Departure and Arrival Time Convenience  129880 non-null  int64  
 9   Ease of Online Booking                  129880 non-null  int64  
 10  Check-in Service                        1298

In [7]:
# drop na values 
na_values = data.isnull().sum()
data = data.dropna()

## <font color='red'>Find the p-value for each column with target column</font>

In [8]:
df = data.copy()

In [9]:
df = df.dropna()

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129487 entries, 0 to 129879
Data columns (total 23 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Gender                                  129487 non-null  object 
 1   Age                                     129487 non-null  int64  
 2   Customer Type                           129487 non-null  object 
 3   Type of Travel                          129487 non-null  object 
 4   Class                                   129487 non-null  object 
 5   Flight Distance                         129487 non-null  int64  
 6   Departure Delay                         129487 non-null  int64  
 7   Arrival Delay                           129487 non-null  float64
 8   Departure and Arrival Time Convenience  129487 non-null  int64  
 9   Ease of Online Booking                  129487 non-null  int64  
 10  Check-in Service                        1294

In [11]:
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
df['Satisfaction'] = lb.fit_transform(df['Satisfaction'])
df['Gender'] = lb.fit_transform(df['Gender'])
df['Customer Type'] = lb.fit_transform(df['Customer Type'])
df['Type of Travel'] = lb.fit_transform(df['Type of Travel'])
df['Class'] = lb.fit_transform(df['Class'])

In [12]:
df['Arrival Delay'] = df['Arrival Delay'].astype(int)

In [13]:
from scipy.stats import pearsonr

corr_df = pd.DataFrame(columns=['Correlation','P_value'])

for col in df:
    if pd.api.types.is_numeric_dtype(df[col]) and col != 'Satisfaction':
        r, p = pearsonr(df['Satisfaction'],df[col])
        corr_df.loc[col] = [round(r,3),round(p,3)]
        
print('Correlation between Target with Columns')
print('++++++++++++++++++++++++++++++++++++++++')
corr_df

Correlation between Target with Columns
++++++++++++++++++++++++++++++++++++++++


Unnamed: 0,Correlation,P_value
Gender,0.011,0.0
Age,0.134,0.0
Customer Type,0.186,0.0
Type of Travel,-0.45,0.0
Class,-0.448,0.0
Flight Distance,0.298,0.0
Departure Delay,-0.051,0.0
Arrival Delay,-0.058,0.0
Departure and Arrival Time Convenience,-0.054,0.0
Ease of Online Booking,0.169,0.0


## <font color='red'> Get Chi square test for each combination</font>

In [14]:
num_df = data.select_dtypes(include=np.number)
num_df.head()

Unnamed: 0,Age,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Ease of Online Booking,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling
0,48,821,2,5.0,3,3,4,3,3,3,5,2,5,5,5,3,5,5
1,35,821,26,39.0,2,2,3,5,2,5,4,5,5,3,5,2,5,5
2,41,853,0,0.0,4,4,4,5,4,3,5,3,5,5,3,4,3,3
3,50,1905,0,0.0,2,2,3,4,2,5,5,5,4,4,5,2,5,5
4,49,3470,0,1.0,3,3,3,5,3,3,4,4,5,4,3,3,3,3


In [15]:
cat_df = data.select_dtypes(exclude=np.number)
cat_df.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,Satisfaction
0,Male,First-time,Business,Business,Neutral or Dissatisfied
1,Female,Returning,Business,Business,Satisfied
2,Male,Returning,Business,Business,Satisfied
3,Male,Returning,Business,Business,Satisfied
4,Female,Returning,Business,Business,Satisfied


In [16]:
cat_df.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,Satisfaction
0,Male,First-time,Business,Business,Neutral or Dissatisfied
1,Female,Returning,Business,Business,Satisfied
2,Male,Returning,Business,Business,Satisfied
3,Male,Returning,Business,Business,Satisfied
4,Female,Returning,Business,Business,Satisfied


In [17]:
ct_table_ind = pd.crosstab(cat_df["Satisfaction"],cat_df["Gender"],margins=True)
print('contingency_table :\n')
ct_table_ind

contingency_table :



Gender,Female,Male,All
Satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Neutral or Dissatisfied,37524,35701,73225
Satisfied,28179,28083,56262
All,65703,63784,129487


In [18]:
from scipy.stats import chi2_contingency
chi2, p, dof, ex = chi2_contingency(ct_table_ind, correction=True)
print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

Chi2 result of the contingency table: 17.1131132378773, p-value: 0.0018375370881357966


### <font color='blue'>We can reject the null hypothesis as the p-value is less than 0.05. Thus, the results indicate that there is a relationship between Gender with Satisfaction</font>

In [19]:
ct_table_CT = pd.crosstab(cat_df["Satisfaction"],cat_df["Customer Type"],margins=True)
print('contingency_table :\n')
ct_table_CT

contingency_table :



Customer Type,First-time,Returning,All
Satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Neutral or Dissatisfied,18026,55199,73225
Satisfied,5688,50574,56262
All,23714,105773,129487


In [20]:
from scipy.stats import chi2_contingency
chi2, p, dof, ex = chi2_contingency(ct_table_CT, correction=True)
print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

Chi2 result of the contingency table: 4476.124756035881, p-value: 0.0


### <font color='blue'>We can reject the null hypothesis as the p-value is less than 0.05. Thus, the results indicate that there is a relationship between Customer Type with Satisfaction</font>

In [21]:
ct_table_TT = pd.crosstab(cat_df["Satisfaction"],cat_df["Type of Travel"],margins=True)
print('contingency_table :\n')
ct_table_TT

contingency_table :



Type of Travel,Business,Personal,All
Satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Neutral or Dissatisfied,37238,35987,73225
Satisfied,52207,4055,56262
All,89445,40042,129487


In [22]:
from scipy.stats import chi2_contingency
chi2, p, dof, ex = chi2_contingency(ct_table_TT, correction=True)
print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

Chi2 result of the contingency table: 26197.103065161136, p-value: 0.0


### <font color='blue'>We can reject the null hypothesis as the p-value is less than 0.05. Thus, the results indicate that there is a relationship between Type of Travel with Satisfaction</font>

In [23]:
ct_table_CL = pd.crosstab(cat_df["Satisfaction"],cat_df["Class"],margins=True)
print('contingency_table :\n')
ct_table_CL

contingency_table :



Class,Business,Economy,Economy Plus,All
Satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Neutral or Dissatisfied,18940,47215,7070,73225
Satisfied,43050,10902,2310,56262
All,61990,58117,9380,129487


In [24]:
from scipy.stats import chi2_contingency
chi2, p, dof, ex = chi2_contingency(ct_table_CL, correction=True)
print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

Chi2 result of the contingency table: 32823.12402964362, p-value: 0.0


### <font color='blue'>We can reject the null hypothesis as the p-value is less than 0.05. Thus, the results indicate that there is a relationship between Class with Satisfaction</font>

## <font color='red'>Perform ANOVA Test Among each combination</font>

In [25]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [26]:
df1 = df.copy()
df1.columns

Index(['Gender', 'Age', 'Customer Type', 'Type of Travel', 'Class',
       'Flight Distance', 'Departure Delay', 'Arrival Delay',
       'Departure and Arrival Time Convenience', 'Ease of Online Booking',
       'Check-in Service', 'Online Boarding', 'Gate Location',
       'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness',
       'Food and Drink', 'In-flight Service', 'In-flight Wifi Service',
       'In-flight Entertainment', 'Baggage Handling', 'Satisfaction'],
      dtype='object')

In [27]:
# remove spaces in columns name
df1.columns = df1.columns.str.replace(' ','_')

In [28]:
df1.columns

Index(['Gender', 'Age', 'Customer_Type', 'Type_of_Travel', 'Class',
       'Flight_Distance', 'Departure_Delay', 'Arrival_Delay',
       'Departure_and_Arrival_Time_Convenience', 'Ease_of_Online_Booking',
       'Check-in_Service', 'Online_Boarding', 'Gate_Location',
       'On-board_Service', 'Seat_Comfort', 'Leg_Room_Service', 'Cleanliness',
       'Food_and_Drink', 'In-flight_Service', 'In-flight_Wifi_Service',
       'In-flight_Entertainment', 'Baggage_Handling', 'Satisfaction'],
      dtype='object')

## While working with full data we facing memory error. To resolve this problem we will take sample data from original dataset.

In [29]:
# Generating 25% sample of data frame
df_sub = df1.sample(frac=0.25, random_state=2)
df_sub.head()

Unnamed: 0,Gender,Age,Customer_Type,Type_of_Travel,Class,Flight_Distance,Departure_Delay,Arrival_Delay,Departure_and_Arrival_Time_Convenience,Ease_of_Online_Booking,...,On-board_Service,Seat_Comfort,Leg_Room_Service,Cleanliness,Food_and_Drink,In-flight_Service,In-flight_Wifi_Service,In-flight_Entertainment,Baggage_Handling,Satisfaction
16085,0,53,1,0,0,303,4,0,0,0,...,1,2,1,3,3,1,0,1,1,1
79087,0,60,1,1,1,226,0,0,4,2,...,5,5,2,3,4,5,2,5,2,0
121469,0,29,1,1,1,331,0,0,5,3,...,4,4,4,4,4,5,3,4,5,0
9645,0,52,1,0,0,3354,10,0,5,5,...,4,4,4,3,2,4,5,4,4,1
62828,0,58,1,1,1,2338,0,0,5,3,...,4,4,3,4,4,4,3,4,4,0


In [43]:
# Two-Way ANOVA Result:
new = ols('Satisfaction ~ C(Gender) + C(Age) + C(Gender):C(Age)',data=df_sub).fit()
an = sm.stats.anova_lm(new,type=2)
an

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Gender),1.0,0.815694,0.815694,3.629055,0.056788
C(Age),74.0,689.522886,9.317877,41.455585,0.0
C(Gender):C(Age),74.0,16.87132,0.227991,1.01434,0.44382
Residual,32222.0,7242.465127,0.224768,,


We can also see that each of the two factors (Gender and Age) were statistically significant with the following p-values:

>- P-value of Gender: 0.056788
>- P-value of Age: 0.00000
>- P-value of Gender*Age: 0.443820
    
Since the p-values for Gender and Age are both less than .05, this means that both factors have a statistically significant effect on Customer Satisfaction.

And since the p-value for the interaction effect (0.443820) is not less than .05, this tells us that there is no significant interaction effect between Gender and Age.

In [32]:
new1 = ols('Satisfaction ~ C(Gender) + C(Customer_Type) + C(Gender):C(Customer_Type)',data=df_sub).fit()
an1 = sm.stats.anova_lm(new1,type=2)
an1

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Gender),1.0,0.815694,0.815694,3.438475,0.06370356
C(Customer_Type),1.0,269.711068,269.711068,1136.93884,5.4331619999999994e-245
C(Gender):C(Customer_Type),1.0,0.628071,0.628071,2.647567,0.1037182
Residual,32368.0,7678.520195,0.237226,,


We can also see that each of the two factors (Gender and Customer_Type) were statistically significant with the following p-values:

>- P-value of Gender: 6.370356e-02
>- P-value of Customer_Type: 5.433162e-245
>- P-value of Gender*Customer_Type: 1.037182e-01
    
Since the p-values for Gender, Customer_Type and their interaction effect are both less than .05, this means that both factors have a statistically significant effect on Customer Satisfaction.


In [49]:
# Tukey’s Test – useful when you want to make every possible pairwise comparison

from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

# perform Tukey's test
mc_gender = MultiComparison(df_sub['Satisfaction'],df_sub['In-flight_Entertainment'])
mc_gender_result = mc_gender.tukeyhsd(alpha=0.05)
mc_gender_result.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
0,1,0.1421,0.9,-0.3413,0.6254,False
0,2,0.2035,0.8158,-0.2797,0.6868,False
0,3,0.276,0.569,-0.2072,0.7592,False
0,4,0.6131,0.004,0.13,1.0963,True
0,5,0.6525,0.0017,0.1693,1.1356,True
1,2,0.0615,0.001,0.0347,0.0882,True
1,3,0.1339,0.001,0.1077,0.1601,True
1,4,0.4711,0.001,0.4467,0.4955,True
1,5,0.5104,0.001,0.4854,0.5354,True
2,3,0.0725,0.001,0.0486,0.0963,True


In [34]:
new2 = ols('Satisfaction ~ C(Gender) + C(Ease_of_Online_Booking) + C(Gender):C(Ease_of_Online_Booking)',data=df_sub).fit()
an2 = sm.stats.anova_lm(new2,type=2)
an2

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Gender),1.0,0.815694,0.815694,3.69697,0.05452
C(Ease_of_Online_Booking),5.0,807.731645,161.546329,732.176085,0.0
C(Gender):C(Ease_of_Online_Booking),5.0,1.261585,0.252317,1.143576,0.33467
Residual,32360.0,7139.866103,0.220639,,


We can also see that each of the two factors (Gender and Ease_of_Online_Booking) were statistically significant with the following p-values:

>- P-value of Gender: 0.05452
>- P-value of Ease_of_Online_Booking: 0.05452
>- P-value of Gender*Ease_of_Online_Booking: 0.33467
    
Since the p-values for Gender and Ease_of_Online_Booking are both less than .05, this means that both factors have a statistically significant effect on Customer Satisfaction.

And since the p-value for the interaction effect (0.33467) is not less than .05, this tells us that there is no significant interaction effect between Gender and Ease_of_Online_Booking.

In [35]:
new3 = ols('Satisfaction ~ C(Class) + C(Seat_Comfort) + C(Class):C(Seat_Comfort)',data=df_sub).fit()
an3 = sm.stats.anova_lm(new3,type=2)
an3

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Class),2.0,2051.923276,1025.961638,6581.737396,0.0
C(Seat_Comfort),4.0,598.133933,149.533483,959.285496,0.0
C(Class):C(Seat_Comfort),8.0,255.806629,31.975829,205.130971,0.0
Residual,32357.0,5043.811189,0.15588,,


In [39]:
new4 = ols('Satisfaction ~ C(Departure_Delay) + C(Departure_and_Arrival_Time_Convenience) + C(Departure_Delay):C(Departure_and_Arrival_Time_Convenience)',data=df_sub).fit()
an4 = sm.stats.anova_lm(new4,type=2)
an4

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Departure_Delay),337.0,114.270556,0.339082,1.392852,2.728915e-06
C(Departure_and_Arrival_Time_Convenience),5.0,34.948644,6.989729,28.71183,3.669232e-29
C(Departure_Delay):C(Departure_and_Arrival_Time_Convenience),1685.0,408.088393,0.242189,0.994844,0.5538728
Residual,31164.0,7586.695567,0.243444,,


In [40]:
new5 = ols('Satisfaction ~ C(Arrival_Delay) + C(Departure_and_Arrival_Time_Convenience) + C(Arrival_Delay):C(Departure_and_Arrival_Time_Convenience)',data=df_sub).fit()
an5 = sm.stats.anova_lm(new5,type=2)
an5

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Arrival_Delay),341.0,160.853744,0.471712,1.94887,1.033135e-22
C(Departure_and_Arrival_Time_Convenience),5.0,35.422439,7.084488,29.269444,9.424473e-30
C(Arrival_Delay):C(Departure_and_Arrival_Time_Convenience),1705.0,411.549822,0.241378,0.99725,0.5270063
Residual,31164.0,7543.053406,0.242044,,


In [51]:
new6 = ols('''Satisfaction ~ C(Customer_Type) + C(Seat_Comfort) + C(Cleanliness) + C(Food_and_Drink)
            + C(Customer_Type):C(Seat_Comfort) + C(Customer_Type):C(Cleanliness) + C(Customer_Type):C(Food_and_Drink)
            + C(Seat_Comfort):C(Cleanliness) + C(Seat_Comfort):C(Food_and_Drink) + C(Cleanliness):C(Food_and_Drink) 
            + C(Customer_Type):C(Seat_Comfort):C(Cleanliness):C(Food_and_Drink)''',data=df_sub).fit()
an6 = sm.stats.anova_lm(new6,type=2)
an6 

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Customer_Type),1.0,270.470209,270.470209,1516.612163,5e-324
C(Seat_Comfort),4.0,1048.260871,262.065218,1469.482715,0.0
C(Cleanliness),5.0,132.114726,26.422945,148.161826,4.73145e-156
C(Food_and_Drink),5.0,39.306184,7.861237,44.080445,1.756009e-45
C(Customer_Type):C(Seat_Comfort),4.0,225.946623,56.486656,316.738577,8.843308000000001e-268
C(Customer_Type):C(Cleanliness),5.0,21.536314,4.307263,24.152187,2.411172e-24
C(Customer_Type):C(Food_and_Drink),5.0,12.47497,2.494994,13.990222,1.086242e-13
C(Seat_Comfort):C(Cleanliness),20.0,184.813761,9.240688,51.815466,1.8633769999999998e-203
C(Seat_Comfort):C(Food_and_Drink),20.0,45.290393,2.26452,12.697879,2.793767e-42
C(Cleanliness):C(Food_and_Drink),25.0,125.402944,5.016118,28.126962,1.007513e-130


We can also see that each of the four factors (Customer_Type, Seat_Comfort, Cleanliness and Cleanliness) were statistically significant with the following p-values:

Since the p-values for Customer_Type, Seat_Comfort, Cleanliness, Cleanliness and interaction effect are both less than .05, this means that both factors have a statistically significant effect on Customer Satisfaction.


In [50]:
new7 = ols('''Satisfaction ~ C(Class) + C(Seat_Comfort) + C(Cleanliness) + C(Food_and_Drink)
            + C(Class):C(Seat_Comfort) + C(Class):C(Cleanliness) + C(Class):C(Food_and_Drink)
            + C(Seat_Comfort):C(Cleanliness) + C(Seat_Comfort):C(Food_and_Drink) + C(Cleanliness):C(Food_and_Drink) 
            + C(Class):C(Seat_Comfort):C(Cleanliness):C(Food_and_Drink)''',data=df_sub).fit()
an7 = sm.stats.anova_lm(new7,type=2)
an7 

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Class),2.0,2051.923276,1025.961638,7239.838396,0.0
C(Seat_Comfort),4.0,598.133933,149.533483,1055.203443,0.0
C(Cleanliness),5.0,119.561377,23.912275,168.740236,9.234814e-178
C(Food_and_Drink),5.0,17.543729,3.508746,24.759944,5.508388e-25
C(Class):C(Seat_Comfort),8.0,245.6783,30.709787,216.707809,0.0
C(Class):C(Cleanliness),10.0,59.74846,5.974846,42.162317,8.808372e-84
C(Class):C(Food_and_Drink),10.0,18.91294,1.891294,13.346174,1.037489e-23
C(Seat_Comfort):C(Cleanliness),20.0,72.218712,3.610936,25.48106,1.821762e-94
C(Seat_Comfort):C(Food_and_Drink),20.0,19.246042,0.962302,6.790616,3.4985889999999997e-19
C(Cleanliness):C(Food_and_Drink),25.0,29.93845,1.197538,8.45059,2.699425e-31


We can also see that each of the four factors (Class, Seat_Comfort, Cleanliness and Cleanliness) were statistically significant with the following p-values:

Since the p-values for Class, Seat_Comfort, Cleanliness, Cleanliness and interaction effect are both less than .05, this means that both factors have a statistically significant effect on Customer Satisfaction.
