Step 1. Load & Understand Data

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



# Read the raw survey export and report its (rows, columns) shape.
SURVEY_CSV = 'Airline Customer Satisfaction Survey.csv'
df = pd.read_csv(SURVEY_CSV)
print(df.shape)

(25976, 25)


In [37]:
# Inspect column names, non-null counts and dtypes of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Gender                             25976 non-null  object 
 3   Customer Type                      25976 non-null  object 
 4   Age                                25976 non-null  int64  
 5   Type of Travel                     25976 non-null  object 
 6   Class                              25976 non-null  object 
 7   Flight Distance                    25976 non-null  int64  
 8   Inflight wifi service              25976 non-null  int64  
 9   Departure/Arrival time convenient  25976 non-null  int64  
 10  Ease of Online booking             25976 non-null  int64  
 11  Gate location                      25976 non-null  int

In [38]:
# Drop 'Unnamed: 0' and 'id' so they are not carried forward as features
# (presumably a saved row index and a record identifier -- TODO confirm).
df = df.drop(columns=['Unnamed: 0', 'id'])

Categorical variables identified: Gender, Customer Type, Type of Travel, Class, satisfaction

In [39]:
# One-hot encode every categorical column (drop_first=True drops the first
# level of each one), then cast the whole frame to float in the same chain.
# The response `satisfaction` is encoded here as well and becomes the
# 0/1 column `satisfaction_satisfied`.
df = pd.get_dummies(df, drop_first=True).astype(float)
df

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,52.0,160.0,5.0,4.0,3.0,4.0,3.0,4.0,3.0,5.0,...,5.0,5.0,50.0,44.0,0.0,0.0,0.0,1.0,0.0,1.0
1,36.0,2863.0,1.0,1.0,3.0,1.0,5.0,4.0,5.0,4.0,...,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,20.0,192.0,2.0,0.0,2.0,4.0,2.0,2.0,2.0,2.0,...,2.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,44.0,3377.0,0.0,0.0,0.0,2.0,3.0,4.0,4.0,1.0,...,1.0,4.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0
4,49.0,1182.0,2.0,3.0,4.0,3.0,4.0,1.0,2.0,2.0,...,2.0,4.0,0.0,20.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,34.0,526.0,3.0,3.0,3.0,1.0,4.0,3.0,4.0,4.0,...,5.0,4.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
25972,23.0,646.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,5.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25973,17.0,828.0,2.0,5.0,1.0,5.0,2.0,1.0,2.0,2.0,...,4.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25974,14.0,1127.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,...,5.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [40]:
# Re-check the schema after encoding: dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                25976 non-null  float64
 1   Flight Distance                    25976 non-null  float64
 2   Inflight wifi service              25976 non-null  float64
 3   Departure/Arrival time convenient  25976 non-null  float64
 4   Ease of Online booking             25976 non-null  float64
 5   Gate location                      25976 non-null  float64
 6   Food and drink                     25976 non-null  float64
 7   Online boarding                    25976 non-null  float64
 8   Seat comfort                       25976 non-null  float64
 9   Inflight entertainment             25976 non-null  float64
 10  On-board service                   25976 non-null  float64
 11  Leg room service                   25976 non-null  flo

Treatment of NA values: drop the rows that contain missing values rather than dropping the affected columns.

In [41]:
# Discard every row that has a missing value in any column; all columns are kept.
df = df.dropna(axis=0, how='any')

In [42]:
# Row/column count after removing the rows with missing values.
print(df.shape)

(25893, 24)


In [43]:
# Double-check that no missing values remain: count NAs per column.
df.isna().sum()

Age                                  0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
Gender_Male                          0
Customer Type_disloyal Customer      0
Type of Travel_Personal Travel       0
Class_Eco                            0
Class_Eco Plus                       0
satisfaction_satisfied               0
dtype: int64

Separate the predictors from the response, then split the data into train and test sets

In [46]:
import statsmodels.api as sm

# Response: the 0/1 dummy created from the `satisfaction` column.
y = df['satisfaction_satisfied']

# Predictors: every remaining column, with an explicit intercept column
# ('const') added so the statsmodels fit includes an intercept term.
X = sm.add_constant(df.drop(columns='satisfaction_satisfied'))

X.dtypes

const                                float64
Age                                  float64
Flight Distance                      float64
Inflight wifi service                float64
Departure/Arrival time convenient    float64
Ease of Online booking               float64
Gate location                        float64
Food and drink                       float64
Online boarding                      float64
Seat comfort                         float64
Inflight entertainment               float64
On-board service                     float64
Leg room service                     float64
Baggage handling                     float64
Checkin service                      float64
Inflight service                     float64
Cleanliness                          float64
Departure Delay in Minutes           float64
Arrival Delay in Minutes             float64
Gender_Male                          float64
Customer Type_disloyal Customer      float64
Type of Travel_Personal Travel       float64
Class_Eco 

In [50]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# X already carries the intercept column ('const') that was added before the
# split, so both partitions inherit it and nothing has to be re-added to the
# test set. Fail loudly if that assumption is ever broken upstream.
assert 'const' in X_train.columns and 'const' in X_test.columns

# Fit OLS regression model on the training rows only.
# NOTE(review): the response is a 0/1 dummy, so this is a linear probability
# model and its predictions are not confined to [0, 1] -- consider sm.Logit.
model = sm.OLS(y_train, X_train)
results = model.fit()

# Make predictions on the test set
y_pred = results.predict(X_test)

# Print regression summary
print(results.summary())

                              OLS Regression Results                              
Dep. Variable:     satisfaction_satisfied   R-squared:                       0.553
Model:                                OLS   Adj. R-squared:                  0.552
Method:                     Least Squares   F-statistic:                     972.6
Date:                    Sat, 10 Jun 2023   Prob (F-statistic):               0.00
Time:                            14:28:29   Log-Likelihood:                -5714.9
No. Observations:                   18125   AIC:                         1.148e+04
Df Residuals:                       18101   BIC:                         1.167e+04
Df Model:                              23                                         
Covariance Type:                nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [51]:
# Sanity-check the training partition: response first, then predictors.
for train_part in (y_train, X_train):
    print(train_part.shape)

(18125,)
(18125, 24)


In [52]:
# Sanity-check the test partition: response first, then predictors.
for test_part in (y_test, X_test):
    print(test_part.shape)

(7768,)
(7768, 24)


Scale the train and test data with `StandardScaler()`

In [53]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows
# only, so no information from the test set leaks into the scaling.
# NOTE(review): X_train still contains the all-ones 'const' column; it has
# zero variance, so it presumably comes out of the scaler as all zeros and no
# longer acts as an intercept -- confirm whether downstream models need it.
scaler = StandardScaler().fit(X_train)

# Apply the training-set statistics to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
