# Overview

In [8]:
import pandas as pd

# Read the raw training set and tidy its header in one pass:
#   1. discard the leftover "Unnamed: 0" column (the first column of the file),
#   2. title-case every column name,
#   3. restore the identifier column's name from "Id" to "ID".
og_data = (
    pd.read_csv("data/train.csv")
    .drop(columns=["Unnamed: 0"])
    .rename(columns=str.title)
    .rename(columns={"Id": "ID"})
)

# Write the cleaned frame to a separate file, leaving the row index out.
og_data.to_csv("data/train_clean.csv", index=False)

# Show the cleaned frame as this cell's output.
og_data


Unnamed: 0,ID,Gender,Customer Type,Age,Type Of Travel,Class,Flight Distance,Inflight Wifi Service,Departure/Arrival Time Convenient,Ease Of Online Booking,...,Inflight Entertainment,On-Board Service,Leg Room Service,Baggage Handling,Checkin Service,Inflight Service,Cleanliness,Departure Delay In Minutes,Arrival Delay In Minutes,Satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


In [9]:
# Count the missing (NaN) entries in each column of the cleaned frame.
og_data.isna().sum()


ID                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type Of Travel                         0
Class                                  0
Flight Distance                        0
Inflight Wifi Service                  0
Departure/Arrival Time Convenient      0
Ease Of Online Booking                 0
Gate Location                          0
Food And Drink                         0
Online Boarding                        0
Seat Comfort                           0
Inflight Entertainment                 0
On-Board Service                       0
Leg Room Service                       0
Baggage Handling                       0
Checkin Service                        0
Inflight Service                       0
Cleanliness                            0
Departure Delay In Minutes             0
Arrival Delay In Minutes             310
Satisfaction                           0
dtype: int64

In [10]:
# Number of distinct values in each column (missing values are not counted).
og_data.nunique(axis=0, dropna=True)


ID                                   103904
Gender                                    2
Customer Type                             2
Age                                      75
Type Of Travel                            2
Class                                     3
Flight Distance                        3802
Inflight Wifi Service                     6
Departure/Arrival Time Convenient         6
Ease Of Online Booking                    6
Gate Location                             6
Food And Drink                            6
Online Boarding                           6
Seat Comfort                              6
Inflight Entertainment                    6
On-Board Service                          6
Leg Room Service                          6
Baggage Handling                          5
Checkin Service                           6
Inflight Service                          6
Cleanliness                               6
Departure Delay In Minutes              446
Arrival Delay In Minutes        

# Analysis

In [11]:
from helper_functions import extended_describe

# Work on an independent (deep) copy so the analysis below cannot modify og_data.
data = og_data.copy(deep=True)


In [12]:
# Build the extended summary once and reuse it for both the CSV export and the
# cell output, instead of running the helper twice on the same frame.
# NOTE(review): assumes extended_describe has no side effects and returns the
# same table for the same input -- confirm in helper_functions.
extended_summary = extended_describe(data)

# Save the summary; the row labels are written as the first CSV column because
# to_csv keeps the index by default.
extended_summary.to_csv("data/eda/extended_describe.csv")

# Show the same table that was just written to disk.
extended_summary


Unnamed: 0,ID,Age,Flight Distance,Inflight Wifi Service,Departure/Arrival Time Convenient,Ease Of Online Booking,Gate Location,Food And Drink,Online Boarding,Seat Comfort,Inflight Entertainment,On-Board Service,Leg Room Service,Baggage Handling,Checkin Service,Inflight Service,Cleanliness,Departure Delay In Minutes,Arrival Delay In Minutes
Total de valores,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103594.0
Media,64924.21,39.38,1189.45,2.73,3.06,2.76,2.98,3.2,3.25,3.44,3.36,3.38,3.35,3.63,3.3,3.64,3.29,14.82,15.18
Desviación estándar,37463.81,15.11,997.15,1.33,1.53,1.4,1.28,1.33,1.35,1.32,1.33,1.29,1.32,1.18,1.27,1.18,1.31,38.23,38.7
Mínimo,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Q1,32533.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
Q2,64856.5,40.0,843.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
Q3,97368.25,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
Máximo,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0
RIC,64834.5,24.0,1329.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,12.0,13.0
Asimetría,0.0,-0.0,1.11,0.04,-0.33,-0.02,-0.06,-0.15,-0.45,-0.48,-0.37,-0.42,-0.35,-0.68,-0.36,-0.69,-0.3,6.73,6.6
