# Overview

In [1]:
import pandas as pd

# Read the two source CSV exports (first file, then second).
air1, air2 = (
    pd.read_csv(csv_path)
    for csv_path in ("data/airline1.csv", "data/airline2.csv")
)

# Stack both frames vertically; ignore_index=True renumbers the rows 0..n-1.
og_data = pd.concat((air1, air2), ignore_index=True)

# Persist the combined frame.
# NOTE(review): with the default index=True the row index is written as a
# leading unnamed column, unlike the In [6] export which passes index=False —
# confirm this is intended.
og_data.to_csv(path_or_buf="data/airline_merged.csv")


In [2]:
# Normalise the column layout in one chain:
#   1. drop the leftover "Unnamed: 0" column; errors="ignore" keeps this cell
#      re-runnable (a second run used to raise KeyError because the column was
#      already gone),
#   2. title-case every column name,
#   3. restore the conventional spelling "ID" (title-casing yields "Id").
og_data = (
    og_data
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .rename(columns=str.title)
    .rename(columns={"Id": "ID"})
)

og_data


Unnamed: 0,ID,Gender,Customer Type,Age,Type Of Travel,Class,Flight Distance,Inflight Wifi Service,Departure/Arrival Time Convenient,Ease Of Online Booking,...,Inflight Entertainment,On-Board Service,Leg Room Service,Baggage Handling,Checkin Service,Inflight Service,Cleanliness,Departure Delay In Minutes,Arrival Delay In Minutes,Satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,...,4,3,2,4,4,5,4,0,0.0,neutral or dissatisfied
129876,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,...,4,4,5,5,5,5,4,0,0.0,satisfied
129877,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,2,4,3,4,5,4,2,0,0.0,neutral or dissatisfied
129878,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,...,4,3,2,5,4,5,4,0,0.0,satisfied


In [3]:
# Class balance: number of rows for each Satisfaction label
# ('satisfied' vs. 'neutral or dissatisfied').
og_data.loc[:, "Satisfaction"].value_counts()


Satisfaction
neutral or dissatisfied    73452
satisfied                  56428
Name: count, dtype: int64

In [4]:
# Split the rows by satisfaction label.
is_satisfied = og_data["Satisfaction"] == "satisfied"
not_satisfied = og_data[~is_satisfied]

# Keep only the 'satisfied' rows in og_data.
og_data = og_data[is_satisfied]

# Balance the classes by dropping the leading surplus rows from not_satisfied.
# The surplus is derived from the data instead of being hard-coded (it was the
# literal 17024 = 73452 - 56428, while the old comment claimed 13854), and it
# is clamped at 0 so that a negative value can never turn the slice into
# "keep the last k rows" when not_satisfied is already the smaller class.
n_excess = max(len(not_satisfied) - len(og_data), 0)
not_satisfied = not_satisfied.iloc[n_excess:]
not_satisfied["Satisfaction"].value_counts()


Satisfaction
neutral or dissatisfied    56428
Name: count, dtype: int64

In [5]:
# Append the trimmed not_satisfied rows below the satisfied rows
# (original index labels are kept), then re-check the class balance.
og_data = pd.concat(objs=(og_data, not_satisfied), axis=0)
og_data.loc[:, "Satisfaction"].value_counts()


Satisfaction
satisfied                  56428
neutral or dissatisfied    56428
Name: count, dtype: int64

In [6]:
# Write the balanced frame to its own CSV, without the index column.
og_data.to_csv(path_or_buf="data/airline_merged_clean.csv", index=False)


In [7]:
# Count the missing (NaN) entries in every column.
og_data.isna().sum()


ID                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type Of Travel                         0
Class                                  0
Flight Distance                        0
Inflight Wifi Service                  0
Departure/Arrival Time Convenient      0
Ease Of Online Booking                 0
Gate Location                          0
Food And Drink                         0
Online Boarding                        0
Seat Comfort                           0
Inflight Entertainment                 0
On-Board Service                       0
Leg Room Service                       0
Baggage Handling                       0
Checkin Service                        0
Inflight Service                       0
Cleanliness                            0
Departure Delay In Minutes             0
Arrival Delay In Minutes             350
Satisfaction                           0
dtype: int64

In [8]:
# Number of distinct non-null values in each column.
og_data.nunique(axis=0, dropna=True)


ID                                   112856
Gender                                    2
Customer Type                             2
Age                                      75
Type Of Travel                            2
Class                                     3
Flight Distance                        3797
Inflight Wifi Service                     6
Departure/Arrival Time Convenient         6
Ease Of Online Booking                    6
Gate Location                             6
Food And Drink                            6
Online Boarding                           6
Seat Comfort                              5
Inflight Entertainment                    6
On-Board Service                          6
Leg Room Service                          6
Baggage Handling                          5
Checkin Service                           5
Inflight Service                          6
Cleanliness                               6
Departure Delay In Minutes              452
Arrival Delay In Minutes        

# Analysis

In [9]:
from helper_functions import extended_describe

# Work on an independent (deep) copy so later steps that modify `data`
# do not modify og_data.
data = og_data.copy(deep=True)


In [10]:
# Build the extended summary once, then both save and display that same
# object. The cell previously called extended_describe(data) twice, doing the
# work twice and leaving room for the saved and displayed tables to differ.
describe_table = extended_describe(data)
describe_table.to_csv("data/eda/extended_describe.csv")
describe_table