# Data Analysis Notebook

In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')
%matplotlib inline
from IPython.display import Markdown
    

In [None]:


# rcParams overrides shared by every theme; only the base style sheet differs.
_THEME_RC_PARAMS = {
    'figure.figsize': (15, 6),
    'font.size': 16,
    'font.weight': 'bold',
    'axes.titlesize': 20,
    'axes.labelsize': 18,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.spines.top': False,
    'axes.spines.right': False,
}

# Colour cycle handed to seaborn by every theme.
_THEME_PALETTE = [
    "#c7522a", "#fbf2c4", "#008585", "#003f5c",
    "#58508d", "#ffa600", "#660e60", "#6c584c",
]


def _apply_theme(style):
    """Activate the matplotlib style sheet *style*, then apply the shared overrides.

    The style sheet is applied first so that the shared rcParams and the
    seaborn palette take precedence over whatever the sheet defines.
    """
    plt.style.use(style)
    plt.rcParams.update(_THEME_RC_PARAMS)
    sns.set_palette(_THEME_PALETTE)


def theme1():
    """Apply the 'seaborn-v0_8-whitegrid' style with the shared overrides and palette."""
    _apply_theme('seaborn-v0_8-whitegrid')


def theme2():
    """Apply the 'seaborn-v0_8-dark-palette' style with the shared overrides and palette."""
    _apply_theme('seaborn-v0_8-dark-palette')


# Activate the theme used by the plots in the rest of the notebook.
theme1()



def doughnuts(data, ax=None):
    """Draw *data* as a doughnut (ring) chart and return the figure it lives on.

    Args:
        data: Object exposing ``.values`` (wedge sizes) and ``.index`` (wedge
            labels) -- presumably a pandas Series such as the result of
            ``value_counts()``; confirm against callers.
        ax: Axes to draw on. When omitted the current pyplot axes is used,
            which matches the behaviour before this parameter existed.

    Returns:
        The matplotlib figure that owns the axes the chart was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    ax.pie(x=data.values, labels=data.index, autopct='%.2f%%', shadow=True, startangle=90)
    ax.axis('equal')

    # A white disc over the centre of the pie turns it into a doughnut.
    ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))

    return ax.figure




    

In [None]:
# Load the airline training CSV into `df`. The path is absolute and specific
# to the author's machine; adjust it before running the notebook elsewhere.
df = pd.read_csv(r'/home/fever/work/Data sets/Airline/train.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the frame as a whole.
df.describe()

In [None]:

    
def null_percent(col):
    """Print ``<name>==><pct> |`` for *col* when it contains any nulls.

    *col* is a single column (it must provide ``isnull()``, ``name`` and a
    length). The percentage is rounded to two decimals; columns without
    nulls print nothing. Output stays on one line so that successive calls
    build up a single report row.
    """
    null_count = col.isnull().sum()
    percentage = round(null_count / len(col) * 100, 2)
    if percentage > 0:
        print(f'{col.name}==>{percentage} |', end=' ')


# One-line report of the columns with missing values, framed by two rulers.
print('+' * 46)
df.apply(null_percent)
print()
print('+' * 46)

# Heatmap of the null mask: one row per record, one column per feature.
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, ax=ax)


    

## Univariate Analysis

## Numerical Features

> Unnamed: 0

In [None]:
# Descriptive summary of the 'Unnamed: 0' column.
# NOTE(review): this looks like a row index written out by an earlier export
# rather than a real feature -- confirm whether it belongs in the analysis.
df['Unnamed: 0'].describe()

In [None]:

# Distribution of 'Unnamed: 0': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Unnamed: 0', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Unnamed: 0'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Unnamed: 0 Histogram')
plt.show()
        

> id

In [None]:
# Descriptive summary of the 'id' column.
# NOTE(review): presumably a record identifier rather than a feature -- confirm
# whether its distribution is meaningful here.
df['id'].describe()

In [None]:

# Distribution of 'id': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='id', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['id'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('id Histogram')
plt.show()
        

> Age

In [None]:
# Descriptive summary of the 'Age' column.
df['Age'].describe()

In [None]:

# Distribution of 'Age': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Age', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Age'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Age Histogram')
plt.show()
        

> Flight Distance

In [None]:
# Descriptive summary of the 'Flight Distance' column.
df['Flight Distance'].describe()

In [None]:

# Distribution of 'Flight Distance': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Flight Distance', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Flight Distance'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Flight Distance Histogram')
plt.show()
        

> Inflight wifi service

In [None]:
# Descriptive summary of the 'Inflight wifi service' column.
df['Inflight wifi service'].describe()

In [None]:

# Distribution of 'Inflight wifi service': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Inflight wifi service', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Inflight wifi service'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Inflight wifi service Histogram')
plt.show()
        

> Departure/Arrival time convenient

In [None]:
# Descriptive summary of the 'Departure/Arrival time convenient' column.
df['Departure/Arrival time convenient'].describe()

In [None]:

# Distribution of 'Departure/Arrival time convenient': histogram with KDE
# (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Departure/Arrival time convenient', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Departure/Arrival time convenient'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Departure/Arrival time convenient Histogram')
plt.show()
        

> Ease of Online booking

In [None]:
# Descriptive summary of the 'Ease of Online booking' column.
df['Ease of Online booking'].describe()

In [None]:

# Distribution of 'Ease of Online booking': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Ease of Online booking', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Ease of Online booking'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Ease of Online booking Histogram')
plt.show()
        

> Gate location

In [None]:
# Descriptive summary of the 'Gate location' column.
df['Gate location'].describe()

In [None]:

# Distribution of 'Gate location': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Gate location', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Gate location'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Gate location Histogram')
plt.show()
        

> Food and drink

In [None]:
# Descriptive summary of the 'Food and drink' column.
df['Food and drink'].describe()

In [None]:

# Distribution of 'Food and drink': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Food and drink', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Food and drink'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Food and drink Histogram')
plt.show()
        

> Online boarding

In [None]:
# Descriptive summary of the 'Online boarding' column.
df['Online boarding'].describe()

In [None]:

# Distribution of 'Online boarding': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Online boarding', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Online boarding'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Online boarding Histogram')
plt.show()
        

> Seat comfort

In [None]:
# Descriptive summary of the 'Seat comfort' column.
df['Seat comfort'].describe()

In [None]:

# Distribution of 'Seat comfort': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Seat comfort', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Seat comfort'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Seat comfort Histogram')
plt.show()
        

> Inflight entertainment

In [None]:
# Descriptive summary of the 'Inflight entertainment' column.
df['Inflight entertainment'].describe()

In [None]:

# Distribution of 'Inflight entertainment': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Inflight entertainment', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Inflight entertainment'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Inflight entertainment Histogram')
plt.show()
        

> On-board service

In [None]:
# Descriptive summary of the 'On-board service' column.
df['On-board service'].describe()

In [None]:

# Distribution of 'On-board service': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='On-board service', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['On-board service'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('On-board service Histogram')
plt.show()
        

> Leg room service

In [None]:
# Descriptive summary of the 'Leg room service' column.
df['Leg room service'].describe()

In [None]:

# Distribution of 'Leg room service': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Leg room service', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Leg room service'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Leg room service Histogram')
plt.show()
        

> Baggage handling

In [None]:
# Descriptive summary of the 'Baggage handling' column.
df['Baggage handling'].describe()

In [None]:

# Distribution of 'Baggage handling': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Baggage handling', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Baggage handling'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Baggage handling Histogram')
plt.show()
        

> Checkin service

In [None]:
# Descriptive summary of the 'Checkin service' column.
df['Checkin service'].describe()

In [None]:

# Distribution of 'Checkin service': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Checkin service', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Checkin service'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Checkin service Histogram')
plt.show()
        

> Inflight service

In [None]:
# Descriptive summary of the 'Inflight service' column.
df['Inflight service'].describe()

In [None]:

# Distribution of 'Inflight service': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Inflight service', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Inflight service'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Inflight service Histogram')
plt.show()
        

> Cleanliness

In [None]:
# Descriptive summary of the 'Cleanliness' column.
df['Cleanliness'].describe()

In [None]:

# Distribution of 'Cleanliness': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Cleanliness', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Cleanliness'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Cleanliness Histogram')
plt.show()
        

> Departure Delay in Minutes

In [None]:
# Descriptive summary of the 'Departure Delay in Minutes' column.
df['Departure Delay in Minutes'].describe()

In [None]:

# Distribution of 'Departure Delay in Minutes': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Departure Delay in Minutes', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Departure Delay in Minutes'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Departure Delay in Minutes Histogram')
plt.show()
        

> Arrival Delay in Minutes

In [None]:
# Descriptive summary of the 'Arrival Delay in Minutes' column.
df['Arrival Delay in Minutes'].describe()

In [None]:

# Distribution of 'Arrival Delay in Minutes': histogram with KDE (wide panel) beside a box plot.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'box']])
sns.histplot(data=df, x='Arrival Delay in Minutes', bins=30, kde=True, ax=ax['hist'])
sns.boxplot(df['Arrival Delay in Minutes'], ax=ax['box'])
# Title the histogram panel directly: plt.title() would label whichever axes
# is current, which is not guaranteed to be the histogram.
ax['hist'].set_title('Arrival Delay in Minutes Histogram')
plt.show()
        

## Categorical Features

> Gender

In [None]:
# Descriptive summary of the 'Gender' column.
df['Gender'].describe()

In [None]:

# Category counts for 'Gender': bar chart (wide panel) beside a doughnut chart.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'doughnut']])
g = df['Gender'].value_counts()
sns.barplot(x=g.index, y=g.values, ax=ax['hist'])
# doughnuts() draws on the current axes, so select the doughnut panel
# explicitly instead of relying on the order in which the panels were created.
plt.sca(ax['doughnut'])
# doughnuts() returns the Figure; it is deliberately not stored over the
# Axes entry in `ax`.
doughnuts(g)
plt.show()

        

> Customer Type

In [None]:
# Descriptive summary of the 'Customer Type' column.
df['Customer Type'].describe()

In [None]:

# Category counts for 'Customer Type': bar chart (wide panel) beside a doughnut chart.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'doughnut']])
g = df['Customer Type'].value_counts()
sns.barplot(x=g.index, y=g.values, ax=ax['hist'])
# doughnuts() draws on the current axes, so select the doughnut panel
# explicitly instead of relying on the order in which the panels were created.
plt.sca(ax['doughnut'])
# doughnuts() returns the Figure; it is deliberately not stored over the
# Axes entry in `ax`.
doughnuts(g)
plt.show()

        

> Type of Travel

In [None]:
# Descriptive summary of the 'Type of Travel' column.
df['Type of Travel'].describe()

In [None]:

# Category counts for 'Type of Travel': bar chart (wide panel) beside a doughnut chart.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'doughnut']])
g = df['Type of Travel'].value_counts()
sns.barplot(x=g.index, y=g.values, ax=ax['hist'])
# doughnuts() draws on the current axes, so select the doughnut panel
# explicitly instead of relying on the order in which the panels were created.
plt.sca(ax['doughnut'])
# doughnuts() returns the Figure; it is deliberately not stored over the
# Axes entry in `ax`.
doughnuts(g)
plt.show()

        

> Class

In [None]:
# Descriptive summary of the 'Class' column.
df['Class'].describe()

In [None]:

# Category counts for 'Class': bar chart (wide panel) beside a doughnut chart.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'doughnut']])
g = df['Class'].value_counts()
sns.barplot(x=g.index, y=g.values, ax=ax['hist'])
# doughnuts() draws on the current axes, so select the doughnut panel
# explicitly instead of relying on the order in which the panels were created.
plt.sca(ax['doughnut'])
# doughnuts() returns the Figure; it is deliberately not stored over the
# Axes entry in `ax`.
doughnuts(g)
plt.show()

        

> satisfaction

In [None]:
# Descriptive summary of the 'satisfaction' column.
df['satisfaction'].describe()

In [None]:

# Category counts for 'satisfaction': bar chart (wide panel) beside a doughnut chart.
fig, ax = plt.subplot_mosaic([['hist', 'hist', 'doughnut']])
g = df['satisfaction'].value_counts()
sns.barplot(x=g.index, y=g.values, ax=ax['hist'])
# doughnuts() draws on the current axes, so select the doughnut panel
# explicitly instead of relying on the order in which the panels were created.
plt.sca(ax['doughnut'])
# doughnuts() returns the Figure; it is deliberately not stored over the
# Axes entry in `ax`.
doughnuts(g)
plt.show()

        

## Numerical Features With Errors