#  Flights

### Load needed library

In [46]:
import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings('ignore')

### Import our Data

In [47]:
# Location of the raw flights table on Kaggle; named so it is easy to repoint.
DATA_PATH = '/kaggle/input/flights/flights.csv'
df = pd.read_csv(DATA_PATH)

# Data Overview

In [48]:
# Preview the first 100 rows of the raw table.
df.head(100)

In [49]:
# (number of rows, number of columns) of the loaded table.
df.shape

### Number of rows and columns in our dataset

In [50]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [51]:
# dtype of every column.
df.dtypes

### know the type of every column

In [52]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

### Descriptive stats for our data

In [53]:
# Number of missing values in each column.
df.isna().sum()

### Check if there is missing data

In [54]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

### We do not have any duplicated rows

In [55]:
# Distinct-value count per column, shown as a one-column table named 'Count'.
df.nunique().rename('Count').to_frame()

### Number of unique values in each column

In [56]:
# Distinct carrier codes that appear in the data.
df['carrier'].unique()

### know how many carriers that we have in our dataset 

In [57]:
# Distinct years covered by the data.
df['year'].unique()

In [58]:
# Summary statistics for the day-of-month column.
df['day'].describe()

In [59]:
# Number of flights per carrier, most frequent first.
# Left as the cell's last expression so the notebook renders it directly
# instead of printing plain text.
df['carrier'].value_counts()

In [60]:
# Legend mapping each two-character carrier code to its airline name.
print(['WN: Southwest Airlines', 'AA: American Airlines', 'MQ: American Eagle Airlines', 'UA: United Airlines',
       'OO: Skywest Airlines','DL: Delta Airlines','US: US Airways',
       'EV: Atlantic Southeast Airlines','FL: AirTran Airways','YV: Mesa Airlines', 
       'B6: JetBlue Airways','9E: Pinnacle Airlines','AS: Alaska Airlines','F9: Frontier Airlines',
       'HA: Hawaiian Airlines'])

### Each airline and its number of flights

# Data cleaning

### see the missing data

In [61]:
# Per-column missing-value counts plus the percentage of rows that are filled in.
n_rows = df.shape[0]
missing_data = (
    df.isnull()
      .sum(axis=0)
      .rename_axis('variable')
      .reset_index(name='missing values')
)
missing_data['filling factor (%)'] = (n_rows - missing_data['missing values']) / n_rows * 100

# Least-complete columns first.
missing_data.sort_values('filling factor (%)').reset_index(drop=True)

### We see that the columns air_time, arr_delay, arr_time, dep_time and dep_delay have some missing data that we must handle

In [62]:
# Keep only the rows in which every column is populated.
df = df.dropna(axis=0, how='any')

### remove any row with at least one missing value

In [63]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

###  now no nulls

In [64]:
# Build a 'day-month-year' string label (e.g. '1-1-2013') for every flight.
# The three columns are converted and concatenated as whole Series, which
# avoids a Python-level join per row and is far faster on a large table than
# a row-wise apply.
cols = ["day", "month", "year"]
day_str, month_str, year_str = (df[c].astype(str) for c in cols)
df['date'] = day_str + '-' + month_str + '-' + year_str

### Join the columns day, month and year into one column that holds the date of each flight

### Setting the Date as the Index

In [65]:
# Use the date label as the row index (the 'date' column moves into the index).
df = df.set_index("date")
df.head()

### Our data is now indexed by the date (setting the index does not reorder the rows)

# Exploratory Data Analysis

In [66]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [67]:
#correlation matrix
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
plt.show()

### This heatmap gives us an intuition for the correlation between the columns of our dataset: some columns are very strongly correlated, such as arrival time and departure time, while others have very low correlation, such as arrival delay and distance.

In [68]:
# Maps a delay x to 0 (x <= 5), 1 (5 < x <= 45) or 2 (x > 45); not used in this cell.
delay_type = lambda x:((0,1)[x > 5],2)[x > 45]

# Horizontal count plot: one bar per carrier, coloured by year.
fig = plt.figure(1, figsize=(10,7))
ax = sns.countplot(y="carrier", hue='year', data=df)

# Tick labels: plain on the count axis, bold on the carrier axis.
for tick_labels, tick_weight in ((ax.get_xticklabels(), 'normal'),
                                 (ax.get_yticklabels(), 'bold')):
    plt.setp(tick_labels, fontsize=12, weight=tick_weight, rotation=0)

# The carrier codes label themselves, so hide the y-axis title.
ax.yaxis.label.set_visible(False)
plt.xlabel('Flight count', fontsize=16, weight = 'bold', labelpad=10)


### This count plot shows which airlines made the most flights in 2013
### We find that UA and B6 have the most flights in 2013, whereas OO (SkyWest) has very few flights

In [69]:
# Arrival-delay status, based on arr_delay:
#   0 = on time            (arr_delay < 10)
#   1 = slightly delayed   (10 <= arr_delay < 30)
#   2 = highly delayed     (arr_delay >= 30)
# Each assignment is vectorised over the whole frame, so it runs once; there
# is no need to loop over the columns. The ranges are written without overlap
# so the result does not depend on the order of the statements.
df.loc[df['arr_delay'] < 10, 'Status'] = 0
df.loc[(df['arr_delay'] >= 10) & (df['arr_delay'] < 30), 'Status'] = 1
df.loc[df['arr_delay'] >= 30, 'Status'] = 2


In [70]:
# Left: share of flights per Status as a pie chart. Right: the same counts as bars.
f, ax = plt.subplots(1, 2, figsize=(20, 8))
status_counts = df['Status'].value_counts()

status_counts.plot.pie(autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Status')
ax[0].set_ylabel('')

# The column is passed as x=...: recent seaborn releases take `data` as the
# first positional parameter, so a positional 'Status' would raise a TypeError.
sns.countplot(x='Status', order=status_counts.index, data=df, ax=ax[1])
ax[1].set_title('Status')
plt.show()

### In 2013, about 71% of flights arrived on time or less than 10 minutes late. 12.9% of flights had delays of at least 10 minutes but less than half an hour. On the other hand, 16.6% were delayed by half an hour or more.

In [71]:
# Keep only the delayed flights (Status 1 or 2).
delay = df[(df.Status >= 1) & (df.Status < 3)]

# Distribution of arrival delays among delayed flights.
# histplot with a KDE overlay on a density scale replaces the deprecated
# sns.distplot, which is scheduled for removal from seaborn.
sns.histplot(delay['arr_delay'], kde=True, stat='density')
plt.show()

### It can be seen that delays are mostly located on the left side of the graph: most delays are short, and very large delays are unusual.

In [72]:
# Mean arrival delay of delayed flights, per month.
# DataFrame.plot() opens its own figure unless it is given an axes, which
# left the (20, 8) figure empty and its size unused; pass the axes explicitly.
fig, ax = plt.subplots(figsize=(20, 8))
delay[['month', 'arr_delay']].groupby(['month']).mean().plot(ax=ax)
plt.show()

### Delays are concentrated in February, June and December, possibly because of the summer and winter holidays

In [73]:
# Mean arrival delay of delayed flights, per value of the 'hour' column.
# DataFrame.plot() opens its own figure unless it is given an axes, which
# left the (20, 8) figure empty and its size unused; pass the axes explicitly.
fig, ax = plt.subplots(figsize=(20, 8))
delay[['hour', 'arr_delay']].groupby(['hour']).mean().plot(ax=ax)
plt.show()

### It is clear that delays peak between 17:00 and 21:00

In [74]:
# First five rows per carrier, restricted to the hour and carrier columns.
hour_and_carrier = df[['hour', 'carrier']]
carrier_delay = hour_and_carrier.groupby('carrier').head(5)
carrier_delay

In [75]:
# Arrival delay of every flight in row order; the y-axis is clipped to
# 0-1400, so early arrivals (negative delays) are not shown.
ax = df['arr_delay'].plot(figsize=(20, 5))
ax.set_title("delays over 2013", size=24)
ax.set_ylim(0, 1400)
plt.show()

In [76]:
# Mean arrival delay of delayed flights per carrier, in a fixed carrier order.
f, ax = plt.subplots(1, figsize=(20, 8))
carrier_order = ['WN', 'AA', 'B6', 'AS', 'MQ',
                 'UA', 'OO', 'DL', 'US', 'EV', 'FL',
                 'YV', '9E', 'F9', 'HA']
# x and y are passed by keyword: recent seaborn releases take `data` as the
# first positional parameter, so positional column names raise a TypeError.
sns.barplot(x='carrier', y='arr_delay', data=delay, ax=ax, order=carrier_order)
plt.show()


### We find that the airlines 'OO: Skywest Airlines', 'YV: Mesa Airlines', '9E: Pinnacle Airlines' and 'EV: Atlantic Southeast Airlines' have the longest delays across the whole dataset. On the other hand, the airlines 'UA: United Airlines' and 'AS: Alaska Airlines' have the shortest delays of all carriers.



# Time series forecasting with  ARIMA models.


In [77]:
# Arrival delays of United Airlines (carrier code 'UA') flights only.
is_united = df['carrier'].eq('UA')
UA_delay = df.loc[is_united, 'arr_delay']


### We sliced our data to make the model more accurate, so we take UA airlines and model its delays. We chose UA because, as we saw in the visualization above, it is the airline with the largest number of flights in the year, so it is the one we are most interested in.

In [78]:
# Confirm what kind of object the slice above produced.
type(UA_delay)

In [79]:
# Wrap the UA delays in a DataFrame so later cells can select and add columns.
UA_delay = pd.DataFrame(UA_delay)
UA_delay.head()

In [80]:
# (rows, columns) of the UA delay table.
UA_delay.shape

In [81]:
# Restrict to 1-7 January 2013; index labels use the 'day-month-year' format.
first_week = [f'{day}-1-2013' for day in range(1, 8)]
UA_delay = UA_delay.loc[first_week]

### Working on the first week of UA airlines delays

### Convert the series of UA delays to a DataFrame so we can work on it.

### Splitting the Data

In [82]:
from sklearn.model_selection import train_test_split

# Chronological 70/30 split: the first 70% of observations are used for
# training and the last 30% for testing.
# shuffle=False is essential for time-series models: shuffling would scramble
# the order of the observations that ARIMA/SARIMAX rely on. random_state has
# no effect without shuffling and is kept only for interface stability.
# NOTE: `df` is rebound to the training split here because the following
# cells read the training data through that name.
df, df_test = train_test_split(UA_delay, test_size=0.3, random_state=44, shuffle=False)

In [83]:
# (rows, columns) of the training split.
df.shape

### 70% of the data will be used for training and 30% for testing

In [84]:
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm

# Seasonal ARIMA on the training delays:
# (p, d, q) = (1, 1, 1) and seasonal (P, D, Q, s) = (1, 1, 1, 12).
model = sm.tsa.statespace.SARIMAX(
    df['arr_delay'],
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
)
results = model.fit()

# Store the model's in-sample predictions next to the observed delays and plot both.
df['forecast'] = results.predict()
df[['arr_delay', 'forecast']].plot(figsize=(12, 8))

### The forecast delay vs. the actual delay

In [85]:
# Fit summary of the SARIMAX model.
results.summary()

In [86]:
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.arima_model import ARIMA

In [87]:
# For non-seasonal data
#p=1, d=1, q=0 or 1

# Non-seasonal ARIMA with (p, d, q) = (1, 1, 1) on the training delays.
arima_order = (1, 1, 1)
model2 = sm.tsa.arima.ARIMA(df['arr_delay'], order=arima_order)
model_fit2 = model2.fit()
model_fit2.summary()

In [88]:
from pandas.plotting import autocorrelation_plot

# Autocorrelation of the training delays across lags.
train_delays = df['arr_delay']
autocorrelation_plot(train_delays)
plt.show()

### Time series forecasting is really useful when we have to make decisions about the future or carry out an analysis, and we can do it quickly using ARIMA. There are many other models that can be used for time series forecasting, but ARIMA is really easy to understand.