In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings that
# flag code in later cells; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Apply seaborn's default plot styling to all subsequent figures.
sns.set()

In [6]:
# Load the crash records into `df`. The path is relative, so the notebook
# must run from a directory whose parent contains `input/`.
df = pd.read_csv('../input/airplane-crashes-since-1908/Airplane_Crashes_and_Fatalities_Since_1908.csv')

In [7]:
# Peek at the first two rows of the freshly loaded frame.
df.head(2)

In [8]:
# Columns that are not used anywhere in the analysis below.
unused_columns = ['Registration', 'cn/In', 'Flight #', 'Summary']
df.drop(columns=unused_columns, inplace=True)

In [9]:
# Show the first rows again to confirm which columns remain after the drop.
df.head()

In [10]:
# (rows, columns) of the frame at this point.
df.shape

In [11]:
# Number of rows that exactly repeat an earlier row across all remaining columns.
df.duplicated().sum()

In [12]:
# Per-column dtypes as inferred by read_csv.
df.dtypes

## Data Cleaning.

In [13]:
# Missing locations become the literal label 'Unknown'.
# The result is assigned back to the frame: calling an inplace method on
# `df.Location` is chained assignment, which recent pandas warns about and
# which does not modify `df` under Copy-on-Write.
df['Location'] = df['Location'].fillna('Unknown')

In [14]:
# Missing operators become the literal label 'Unknown'.
# The result is assigned back to the frame: calling an inplace method on
# `df.Operator` is chained assignment, which recent pandas warns about and
# which does not modify `df` under Copy-on-Write.
df['Operator'] = df['Operator'].fillna('Unknown')

In [15]:
# Missing routes become the literal label 'Unknown'.
# The result is assigned back to the frame: calling an inplace method on
# `df.Route` is chained assignment, which recent pandas warns about and
# which does not modify `df` under Copy-on-Write.
df['Route'] = df['Route'].fillna('Unknown')

In [16]:
# Missing aircraft types become the literal label 'Unknown'.
# The result is assigned back to the frame: calling an inplace method on
# `df.Type` is chained assignment, which recent pandas warns about and
# which does not modify `df` under Copy-on-Write.
df['Type'] = df['Type'].fillna('Unknown')

In [17]:
# Fill missing times with a '00:00' placeholder so these rows survive the
# dropna below and can still be concatenated with 'Date'.
# NOTE(review): the placeholder is indistinguishable from a real midnight time.
df['Time'] = df['Time'].fillna('00:00')

In [18]:
# Discard every row that still has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [19]:
# Remaining missing values per column.
df.isna().sum()

In [20]:
# Normalise malformed 'Time' strings with literal substitutions.
# `regex=False` is passed explicitly because the default for `regex` differs
# between pandas versions; as a regex, the '.' in '18.40' would match any
# character. Order matters: the longer 'c: ' / 'c:' prefixes must be removed
# before the bare 'c'.
time_fixes = {
    'c: ': '',
    'c:': '',
    'c': '',
    "12'20": '12:20',
    '18.40': '18:40',
    '0943': '09:43',
    "22'08": '22:08',
    '114:20': '00:00',  # not a valid clock time; mapped to the '00:00' placeholder
}
cleaned_time = df['Time']
for broken, fixed in time_fixes.items():
    cleaned_time = cleaned_time.str.replace(broken, fixed, regex=False)
df['Time'] = cleaned_time

In [21]:
# People aboard minus fatalities, i.e. the number aboard who were not killed.
# NOTE(review): the result is stored under the name 'Ground'. If the CSV
# already provides a 'Ground' column (presumably ground fatalities — confirm
# against the dataset), this assignment overwrites it.
df['Ground'] = df['Aboard'] - df['Fatalities']

In [22]:
# Join the date and time strings with a single space, e.g. '<Date> <Time>'.
df['DateTime'] = df['Date'].str.cat(df['Time'], sep=' ')

In [23]:
# Parse the combined '<Date> <Time>' strings into datetime values. No explicit
# format is given, so pandas infers it.
# NOTE(review): confirm the inferred day/month order matches the CSV.
df['DateTime'] = pd.to_datetime(df['DateTime'])

In [48]:
# Inspect the first ten rows, including the new 'DateTime' column.
df.head(10)

In [25]:
# The 50 operators with the most recorded accidents.
df['Operator'].value_counts().head(50)

In [26]:
# Number of rows per route, most frequent first.
df.Route.value_counts()

In [27]:
# Number of rows per aircraft type, most frequent first.
df.Type.value_counts()

In [28]:
# Operators whose name mentions 'Military' are tagged as the Military sector.
military_mask = df['Operator'].str.contains('Military')
df.loc[military_mask, 'Sector'] = 'Military'

In [29]:
# Non-military operators whose name contains 'Air' are tagged as Civil.
operator_names = df['Operator']
has_air = operator_names.str.contains('Air')
is_military = operator_names.str.contains('Military')
df.loc[has_air & ~is_military, 'Sector'] = 'Civil'

In [30]:
# Non-military operators whose name contains 'Aero' are tagged as Civil.
operator_names = df['Operator']
has_aero = operator_names.str.contains('Aero')
is_military = operator_names.str.contains('Military')
df.loc[has_aero & ~is_military, 'Sector'] = 'Civil'

In [31]:
# Operators whose name mentions 'Mail' are tagged as Mail Service; this
# overrides any sector assigned by an earlier rule.
mail_mask = df['Operator'].str.contains('Mail')
df.loc[mail_mask, 'Sector'] = 'Mail Service'

In [32]:
# Operators whose name mentions 'Private' are tagged as Private; this
# overrides any sector assigned by an earlier rule.
private_mask = df['Operator'].str.contains('Private')
df.loc[private_mask, 'Sector'] = 'Private'

In [33]:
# Default every operator not matched by a keyword rule to 'Civil'.
# Unmatched rows hold NaN in 'Sector' (never False), so a `== False`
# comparison selects nothing and those rows would silently fall out of any
# later groupby on 'Sector'. Fill the missing values instead.
df['Sector'] = df['Sector'].fillna('Civil')

# Basic EDA.

In [34]:
# Scatter of each accident: fatalities on the x-axis, calendar year on the y-axis.
years = df['DateTime'].dt.year

fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(df['Fatalities'], years, 'o')
ax.set_xlabel('Fatalities')
ax.set_ylabel('Years')
plt.show()

In [35]:
# Number of accidents recorded in each calendar year.
accident_year = df['DateTime'].dt.year
year_count = df.groupby(accident_year).agg({'Date': 'count'})

fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(year_count.index, year_count['Date'], marker='.')
ax.set_xlabel('Years')
ax.set_ylabel('Total Accidents')
plt.show()

## Distribution of accidents by month.

In [36]:
# Number of accidents recorded in each calendar month (all years pooled).
accident_month = df['DateTime'].dt.month
month_count = df.groupby(accident_month).agg({'Date': 'count'})

month_labels = ['Jan', 'Feb', 'March', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Points are placed by position (0..n-1), not by month number.
tick_positions = list(range(len(month_count)))

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(tick_positions, month_count['Date'], marker='.', color='orange')
ax.set_xticks(tick_positions)
ax.set_xticklabels(month_labels)
ax.set_xlabel('Month')
ax.set_ylabel('Count of Accidents')
plt.show()

## Distribution of accidents by day of month.

In [37]:
# Number of accidents recorded on each day of the month (1-31).
day_count = df.groupby(df.DateTime.dt.day)[['Date']].count()

plt.figure(figsize= (16, 8))
# x and y are passed by keyword: current seaborn releases no longer accept
# the data vectors as positional arguments.
sns.barplot(x= day_count.index, y= day_count['Date'], color ='lightskyblue')
plt.xlabel('Day')
plt.ylabel('Total Accidents');

In [38]:
# Per-weekday totals of people aboard and fatalities, plus the share of
# people aboard who were not killed. Groups come out sorted by weekday name.
df['Day'] = df['DateTime'].dt.day_name()
day = df.groupby('Day')[['Fatalities', 'Aboard']].sum()
not_killed = day['Aboard'] - day['Fatalities']
day['Survival Ratio'] = not_killed / day['Aboard']


In [39]:

# Total fatalities (left axis) and survival ratio (right axis) per weekday.
# `day` is grouped by weekday name and therefore sorted alphabetically;
# overwriting the tick labels with Sunday..Saturday would attach the wrong
# name to every point. Reorder the rows into calendar order and let the index
# supply the labels.
weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
day_ordered = day.reindex(weekday_order)

_, ax = plt.subplots(figsize=(17, 10))
ax.plot(day_ordered.index, day_ordered['Fatalities'], marker= '.')
ax2 = ax.twinx()
ax2.plot(day_ordered.index, day_ordered['Survival Ratio'], marker= '.', linestyle= '--', color= 'red')

# Label the primary axes explicitly: after twinx() the current axes is the
# twin, whose x-axis is hidden, so plt.xlabel() would not be displayed.
ax.set_xlabel('Day of Week')
ax.set_ylabel('Total Fatalities')
ax2.set_ylabel('Survival Ratio')
plt.show()

## Distribution of fatalities by sector.

In [40]:
# Total fatalities per sector, with 'Sector' kept as a regular column.
sector = df.groupby('Sector', as_index=False)[['Fatalities']].sum()

In [41]:
# Share of total fatalities attributed to each sector, as percentages.
fig, ax = plt.subplots(figsize=(16, 10))
ax.pie(
    sector['Fatalities'],
    labels=sector['Sector'],
    radius=1.2,
    shadow=True,
    autopct='%.2f',
);

## Distribution of days passed between accidents (inter-arrival times of a Poisson process).

In [42]:
# Order the accidents chronologically and renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as a stray 'index'
# column, which also keeps this cell safe to re-run.
df = df.sort_values('DateTime').reset_index(drop=True)

In [43]:
# Gap between each accident and the next one in chronological order:
# timedelta[k] == DateTime[k+1] - DateTime[k], giving len(df) - 1 entries.
# diff() works by position, so it needs no manual counter and does not depend
# on the index labels being exactly 0..n-1.
timedelta = df['DateTime'].diff().iloc[1:].tolist()

In [49]:
# First ten rows of the chronologically sorted frame.
df.head(10)

In [50]:
# Box plot with 'Fatalities' on the x-axis and 'Aboard' on the y-axis,
# coloured with the 'rainbow' palette.
sns.boxplot(x='Fatalities',y='Aboard',data=df,palette='rainbow')

In [44]:
# Whole days from each accident to the next one. The last row has no
# successor, so it is padded with 0 to match the frame's length.
days = [gap.days for gap in timedelta] + [0]
df['Time Since'] = days

In [52]:
# Count plot over the 'Fatalities' column of df.
sns.countplot(x='Fatalities',data=df)

In [45]:
# Histogram of the day gaps between consecutive accidents, zoomed to 0-85 days.
fig, ax = plt.subplots(figsize=(16, 8))
df['Time Since'].hist(bins=1600, ax=ax)
ax.set_xlim(0, 85);

In [46]:
def ecdf(data):
    """Compute the empirical cumulative distribution function of a 1-D sample.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of observations.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order.
    y : numpy.ndarray
        For each sorted observation, its rank divided by the sample size,
        i.e. the values 1/n, 2/n, ..., 1.
    """
    size = len(data)
    sorted_values = np.sort(data)
    cumulative_share = (np.arange(size) + 1) / size

    return sorted_values, cumulative_share

In [47]:
# Empirical CDF of the day gaps between consecutive accidents, zoomed to 0-200 days.
x, y = ecdf(df['Time Since'])

fig, ax = plt.subplots(figsize=(16, 8))
ax.set_xlim(0, 200)
ax.plot(x, y, marker='.', linestyle='none')
ax.margins(y=0.03)
ax.set_xlabel('Days Passed')
ax.set_ylabel('ECDF')
plt.show()