In [1]:
# Ryan Cummings, CMPE 255: Preliminary Analysis, Nov 2 2021.

### Libraries:

In [2]:
import pandas as pd

# Loading in the Data

In [32]:
# Load the pipe-delimited aviation export into a DataFrame.
# NOTE: the header fields are padded with spaces around the "|" separators,
# so the resulting column labels keep that padding (e.g. ' Make ').
data = pd.read_csv(filepath_or_buffer="AviationData.txt", delimiter="|")

In [34]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Event Id,Investigation Type,Accident Number,Event Date,Location,Country,Latitude,Longitude,Airport Code,Airport Name,...,Air Carrier,Total Fatal Injuries,Total Serious Injuries,Total Minor Injuries,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date,Unnamed: 21
0,20131208X92103,Accident,ERA14FA068,12/08/2013,"Jacksonville, FL",United States,30.319723,-81.514167,CRG,Jacksonville Executive Airport,...,,3.0,,,,IMC,,Preliminary,12/13/2013,
1,20131206X95526,Accident,ERA14FA066,12/06/2013,"Sebring, FL",United States,27.463333,-81.366667,SEF,Sebring Regional Airport,...,,1.0,,,,,,Preliminary,12/13/2013,
2,20131205X60841,Accident,ANC14CA010,12/04/2013,"Soldotna, AK",United States,,,,,...,,,,,,,,Preliminary,,
3,20131204X65412,Accident,ERA14CA062,12/03/2013,"Hendersonville, NC",United States,,,,,...,,,,,,,,Preliminary,,
4,20131202X34203,Accident,ERA14LA059,12/02/2013,"Trenton, SC",United States,33.736944,-81.818611,6J6,Edgefield County Airport,...,,,,1.0,,,,Preliminary,12/13/2013,


In [35]:
# Show the exact column labels (including any surrounding whitespace) so that
# later cells can reference them verbatim.
data.columns

Index(['Event Id ', ' Investigation Type ', ' Accident Number ',
       ' Event Date ', ' Location ', ' Country ', ' Latitude ', ' Longitude ',
       ' Airport Code ', ' Airport Name ', ' Injury Severity ',
       ' Aircraft Damage ', ' Aircraft Category ', ' Registration Number ',
       ' Make ', ' Model ', ' Amateur Built ', ' Number of Engines ',
       ' Engine Type ', ' FAR Description ', ' Schedule ',
       ' Purpose of Flight ', ' Air Carrier ', ' Total Fatal Injuries ',
       ' Total Serious Injuries ', ' Total Minor Injuries ',
       ' Total Uninjured ', ' Weather Condition ', ' Broad Phase of Flight ',
       ' Report Status ', ' Publication Date ', ' '],
      dtype='object')

### Right off the bat, we can look at the columns and infer which ones can be dropped as unnecessary. From looking at the data, some columns that we can drop are: Event Id, Accident Number, Registration Number, and FAR Description.

In [36]:
# Remove the columns judged unnecessary for the analysis.
# The labels must match the raw header exactly, padding included: 'Event Id '
# is the first field and has no leading space, the others are padded on both
# sides.
columns_to_drop = [
    'Event Id ',
    ' Accident Number ',
    ' Registration Number ',
    ' FAR Description ',
]
# Reassign instead of mutating in place, and tolerate labels that are already
# gone, so that re-running this cell is a no-op rather than a KeyError.
# Trade-off: a misspelled label is skipped silently, so keep this list in sync
# with the column listing shown by `data.columns`.
data = data.drop(columns=columns_to_drop, errors="ignore")

In [37]:
# Preview the first five rows again to confirm the selected columns are gone.
data.head(n=5)

Unnamed: 0,Investigation Type,Event Date,Location,Country,Latitude,Longitude,Airport Code,Airport Name,Injury Severity,Aircraft Damage,...,Air Carrier,Total Fatal Injuries,Total Serious Injuries,Total Minor Injuries,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date,Unnamed: 21
0,Accident,12/08/2013,"Jacksonville, FL",United States,30.319723,-81.514167,CRG,Jacksonville Executive Airport,Fatal(3),Substantial,...,,3.0,,,,IMC,,Preliminary,12/13/2013,
1,Accident,12/06/2013,"Sebring, FL",United States,27.463333,-81.366667,SEF,Sebring Regional Airport,Fatal(1),Destroyed,...,,1.0,,,,,,Preliminary,12/13/2013,
2,Accident,12/04/2013,"Soldotna, AK",United States,,,,,,,...,,,,,,,,Preliminary,,
3,Accident,12/03/2013,"Hendersonville, NC",United States,,,,,,,...,,,,,,,,Preliminary,,
4,Accident,12/02/2013,"Trenton, SC",United States,33.736944,-81.818611,6J6,Edgefield County Airport,Non-Fatal,Substantial,...,,,,1.0,,,,Preliminary,12/13/2013,


### Reasoning for Dropping:
### - Event Id -> Dropping because it is a unique identifier for the instance (accident/incident). It would not be helpful for any type of analysis.
### - Accident Number -> Similar to above, I removed it because it was a unique identifier for the accident and it would not be helpful for analysis. 
### - Registration Number -> This was the registration number associated with the aircraft. Again, not useful since it is a unique identifier.
### - FAR Description -> TODO: document the reason for dropping this column.

# Data Preprocessing

The next thing that I want to do is get the data prepped for use in possible algorithms such as Regression, Decision Trees, and Clustering. Feature selection and preparation are important because key insights can be missed when the data has not been cleaned. For example, the first thing that I want to do is separate the Event Date column into 3 columns: Month, Day, and Year. I feel that this is important because we are looking at aircraft accidents, and there may be more accidents during the winter. Extracting the month is therefore important: if we are going to build models, this feature may be a key influencer.

Another feature that I want to extract more information from is the Injury Severity column. This column describes the type of injury that occurred; it has the values Fatal, Non-Fatal, Incident, and Unavailable, where Fatal entries also carry a number in parentheses (e.g. Fatal(3)) and some entries are blank.

In [42]:
# List the distinct raw values of the injury-severity column; the label keeps
# the padding spaces from the raw header.
data.loc[:, ' Injury Severity '].unique()

array([' Fatal(3) ', ' Fatal(1) ', '  ', ' Non-Fatal ', ' Unavailable ',
       ' Fatal(4) ', ' Incident ', ' Fatal(2) ', ' Fatal(5) ',
       ' Fatal(14) ', ' Fatal(7) ', ' Fatal(10) ', ' Fatal(6) ',
       ' Fatal(9) ', ' Fatal(8) ', ' Fatal(19) ', ' Fatal(153) ',
       ' Fatal(127) ', ' Fatal(28) ', ' Fatal(11) ', ' Fatal(77) ',
       ' Fatal(12) ', ' Fatal(21) ', ' Fatal(42) ', ' Fatal(157) ',
       ' Fatal(158) ', ' Fatal(103) ', ' Fatal(89) ', ' Fatal(90) ',
       ' Fatal(152) ', ' Fatal(228) ', ' Fatal(17) ', ' Fatal(13) ',
       ' Fatal(50) ', ' Fatal(24) ', ' Fatal(88) ', ' Fatal(65) ',
       ' Fatal(154) ', ' Fatal(30) ', ' Fatal(20) ', ' Fatal(40) ',
       ' Fatal(57) ', ' Fatal(199) ', ' Fatal(114) ', ' Fatal(23) ',
       ' Fatal(102) ', ' Fatal(96) ', ' Fatal(49) ', ' Fatal(124) ',
       ' Fatal(107) ', ' Fatal(117) ', ' Fatal(145) ', ' Fatal(45) ',
       ' Fatal(160) ', ' Fatal(121) ', ' Fatal(16) ', ' Fatal(15) ',
       ' Fatal(104) ', ' Fatal(25) ', ' Fatal(5