# Classification and Feature Engineering with Buzzfeed

Working from [We Trained A Computer To Search For Hidden Spy Planes. This Is What It Found.](https://www.buzzfeednews.com/article/peteraldhous/hidden-spy-planes)

For data, we'll be using `flights_data4-1529539200-cleaned.csv`.

If you'd rather skip my commentary, you can work from [this page](https://investigate.ai/buzzfeed-spy-planes/feature-engineering-buzzfeed-spy-planes/) instead.

In [1]:
!pip install pyproj
!pip install shapely

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Normal imports for data analysis
import pandas as pd 
import numpy as np

# Extra geographic bits, only used for calculating fly zone area
from shapely.geometry import Polygon
from pyproj import Transformer                                                                                                                

%matplotlib inline

In [3]:
# Load the cleaned flight observations. icao24 is forced to text,
# presumably so that IDs such as 000001 are not parsed as numbers.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index -- confirm before switching to index_col=0.
df = pd.read_csv(
    'flights_data4-1529539200-cleaned.csv',
    dtype={'icao24': 'str'},
)
df.head()

Unnamed: 0,icao24,callsign,day,firstseen,lastseen,estdepartureairport,estarrivalairport,time,altitude,latitude,longitude,heading
0,a09909,,1529539200,1529574000.0,1529574562,KPWK,IN93,1529574561,609.6,41.7021,-86.469939,88.23217
1,a09909,,1529539200,1529574000.0,1529574562,KPWK,IN93,1529574559,609.6,41.702042,-86.47214,88.253716
2,a09909,,1529539200,1529574000.0,1529574562,KPWK,IN93,1529574490,914.4,41.700657,-86.55304,88.451843
3,a09909,,1529539200,1529574000.0,1529574562,KPWK,IN93,1529574466,1219.2,41.703683,-86.587595,111.037514
4,a09909,,1529539200,1529574000.0,1529574562,KPWK,IN93,1529574465,1219.2,41.703962,-86.588681,111.037514


In [4]:
# "How often does the plane start and end at the same airport"
# Break these measurements into individual flights,
# then combine individual flights into individual planes

In [6]:
# Every plane is identified by its icao24 code;
# these are all of the rows recorded for one plane
df.loc[df['icao24'] == 'a09909']

Unnamed: 0,icao24,callsign,day,firstseen,lastseen,estdepartureairport,estarrivalairport,time,altitude,latitude,longitude,heading
0,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529574561,609.6,41.702100,-86.469939,88.232170
1,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529574559,609.6,41.702042,-86.472140,88.253716
2,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529574490,914.4,41.700657,-86.553040,88.451843
3,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529574466,1219.2,41.703683,-86.587595,111.037514
4,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529574465,1219.2,41.703962,-86.588681,111.037514
...,...,...,...,...,...,...,...,...,...,...,...,...
81,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529573603,609.6,42.153982,-87.919368,347.686951
82,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529573587,609.6,42.140076,-87.914845,345.191620
83,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529573582,609.6,42.136432,-87.913492,343.141602
84,a09909,,1529539200,1.529574e+09,1529574562,KPWK,IN93,1529573571,304.8,42.128357,-87.910038,341.104858


In [7]:
# Median of every altitude reading recorded for plane a09909
df.loc[df['icao24'] == 'a09909', 'altitude'].median()

1524.0

In [12]:
# Summary statistics (count, mean, spread, quartiles) of plane a09909's altitude readings
df.loc[df['icao24'] == 'a09909', 'altitude'].describe()

count      86.000000
mean     1601.972093
std       720.053997
min       304.800000
25%      1219.200000
50%      1524.000000
75%      1828.800000
max      3048.000000
Name: altitude, dtype: float64

In [8]:
# Median altitude for every plane: one value per icao24
df.groupby('icao24')['altitude'].median()

icao24
000001      914.4
009f55     7620.0
00b0ec     2743.2
00b1fa     1524.0
00b208     3048.0
           ...   
ef3c61      609.6
f5cb1a     1524.0
f9d82d     1828.8
fab184     4267.2
ffe3bf    11582.4
Name: altitude, Length: 13840, dtype: float64

In [11]:
# Duration of each flight.
# A flight is one (icao24, firstseen) pair; take the last `lastseen` recorded
# for it, then move firstseen back out of the index so it can be subtracted.
flight_times = (
    df.groupby(['icao24', 'firstseen'])['lastseen']
      .last()
      .reset_index(level='firstseen')
)

# lastseen - firstseen, indexed by icao24
# (presumably Unix-epoch seconds, so the difference is in seconds -- confirm)
durations = flight_times['lastseen'] - flight_times['firstseen']
durations

icao24
000001      894.0
000001    30940.0
000001    15530.0
009f55     1695.0
00b0ec    53862.0
           ...   
f9d82d     1316.0
f9d82d      897.0
f9d82d     1440.0
fab184      784.0
ffe3bf     1812.0
Length: 30929, dtype: float64

In [13]:
# Altitude cut points at the 20th, 40th, 60th and 80th percentiles
# of every altitude reading in the dataset (all planes pooled together)
quantiles = df['altitude'].quantile(q=[0.2, 0.4, 0.6, 0.8])
quantiles

0.2      914.4
0.4     1828.8
0.6     4572.0
0.8    10058.4
Name: altitude, dtype: float64

In [16]:
# Every altitude reading recorded for plane a09909
altitudes = df.loc[df['icao24'] == 'a09909', 'altitude']
altitudes

0      609.6
1      609.6
2      914.4
3     1219.2
4     1219.2
       ...  
81     609.6
82     609.6
83     609.6
84     304.8
85     304.8
Name: altitude, Length: 86, dtype: float64

In [19]:
# Bucket edges: the dataset-wide altitude quantiles computed earlier,
# padded with -inf / +inf so that every reading lands in some bucket
bin_quantiles = [-np.inf] + quantiles.tolist() + [np.inf]

# With pd.cut's default settings each bucket includes its upper edge:
# altitude1: -infinity to 914.4
# altitude2: 914.4     to 1,828.8
# altitude3: 1,828.8   to 4,572.0
# altitude4: 4,572.0   to 10,058.4
# altitude5: 10,058.4  to infinity
bin_labels = [f'altitude{n}' for n in range(1, 6)]

# Altitude readings for just this plane
altitudes = df.loc[df['icao24'] == 'a4f179', 'altitude']

# Share of this plane's altitude readings that falls in each bucket:
# labels=bin_labels names the buckets
# normalize=True returns fractions of the total instead of raw counts
# sort=False keeps the buckets in altitude1..altitude5 order
# .mul(100) turns the fractions into percentages, which are easier to read
(
    pd.cut(altitudes, bins=bin_quantiles, labels=bin_labels)
      .value_counts(normalize=True, sort=False)
      .mul(100)
)

altitude1    18.614021
altitude2    81.385979
altitude3     0.000000
altitude4     0.000000
altitude5     0.000000
Name: altitude, dtype: float64

In [21]:
# NOTE(review): this cell repeats the previous bucket computation with a
# different icao24 -- a candidate for a function taking the plane ID.

# Bucket edges: the dataset-wide altitude quantiles computed earlier,
# padded with -inf / +inf so that every reading lands in some bucket
bin_quantiles = [-np.inf] + quantiles.tolist() + [np.inf]

# With pd.cut's default settings each bucket includes its upper edge:
# altitude1: -infinity to 914.4
# altitude2: 914.4     to 1,828.8
# altitude3: 1,828.8   to 4,572.0
# altitude4: 4,572.0   to 10,058.4
# altitude5: 10,058.4  to infinity
bin_labels = [f'altitude{n}' for n in range(1, 6)]

# Altitude readings for just this plane
altitudes = df.loc[df['icao24'] == 'a09909', 'altitude']

# Share of this plane's altitude readings that falls in each bucket:
# labels=bin_labels names the buckets
# normalize=True returns fractions of the total instead of raw counts
# sort=False keeps the buckets in altitude1..altitude5 order
# .mul(100) turns the fractions into percentages, which are easier to read
(
    pd.cut(altitudes, bins=bin_quantiles, labels=bin_labels)
      .value_counts(normalize=True, sort=False)
      .mul(100)
)

altitude1     8.139535
altitude2    70.930233
altitude3    20.930233
altitude4     0.000000
altitude5     0.000000
Name: altitude, dtype: float64