In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
get_ipython().run_line_magic('matplotlib', 'inline')

In [2]:
file_name='airports.csv'
df=pd.read_csv(file_name)

In [3]:
df.head() #display the first 5 columns of the dataframe

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,6523,00A,heliport,Total Rf Heliport,40.070801,-74.933601,11.0,,US,US-PA,Bensalem,no,00A,,00A,,,
1,323361,00AA,small_airport,Aero B Ranch Airport,38.704022,-101.473911,3435.0,,US,US-KS,Leoti,no,00AA,,00AA,,,
2,6524,00AK,small_airport,Lowell Field,59.947733,-151.692524,450.0,,US,US-AK,Anchor Point,no,00AK,,00AK,,,
3,6525,00AL,small_airport,Epps Airpark,34.864799,-86.770302,820.0,,US,US-AL,Harvest,no,00AL,,00AL,,,
4,6526,00AR,closed,Newport Hospital & Clinic Heliport,35.6087,-91.254898,237.0,,US,US-AR,Newport,no,,,,,,00AR


In [4]:
df.dtypes
#Display the data types of each column using the attribute dtype

id                     int64
ident                 object
type                  object
name                  object
latitude_deg         float64
longitude_deg        float64
elevation_ft         float64
continent             object
iso_country           object
iso_region            object
municipality          object
scheduled_service     object
gps_code              object
iata_code             object
local_code            object
home_link             object
wikipedia_link        object
keywords              object
dtype: object

In [5]:


df.describe()
#Obtain a statistical summary of the dataframe

Unnamed: 0,id,latitude_deg,longitude_deg,elevation_ft
count,68947.0,68947.0,68947.0,55871.0
mean,135599.402469,25.97645,-31.077357,1287.082422
std,149239.463407,26.293894,84.607771,1648.52632
min,2.0,-90.0,-179.876999,-1266.0
25%,17373.5,12.509784,-94.178001,207.0
50%,37104.0,35.397301,-71.0075,729.0
75%,324260.5,42.917049,19.420111,1585.0
max,349799.0,82.75,179.9757,22000.0


In [6]:
print("number of NaN values for the column description :", df['elevation_ft'].isnull().sum())
#see the missing values for columns where there are blanks


number of NaN values for the column description : 13076


In [7]:
mean=df['elevation_ft'].mean()
df['elevation_ft'].replace(np.nan,mean, inplace=True)
#replacing the missing values of the column 'elevation_ft' with the mean of the column 'elevation_ft'  using the method replace().

In [8]:
df = df.loc[df["type"] != 'closed']
df.shape


(60717, 18)

In [9]:
(df['type']== 'small_airport')

0        False
1         True
2         True
3         True
5         True
         ...  
68941    False
68942    False
68943     True
68944     True
68946     True
Name: type, Length: 60717, dtype: bool

In [10]:
airport_type_index1 = (df['type'] == 'small_airport')
small_airport_data = df[airport_type_index1]
small_airport_data['elevation_ft'].value_counts()[:15]

1287.082422    5080
10.000000       209
20.000000       185
1000.000000     139
30.000000       133
800.000000      122
50.000000       110
16.000000       107
200.000000      105
33.000000       103
820.000000      103
1050.000000      98
700.000000       97
750.000000       96
600.000000       94
400.000000       93
66.000000        93
100.000000       91
250.000000       91
13.000000        90
1100.000000      87
7.000000         86
3.000000         85
850.000000       85
15.000000        83
Name: elevation_ft, dtype: int64

In [11]:
airport_type_index2 = (df['type'] == 'medium_airport')
medium_airport_data = df[airport_type_index2]
airport_type_index3 = (df['type'] == 'large_airport')
large_airport_data = df[airport_type_index3]
large_airport_data['elevation_ft'].value_counts()[:15]

13.0    12
10.0     8
30.0     7
9.0      6
16.0     6
20.0     6
15.0     6
Name: elevation_ft, dtype: int64

In [12]:
df['type'].value_counts()

small_airport     37606
heliport          16799
medium_airport     4556
seaplane_base      1096
large_airport       624
balloonport          36
Name: type, dtype: int64

In [13]:
df['small_airport'] = 'overwritten' # the value for all rows
df['medium_airport'] = 'overwritten' # the value for all rows
df['large_airport'] = 'overwritten' # the value for all rows

In [14]:
airport_type = ['small_airport','medium_airport','large_airport']
def filter_airport_type(type):
    if type in airport_type:
        return 'small_airport'
    elif type == 'medium_airport':
        return 'medium_airport'
    else:
        return 'large_airport'
df['type'].apply(filter_airport_type)

0        large_airport
1        small_airport
2        small_airport
3        small_airport
5        small_airport
             ...      
68941    small_airport
68942    large_airport
68943    small_airport
68944    small_airport
68946    small_airport
Name: type, Length: 60717, dtype: object

In [15]:
airport_type1 = ['small_airport']
airport_type2 = ['medium_airport']
airport_type3 = ['large_airport']
def filter_airport_type1(type):
    if type in airport_type1:
        return 'small_airport'
df['type'].apply(filter_airport_type1)
df['airport_type_small'] = df['type'].apply(filter_airport_type1)
def filter_airport_type2(type):
    if type in airport_type2:
        return 'medium_airport'
df['type'].apply(filter_airport_type2)
df['airport_type_medium'] = df['type'].apply(filter_airport_type2)
def filter_airport_type3(type):
    if type in airport_type3:
        return 'large_airport'
df['type'].apply(filter_airport_type3)
df['airport_type_large'] = df['type'].apply(filter_airport_type3)
df[['type','airport_type_small','airport_type_medium','airport_type_large']][350:373] # rows with different values to make sure it worked

Unnamed: 0,type,airport_type_small,airport_type_medium,airport_type_large
392,heliport,,,
393,small_airport,small_airport,,
394,heliport,,,
395,seaplane_base,,,
396,heliport,,,
397,heliport,,,
398,small_airport,small_airport,,
399,small_airport,small_airport,,
400,small_airport,small_airport,,
401,heliport,,,


In [16]:
file_name='airport-frequencies.csv'
dfAF=pd.read_csv(file_name)

In [17]:
df1 = pd.DataFrame({'key': df['id'],
                   'A': df['airport_type_large']})
df1


Unnamed: 0,key,A
0,6523,
1,323361,
2,6524,
3,6525,
5,322127,
...,...,...
68941,32753,
68942,46378,
68943,307326,
68944,346788,


In [18]:
df2 = pd.DataFrame({'key': dfAF['airport_ref'],
                   'B': dfAF['frequency_mhz']})
df2 = df2.loc[df2["B"] >100]
df2['B'].mean()
df2['B'].median()
df2['B'].mode(dropna=True)

0    122.8
Name: B, dtype: float64

In [19]:
df3 = pd.DataFrame({'key': df['airport_type_large'],
                   'B': dfAF['frequency_mhz']})
df3 = df3.loc[df3["key"] == 'large_airport']
df3['B'].mean()
df3['B'].median()
df3['B'].mode(dropna=True)

0    118.1
1    125.9
Name: B, dtype: float64

In [20]:
#cannot apply the describe method to the join, to obtain the statistical details required

In [21]:
df3.join(df3, lsuffix='_caller', rsuffix='_other') 

Unnamed: 0,key_caller,B_caller,key_other,B_other
10907,large_airport,128.10,large_airport,128.10
12490,large_airport,122.55,large_airport,122.55
13026,large_airport,122.95,large_airport,122.95
13073,large_airport,122.80,large_airport,122.80
17304,large_airport,123.05,large_airport,123.05
...,...,...,...,...
68836,large_airport,,large_airport,
68909,large_airport,,large_airport,
68920,large_airport,,large_airport,
68935,large_airport,,large_airport,


In [22]:
df4 = pd.DataFrame({'key': df['type'],
                   'B': dfAF['frequency_mhz']})
df5 = df4.loc[df4["key"] == 'small_airport']
df6 = pd.DataFrame({'key': df['iso_country'],
                   'B': df['type']})
df7 = pd.concat(df5, df6)
sns.relplot(x=df7['iso_country'], y=df7['frequency_mhz'], data=df7)


  df7 = pd.concat(df5, df6)


TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"