In [11]:
#load all necessary libraries
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

import statsmodels.api as sm
import matplotlib.pyplot as plt

In [19]:
# Load the airfields CSV into a DataFrame and preview its first rows
airfields_csv_path = 'Resources/Airfields.csv'
airfields_df = pd.read_csv(airfields_csv_path)
airfields_df.head()

Unnamed: 0,Name,Country,Archipelago,Latitude,Longitude,Type,Runway_1,Runway_2,Surface,Class
0,Kadena Air Base,U.S.,Ryukyu Islands,26.35569,127.767875,Military,12100,12100,Asphalt/Concrete,Class_3
1,Guam Intl Airport,U.S.,Mariana Islands,13.485298,144.800812,Commercial,12015,10014,Asphalt/Concrete,Class_3
2,Andersen AFB,U.S.,Mariana Islands,13.588225,144.920208,Military,10527,11200,Asphalt/Concrete,Class_3
3,Naha Airport,Japan,Ryukyu Islands,26.206403,127.646542,Commercial,9843,8858,Asphalt,Class_3
4,Saipan Intl Airport,U.S.,Mariana Islands,15.119743,145.728279,Commercial,8700,7001,Asphalt,Class_3


In [20]:
# Bin the raw runway surface labels into four broader groups.
# The mapping is applied to the 'Surface' column only: a frame-wide replace
# would also rewrite any other cell (e.g. a Name or Archipelago entry) that
# happened to equal one of these labels exactly.
surface_bins = {
    'Asphalt': 'Paved',
    'Concrete': 'Paved',
    'Asphalt/Concrete': 'Paved',
    'Coral': 'Hard',
    'Macadam': 'Hard',
    'Turf/gravel': 'Gravel',
    'Turf': 'Grass',
}

# assign() returns a new frame with 'Surface' replaced in its original position
airfields_df = airfields_df.assign(Surface=airfields_df['Surface'].replace(surface_bins))
airfields_df.head()

Unnamed: 0,Name,Country,Archipelago,Latitude,Longitude,Type,Runway_1,Runway_2,Surface,Class
0,Kadena Air Base,U.S.,Ryukyu Islands,26.35569,127.767875,Military,12100,12100,Paved,Class_3
1,Guam Intl Airport,U.S.,Mariana Islands,13.485298,144.800812,Commercial,12015,10014,Paved,Class_3
2,Andersen AFB,U.S.,Mariana Islands,13.588225,144.920208,Military,10527,11200,Paved,Class_3
3,Naha Airport,Japan,Ryukyu Islands,26.206403,127.646542,Commercial,9843,8858,Paved,Class_3
4,Saipan Intl Airport,U.S.,Mariana Islands,15.119743,145.728279,Commercial,8700,7001,Paved,Class_3


In [21]:
# Count the rows for each distinct 'Class' value and each distinct 'Type' value
airfields_size = airfields_df['Class'].value_counts()
airfields_type = airfields_df['Type'].value_counts()
print(airfields_size, airfields_type, sep='\n')

Class_0    95
Class_1    44
Class_2    18
Class_3     7
Name: Class, dtype: int64
Unimproved    63
Air taxi      40
Military      26
Commercial    20
General       15
Name: Type, dtype: int64


In [22]:
# Keep only the modelling parameters: remove the identifier and location columns
location_cols = ['Name', 'Country', 'Archipelago', 'Latitude', 'Longitude']
airfields_params_df = airfields_df.drop(columns=location_cols)
airfields_params_df.head()

Unnamed: 0,Type,Runway_1,Runway_2,Surface,Class
0,Military,12100,12100,Paved,Class_3
1,Commercial,12015,10014,Paved,Class_3
2,Military,10527,11200,Paved,Class_3
3,Commercial,9843,8858,Paved,Class_3
4,Commercial,8700,7001,Paved,Class_3


In [23]:
# Remove the two runway columns, leaving the remaining (non-runway) parameters
runway_cols = ['Runway_1', 'Runway_2']
airfields_cat_params_df = airfields_params_df.drop(columns=runway_cols)
airfields_cat_params_df.head()

Unnamed: 0,Type,Surface,Class
0,Military,Paved,Class_3
1,Commercial,Paved,Class_3
2,Military,Paved,Class_3
3,Commercial,Paved,Class_3
4,Commercial,Paved,Class_3


In [27]:
# Collect the names of every column whose dtype is "object"
airfield_cats = [
    col_name
    for col_name, col_dtype in airfields_cat_params_df.dtypes.items()
    if col_dtype == "object"
]
airfield_cats

['Type', 'Surface', 'Class']

In [28]:
# Create the OneHotEncoder instance, configured to return a dense array
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and the old
# spelling was removed in later releases. Try the current keyword first and
# fall back to the legacy one so the notebook runs on either side of the rename.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [29]:
# Fit the encoder on the categorical columns and wrap the encoded values in a DataFrame
encoded_values = enc.fit_transform(airfields_cat_params_df[airfield_cats])
encode_df = pd.DataFrame(encoded_values)

In [30]:
# Label each encoded column with the feature name reported by the encoder
encoded_feature_names = enc.get_feature_names_out(airfield_cats)
encode_df.columns = encoded_feature_names
encode_df.head()

Unnamed: 0,Type_Air taxi,Type_Commercial,Type_General,Type_Military,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved,Class_Class_0,Class_Class_1,Class_Class_2,Class_Class_3
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [33]:
# Remove the Type, Surface and Class columns so only the remaining (runway) columns are left
non_runway_cols = ['Type', 'Surface', 'Class']
airfields_runways_df = airfields_params_df.drop(columns=non_runway_cols)
airfields_runways_df.head()

Unnamed: 0,Runway_1,Runway_2
0,12100,12100
1,12015,10014
2,10527,11200
3,9843,8858
4,8700,7001


In [37]:
# Combine the runway columns with the one-hot encoded columns.
# join() aligns the two frames on their row index (left join is its default).
airfields_newparams_df = airfields_runways_df.join(other=encode_df, how='left')
airfields_newparams_df.head()

Unnamed: 0,Runway_1,Runway_2,Type_Air taxi,Type_Commercial,Type_General,Type_Military,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved,Class_Class_0,Class_Class_1,Class_Class_2,Class_Class_3
0,12100,12100,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,12015,10014,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,10527,11200,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,9843,8858,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,8700,7001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
# Create training and test datasets.
#
# Features come from the encoded frame (runway columns plus one-hot Type and
# Surface indicators). airfields_df itself cannot be used as X: it still holds
# string columns (Name, Country, Archipelago, Surface) that LogisticRegression
# cannot fit. The one-hot 'Class_*' indicators are removed because they encode
# the target and would leak it into the features.
# NOTE(review): every level of Type and Surface is kept, so the indicator
# columns are collinear with an intercept; consider dropping one level per
# category before interpreting coefficients.
target_dummy_cols = [
    col for col in airfields_newparams_df.columns if col.startswith('Class_')
]
X = airfields_newparams_df.drop(columns=target_dummy_cols)
y = airfields_df['Class']

print(list(X.columns.values))

# Stratify on the target: the classes are heavily imbalanced (the smallest has
# only 7 rows), and an unstratified 20% split can leave a class out of the
# test set entirely.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=5, stratify=y
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fit an unpenalised multinomial logistic regression on the training split
# NOTE(review): newer scikit-learn releases spell the no-penalty option as
# penalty=None rather than the string 'none' -- confirm against the installed version.
model1 = LogisticRegression(
    penalty='none',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=0,
)
model1.fit(X_train, y_train)

# Predict the class of every row in the held-out test split
preds = model1.predict(X_test)

# Print the estimator's full parameter set (the values set above were chosen by hand, not tuned)
params = model1.get_params()
print(params)

In [None]:
# Show the fitted intercept and coefficient arrays
fitted_parts = (
    ('Intercept: \n', model1.intercept_),
    ('Coefficients: \n', model1.coef_),
)
for heading, values in fitted_parts:
    print(heading, values)

In [None]:
# Odds ratio estimates: exponentiate the fitted coefficients
# (numpy is already imported as np in the notebook's first cell)
odds_ratios = np.exp(model1.coef_)
odds_ratios

In [None]:
# Fit a multinomial logit with statsmodels to assess the individual variables

# add_constant() adds a constant (intercept) column to the training features
design_matrix = sm.add_constant(X_train)
logit_model = sm.MNLogit(y_train, design_matrix)
result = logit_model.fit()

# Keep both summary layouts and print each in turn
stats1 = result.summary()
stats2 = result.summary2()
for summary_table in (stats1, stats2):
    print(summary_table)

In [None]:
# Confusion matrix of the test split: true labels first, predictions second
confusion_matrix(y_true=y_test, y_pred=preds)

In [None]:
# Build a labelled confusion-matrix table.
# Passing `labels` explicitly fixes both the row/column order and the matrix
# size: without it, a class missing from both y_test and preds would shrink
# the matrix and the four hard-coded labels below would no longer fit.
class_labels = ['Class_0', 'Class_1', 'Class_2', 'Class_3']

# The matrix is stored in a variable called confmtrx (already a NumPy array)
confmtrx = confusion_matrix(y_test, preds, labels=class_labels)

# Rows are the true classes; columns are the predicted classes
pd.DataFrame(
    confmtrx,
    index=class_labels,
    columns=[f'predicted_{label}' for label in class_labels],
)

In [None]:
# Accuracy of the predictions on the test split
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=preds)
print('Accuracy Score:', accuracy)

In [None]:
# Build the text classification report for the test split, then print it
class_report = classification_report(y_true=y_test, y_pred=preds)
print(class_report)