In [52]:
#load all necessary libraries
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

import statsmodels.api as sm
import matplotlib.pyplot as plt

In [53]:
#read the dataset
airfields_df = pd.read_csv('Resources/Airfields.csv') 
airfields_df.head()

Unnamed: 0,Name,Country,Archipelago,Latitude,Longitude,Type,Military,Runway_1,Runway_2,Surface,Class
0,Rota Intl Airport,U.S.,Mariana Islands,14.172008,145.243761,Air taxi,No,7000,0,Asphalt,Class_1
1,Angaur Airfield,Palau,Caroline Islands,6.900841,134.13909,Air taxi,No,7000,0,Gravel,Class_1
2,Sanga-Sanga Airport,Philippines,Sulu Archipelago,5.045266,119.742981,Air taxi,No,6102,0,Concrete,Class_1
3,Jolo Airport,Philippines,Sulu Archipelago,6.052973,121.005534,Air taxi,No,6053,0,Asphalt,Class_1
4,Peleliu Airfield,Palau,Caroline Islands,6.998333,134.232778,Air taxi,No,6000,0,Gravel,Class_1


In [54]:
# Bin runway surface values with Replace function 

airfields_df = airfields_df.replace(['Asphalt','Concrete','Asphalt/Concrete'],'Paved')
airfields_df = airfields_df.replace(['Coral','Macadam'],'Hard')
airfields_df = airfields_df.replace(['Turf/gravel'],'Gravel')
airfields_df = airfields_df.replace(['Turf'],'Grass')
airfields_df.head()

Unnamed: 0,Name,Country,Archipelago,Latitude,Longitude,Type,Military,Runway_1,Runway_2,Surface,Class
0,Rota Intl Airport,U.S.,Mariana Islands,14.172008,145.243761,Air taxi,No,7000,0,Paved,Class_1
1,Angaur Airfield,Palau,Caroline Islands,6.900841,134.13909,Air taxi,No,7000,0,Gravel,Class_1
2,Sanga-Sanga Airport,Philippines,Sulu Archipelago,5.045266,119.742981,Air taxi,No,6102,0,Paved,Class_1
3,Jolo Airport,Philippines,Sulu Archipelago,6.052973,121.005534,Air taxi,No,6053,0,Paved,Class_1
4,Peleliu Airfield,Palau,Caroline Islands,6.998333,134.232778,Air taxi,No,6000,0,Gravel,Class_1


In [55]:
airfields_size=airfields_df['Class'].value_counts()
airfields_type=airfields_df['Type'].value_counts()
print(airfields_size)
print(airfields_type)

Class_0    95
Class_1    44
Class_2    18
Class_3     7
Name: Class, dtype: int64
Unimproved    66
Air taxi      46
Commercial    30
General       22
Name: Type, dtype: int64


In [56]:
airfields_params_df = airfields_df.drop(['Name','Country','Archipelago','Latitude','Longitude','Military'], axis=1)
airfields_params_df.head()

Unnamed: 0,Type,Runway_1,Runway_2,Surface,Class
0,Air taxi,7000,0,Paved,Class_1
1,Air taxi,7000,0,Gravel,Class_1
2,Air taxi,6102,0,Paved,Class_1
3,Air taxi,6053,0,Paved,Class_1
4,Air taxi,6000,0,Gravel,Class_1


In [57]:
airfields_cat_params_df = airfields_params_df.drop(['Runway_1','Runway_2','Class'], axis=1)
airfields_cat_params_df.head()

Unnamed: 0,Type,Surface
0,Air taxi,Paved
1,Air taxi,Gravel
2,Air taxi,Paved
3,Air taxi,Paved
4,Air taxi,Gravel


In [38]:
# Generate our categorical variable lists
airfield_cats = airfields_cat_params_df.dtypes[airfields_cat_params_df.dtypes == "object"].index.tolist()
airfield_cats

['Type', 'Surface']

In [59]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(airfields_cat_params_df[airfield_cats]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(airfield_cats)
encode_df.head()

Unnamed: 0,Type_Air taxi,Type_Commercial,Type_General,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [60]:
# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(airfields_cat_params_df[airfield_cats]))

In [61]:
# Rename encoded columns
encode_df.columns = enc.get_feature_names_out(airfield_cats)
encode_df.head()

Unnamed: 0,Type_Air taxi,Type_Commercial,Type_General,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [63]:
# Separate the Runway data
airfields_runways_df = airfields_params_df.drop(['Type','Surface','Class'], axis=1)
airfields_runways_df.head()

Unnamed: 0,Runway_1,Runway_2
0,7000,0
1,7000,0
2,6102,0
3,6053,0
4,6000,0


In [71]:
# Normalize the Runway data
airfields_runways_norm_df = (airfields_runways_df-airfields_runways_df.min())/(airfields_runways_df.max()-airfields_runways_df.min())
airfields_runways_norm_df.head()

Unnamed: 0,Runway_1,Runway_2
0,0.536364,0.0
1,0.536364,0.0
2,0.454727,0.0
3,0.450273,0.0
4,0.445455,0.0


In [68]:
airfields_class_df = airfields_params_df.drop(['Runway_1','Runway_2','Surface','Type'], axis=1)
airfields_class_df.head()

Unnamed: 0,Class
0,Class_1
1,Class_1
2,Class_1
3,Class_1
4,Class_1


In [72]:
# Merge the Class and normalized Runway data
airfields_params2_df = airfields_class_df.join(airfields_runways_norm_df)
airfields_params2_df.head()

Unnamed: 0,Class,Runway_1,Runway_2
0,Class_1,0.536364,0.0
1,Class_1,0.536364,0.0
2,Class_1,0.454727,0.0
3,Class_1,0.450273,0.0
4,Class_1,0.445455,0.0


In [73]:
# Merge the Class, Runway, and encoded data
airfields_newparams_df = airfields_params2_df.join(encode_df)
airfields_newparams_df.head()

Unnamed: 0,Class,Runway_1,Runway_2,Type_Air taxi,Type_Commercial,Type_General,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved
0,Class_1,0.536364,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Class_1,0.536364,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Class_1,0.454727,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Class_1,0.450273,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Class_1,0.445455,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [74]:
#Create training and test datasets

X = airfields_newparams_df.drop(['Class'], axis=1) 
y = airfields_newparams_df['Class']

print(list(X.columns.values)) 

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.20, random_state = 5)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

['Runway_1', 'Runway_2', 'Type_Air taxi', 'Type_Commercial', 'Type_General ', 'Type_Unimproved', 'Surface_Grass', 'Surface_Gravel', 'Surface_Hard', 'Surface_Paved']
(131, 10)
(33, 10)
(131,)
(33,)


In [75]:
model1 = LogisticRegression(random_state=0, multi_class='multinomial', penalty='none', solver='newton-cg').fit(X_train_scaled, y_train)
preds = model1.predict(X_test_scaled)

#print the tunable parameters (They were not tuned in this example, everything kept as default)
params = model1.get_params()
print(params)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 0, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [76]:
#Print model parameters
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)

Intercept: 
 [ 4.05241432  2.9727539   1.54156551 -8.56673373]
Coefficients: 
 [[-2.02150018e-01  1.54314158e+00 -1.65333003e+00 -1.02164978e+00
  -6.22614988e-01  1.58668388e+00  1.22227511e+00 -1.54493347e+00
   1.13465448e+00 -2.52150100e-01  1.02135617e+00]
 [ 1.63415688e+00  1.98213233e+00 -1.12719593e+00 -1.72742533e+00
   2.53970085e-03  1.65683376e+00  8.05190951e-01 -1.00562263e+00
   5.82732353e-01 -2.98617964e-01  8.18930309e-01]
 [ 1.72388724e-01 -2.67698892e+00 -1.41934241e+00 -1.42013140e+00
  -3.12201121e-01  1.86934179e+00  8.76665746e-01 -1.60498564e+00
   1.02980703e+00 -3.04187303e-01  1.15218924e+00]
 [-1.60439558e+00 -8.48284988e-01  4.19986838e+00  4.16920652e+00
   9.32276409e-01 -5.11285944e+00 -2.90413181e+00  4.15554174e+00
  -2.74719386e+00  8.54955368e-01 -2.99247572e+00]]


In [77]:
#Calculate odds ratio estimates
import numpy as np
np.exp(model1.coef_)

array([[8.16972358e-01, 4.67926748e+00, 1.91411441e-01, 3.60000528e-01,
        5.36539557e-01, 4.88751446e+00, 3.39490274e+00, 2.13326063e-01,
        3.11009876e+00, 7.77128082e-01, 2.77695824e+00],
       [5.12513506e+00, 7.25820337e+00, 3.23940334e-01, 1.77741446e-01,
        1.00254293e+00, 5.24268495e+00, 2.23712364e+00, 3.65816797e-01,
        1.79092519e+00, 7.41842766e-01, 2.26807240e+00],
       [1.18813960e+00, 6.87699146e-02, 2.41873017e-01, 2.41682257e-01,
        7.31834326e-01, 6.48402716e+00, 2.40287454e+00, 2.00892439e-01,
        2.80052535e+00, 7.37722676e-01, 3.16511453e+00],
       [2.01011012e-01, 4.28148583e-01, 6.66775541e+01, 6.46641220e+01,
        2.54028533e+00, 6.01884777e-03, 5.47963436e-02, 6.37865109e+01,
        6.41075036e-02, 2.35126944e+00, 5.01630930e-02]])

In [78]:
#Use statsmodels to assess variables

logit_model=sm.MNLogit(y_train,sm.add_constant(X_train_scaled))
logit_model
result=logit_model.fit()
stats1=result.summary()
stats2=result.summary2()
print(stats1)
print(stats2)

Optimization terminated successfully.
         Current function value: nan
         Iterations 23
                          MNLogit Regression Results                          
Dep. Variable:                  Class   No. Observations:                  131
Model:                        MNLogit   Df Residuals:                      101
Method:                           MLE   Df Model:                           27
Date:                Tue, 13 Sep 2022   Pseudo R-squ.:                     nan
Time:                        23:11:08   Log-Likelihood:                    nan
converged:                       True   LL-Null:                       -135.93
Covariance Type:            nonrobust   LLR p-value:                       nan
Class=Class_1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const                nan        nan        nan        nan         nan         nan
x1                   nan

  eXB = np.column_stack((np.ones(len(X)), np.exp(X)))
  return eXB/eXB.sum(1)[:,None]


In [None]:
#Create a confusion matrix
#y_test as first argument and the preds as second argument 
confusion_matrix(y_test, preds)

In [None]:
#transform confusion matrix into array
#the matrix is stored in a vaiable called confmtrx
confmtrx = np.array(confusion_matrix(y_test, preds))
#Create DataFrame from confmtrx array 
#rows for test: Male, Female, Infant designation as index 
#columns for preds: male, predicted_female, predicted_infant as column

pd.DataFrame(confmtrx, index=['Class_0','Class_1','Class_2','Class_3'],
columns=['predicted_Class_0','predicted_Class_1','predicted_Class_2','predicted_Class_3'])

In [None]:
#Accuracy statistics

print('Accuracy Score:', metrics.accuracy_score(y_test, preds))  

In [None]:
#Create classification report
class_report=classification_report(y_test, preds)
print(class_report)