In [32]:
#load all necessary libraries
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

import statsmodels.api as sm
import matplotlib.pyplot as plt

In [33]:
#read the dataset
airfields_df = pd.read_csv('Resources/Airfields.csv') 
airfields_df.head()

Unnamed: 0,Name,Country,Archipelago,Latitude,Longitude,Type,Runway_1,Runway_2,Surface,Class
0,Kadena Air Base,U.S.,Ryukyu Islands,26.35569,127.767875,Military,12100,12100,Asphalt/Concrete,Class_3
1,Guam Intl Airport,U.S.,Mariana Islands,13.485298,144.800812,Commercial,12015,10014,Asphalt/Concrete,Class_3
2,Andersen AFB,U.S.,Mariana Islands,13.588225,144.920208,Military,10527,11200,Asphalt/Concrete,Class_3
3,Naha Airport,Japan,Ryukyu Islands,26.206403,127.646542,Commercial,9843,8858,Asphalt,Class_3
4,Saipan Intl Airport,U.S.,Mariana Islands,15.119743,145.728279,Commercial,8700,7001,Asphalt,Class_3


In [34]:
# Bin runway surface values with Replace function 

airfields_df = airfields_df.replace(['Asphalt','Concrete','Asphalt/Concrete'],'Paved')
airfields_df = airfields_df.replace(['Coral','Macadam'],'Hard')
airfields_df = airfields_df.replace(['Turf/gravel'],'Gravel')
airfields_df = airfields_df.replace(['Turf'],'Grass')
airfields_df.head()

Unnamed: 0,Name,Country,Archipelago,Latitude,Longitude,Type,Runway_1,Runway_2,Surface,Class
0,Kadena Air Base,U.S.,Ryukyu Islands,26.35569,127.767875,Military,12100,12100,Paved,Class_3
1,Guam Intl Airport,U.S.,Mariana Islands,13.485298,144.800812,Commercial,12015,10014,Paved,Class_3
2,Andersen AFB,U.S.,Mariana Islands,13.588225,144.920208,Military,10527,11200,Paved,Class_3
3,Naha Airport,Japan,Ryukyu Islands,26.206403,127.646542,Commercial,9843,8858,Paved,Class_3
4,Saipan Intl Airport,U.S.,Mariana Islands,15.119743,145.728279,Commercial,8700,7001,Paved,Class_3


In [35]:
airfields_size=airfields_df['Class'].value_counts()
airfields_type=airfields_df['Type'].value_counts()
print(airfields_size)
print(airfields_type)

Class_0    95
Class_1    44
Class_2    18
Class_3     7
Name: Class, dtype: int64
Unimproved    63
Air taxi      40
Military      26
Commercial    20
General       15
Name: Type, dtype: int64


In [36]:
airfields_params_df = airfields_df.drop(['Name','Country','Archipelago','Latitude','Longitude'], axis=1)
airfields_params_df.head()

Unnamed: 0,Type,Runway_1,Runway_2,Surface,Class
0,Military,12100,12100,Paved,Class_3
1,Commercial,12015,10014,Paved,Class_3
2,Military,10527,11200,Paved,Class_3
3,Commercial,9843,8858,Paved,Class_3
4,Commercial,8700,7001,Paved,Class_3


In [37]:
airfields_cat_params_df = airfields_params_df.drop(['Runway_1','Runway_2','Class'], axis=1)
airfields_cat_params_df.head()

Unnamed: 0,Type,Surface
0,Military,Paved
1,Commercial,Paved
2,Military,Paved
3,Commercial,Paved
4,Commercial,Paved


In [38]:
# Generate our categorical variable lists
airfield_cats = airfields_cat_params_df.dtypes[airfields_cat_params_df.dtypes == "object"].index.tolist()
airfield_cats

['Type', 'Surface']

In [39]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

In [40]:
# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(airfields_cat_params_df[airfield_cats]))

In [41]:
# Rename encoded columns
encode_df.columns = enc.get_feature_names_out(airfield_cats)
encode_df.head()

Unnamed: 0,Type_Air taxi,Type_Commercial,Type_General,Type_Military,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [42]:
airfields_runways_df = airfields_params_df.drop(['Type','Surface'], axis=1)
airfields_runways_df.head()

Unnamed: 0,Runway_1,Runway_2,Class
0,12100,12100,Class_3
1,12015,10014,Class_3
2,10527,11200,Class_3
3,9843,8858,Class_3
4,8700,7001,Class_3


In [43]:
# Merge the two DataFrames together and drop the original categorical variables
airfields_newparams_df = airfields_runways_df.join(encode_df)
airfields_newparams_df.head()

Unnamed: 0,Runway_1,Runway_2,Class,Type_Air taxi,Type_Commercial,Type_General,Type_Military,Type_Unimproved,Surface_Grass,Surface_Gravel,Surface_Hard,Surface_Paved
0,12100,12100,Class_3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,12015,10014,Class_3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,10527,11200,Class_3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,9843,8858,Class_3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,8700,7001,Class_3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [44]:
#Create training and test datasets

X = airfields_newparams_df.drop(['Class'], axis=1) 
y = airfields_newparams_df['Class']

print(list(X.columns.values)) 

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.20, random_state = 5)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

['Runway_1', 'Runway_2', 'Type_Air taxi', 'Type_Commercial', 'Type_General ', 'Type_Military', 'Type_Unimproved', 'Surface_Grass', 'Surface_Gravel', 'Surface_Hard', 'Surface_Paved']
(131, 11)
(33, 11)
(131,)
(33,)


In [45]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [47]:
model1 = LogisticRegression(random_state=0, multi_class='multinomial', penalty='none', solver='newton-cg').fit(X_train_scaled, y_train)
preds = model1.predict(X_test_scaled)

#print the tunable parameters (They were not tuned in this example, everything kept as default)
params = model1.get_params()
print(params)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 0, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [48]:
#Print model parameters
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)

Intercept: 
 [ 12.04025125  14.24975218 -19.75129916  -6.53870427]
Coefficients: 
 [[-1.85655568e+01 -7.36969321e+00  6.12361737e-01 -4.61931032e+00
   4.49398643e-01 -3.04728365e-01  2.46081568e+00 -2.83918808e+00
   2.14090337e+00  2.10982437e-01  1.41429656e+00]
 [-1.00557421e+01 -8.47741583e+00 -1.20096228e-01  2.02487651e+00
   1.19822511e-01 -1.00630883e+00 -5.04538462e-01 -2.20384946e+00
   1.84757860e+00 -9.68294326e-02  1.18804530e+00]
 [ 2.45221049e+01  2.78575736e+00 -1.65188091e+00  5.23910983e-01
   6.31352053e+00  9.63634537e-03 -2.66124570e+00  5.05984191e+00
  -4.42305755e+00 -2.01390968e+00 -1.19485758e+00]
 [ 4.09919398e+00  1.30613517e+01  1.15961540e+00  2.07052283e+00
  -6.88274169e+00  1.30140085e+00  7.04968482e-01 -1.68043704e-02
   4.34575587e-01  1.89975668e+00 -1.40748428e+00]]


In [49]:
#Calculate odds ratio estimates
import numpy as np
np.exp(model1.coef_)

array([[8.65129528e-09, 6.30061447e-04, 1.84478315e+00, 9.85959366e-03,
        1.56736935e+00, 7.37323630e-01, 1.17143628e+01, 5.84731224e-02,
        8.50711923e+00, 1.23489067e+00, 4.11359178e+00],
       [4.29384842e-05, 2.08115815e-04, 8.86835094e-01, 7.57517541e+00,
        1.12729675e+00, 3.65565860e-01, 6.03784180e-01, 1.10377446e-01,
        6.34443846e+00, 9.07710819e-01, 3.28066223e+00],
       [4.46493182e+10, 1.62120915e+01, 1.91689019e-01, 1.68861891e+00,
        5.51984814e+02, 1.00968292e+00, 6.98611414e-02, 1.57565604e+02,
        1.19974932e-02, 1.33465844e-01, 3.02747067e-01],
       [6.02916715e+01, 4.70406119e+05, 3.18870667e+00, 7.92896755e+00,
        1.02532906e-03, 3.67444039e+00, 2.02378290e+00, 9.83336035e-01,
        1.54430750e+00, 6.68426781e+00, 2.44758253e-01]])

In [51]:
#Use statsmodels to assess variables

logit_model=sm.MNLogit(y_train,sm.add_constant(X_train_scaled))
logit_model
result=logit_model.fit()
stats1=result.summary()
stats2=result.summary2()
print(stats1)
print(stats2)

Optimization terminated successfully.
         Current function value: nan
         Iterations 25
                          MNLogit Regression Results                          
Dep. Variable:                  Class   No. Observations:                  131
Model:                        MNLogit   Df Residuals:                      101
Method:                           MLE   Df Model:                           27
Date:                Tue, 13 Sep 2022   Pseudo R-squ.:                     nan
Time:                        20:25:49   Log-Likelihood:                    nan
converged:                       True   LL-Null:                       -138.25
Covariance Type:            nonrobust   LLR p-value:                       nan
Class=Class_1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const                nan        nan        nan        nan         nan         nan
x1                   nan

  eXB = np.column_stack((np.ones(len(X)), np.exp(X)))
  return eXB/eXB.sum(1)[:,None]


In [None]:
#Create a confusion matrix
#y_test as first argument and the preds as second argument 
confusion_matrix(y_test, preds)

In [None]:
#transform confusion matrix into array
#the matrix is stored in a vaiable called confmtrx
confmtrx = np.array(confusion_matrix(y_test, preds))
#Create DataFrame from confmtrx array 
#rows for test: Male, Female, Infant designation as index 
#columns for preds: male, predicted_female, predicted_infant as column

pd.DataFrame(confmtrx, index=['Class_0','Class_1','Class_2','Class_3'],
columns=['predicted_Class_0','predicted_Class_1','predicted_Class_2','predicted_Class_3'])

In [None]:
#Accuracy statistics

print('Accuracy Score:', metrics.accuracy_score(y_test, preds))  

In [None]:
#Create classification report
class_report=classification_report(y_test, preds)
print(class_report)