# Instructor Do: Decision Trees

In [1]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# URL of the merged, cleaned airport CSV; read straight into a DataFrame
data = "https://airport-efficiency.s3.amazonaws.com/Merged_cleaned_airport_data.csv"
df = pd.read_csv(filepath_or_buffer=data)

# Preview the first rows
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,BRANDED_CODE_SHARE,ORIGIN,DEST,DEST_STATE_NM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,DELAYED,DATE,COVID,DELAY_REASON
0,2018,1,1,UA,BWI,DEN,Colorado,1608,1600.0,-8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-22,0.0,0.0
1,2018,1,1,UA,BWI,ORD,Illinois,1644,1637.0,-7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-22,0.0,0.0
2,2018,1,1,UA,BWI,IAH,Texas,1806,1800.0,-6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-22,0.0,0.0
3,2018,1,1,UA,BWI,IAH,Texas,828,828.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-08,0.0,0.0
4,2018,1,1,UA,BWI,DEN,Colorado,810,802.0,-8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-08,0.0,0.0


In [3]:
# Collapse each "<carrier>_CODESHARE" label into its plain carrier code
codeshare_to_brand = {
    "UA_CODESHARE": "UA",
    "AA_CODESHARE": "AA",
    "DL_CODESHARE": "DL",
    "AS_CODESHARE": "AS",
}
for codeshare_label, brand_label in codeshare_to_brand.items():
    is_codeshare = df["BRANDED_CODE_SHARE"] == codeshare_label
    df.loc[is_codeshare, "BRANDED_CODE_SHARE"] = brand_label


In [4]:
# State names grouped under the "northeast" region label
northeast = [
    "Massachusetts",
    "Rhode Island",
    "Connecticut",
    "Vermont",
    "New Hampshire",
    "Maine",
    "Pennsylvania",
    "New Jersey",
    "New York",
]

In [5]:
# State names grouped under the "souteast" region label.
# NOTE: the variable name (sic) is kept as-is because the cell that builds
# the region lookup refers to it by this spelling.
souteast = [
    "Georgia",
    "North Carolina",
    "South Carolina",
    "Virginia",
    "West Virginia",
    "Kentucky",
    "Tennessee",
    "Mississippi",
    "Alabama",
    "Delaware",
    "Maryland",
    "Florida",
    "Louisiana",
    "Arkansas",
]

In [6]:
# State names grouped under the "midwest" region label
midwest = [
    "Minnesota",
    "Wisconsin",
    "Illinois",
    "Ohio",
    "Indiana",
    "Michigan",
    "Missouri",
    "Iowa",
    "Kansas",
    "Nebraska",
    "North Dakota",
    "South Dakota",
]

In [7]:
# State names grouped under the "southwest" region label
southwest = [
    "New Mexico",
    "Arizona",
    "Oklahoma",
    "Texas",
]

In [8]:
# State names grouped under the "west" region label
west = [
    "California",
    "Colorado",
    "Nevada",
    "Hawaii",
    "Alaska",
    "Oregon",
    "Utah",
    "Idaho",
    "Montana",
    "Wyoming",
    "Washington",
]

In [9]:
# Territory names grouped under the "noncontinental" region label
noncontinental = [
    "U.S. Virgin Islands",
    "Puerto Rico",
]

In [10]:
# One row per region (row label = region name), one column per position in
# that region's state list; shorter lists are padded by pandas.
region_names = ['northeast', 'souteast', 'midwest', 'southwest', 'west', 'noncontinental']
region_states = [northeast, souteast, midwest, southwest, west, noncontinental]
regions = pd.DataFrame(region_states, index=region_names)

# Reshape to long form: the 'index' column holds the region, 'value' the state
regions = regions.reset_index()
regions = regions.melt('index')

# Look up each destination state's region and store it in a new column
state_to_region = dict(zip(regions['value'], regions['index']))
df['Region'] = df['DEST_STATE_NM'].map(state_to_region)

In [31]:
# Names of the columns to remove from the frame before encoding
drop = [
    "DEP_TIME",
    "DEP_DELAY_GROUP",
    "DEST_STATE_NM",
    "DEST",
    "DEP_DELAY",
    "DEP_TIME_BLK",
    "LATE_AIRCRAFT_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "SECURITY_DELAY",
    "NAS_DELAY",
    "DELAY_REASON",
]


In [32]:
# Remove the columns listed in `drop`, leaving `df` itself untouched.
# `columns=` replaces the positional `axis` argument (`df.drop(drop, 1)`),
# which is deprecated since pandas 1.3 and rejected by pandas 2.0+.
df_dropped = df.drop(columns=drop)


  """Entry point for launching an IPython kernel.


In [33]:
# List the column names that remain after the drop
df_dropped.columns.tolist()

['YEAR',
 'MONTH',
 'DAY_OF_WEEK',
 'BRANDED_CODE_SHARE',
 'ORIGIN',
 'CRS_DEP_TIME',
 'CANCELLED',
 'DIVERTED',
 'DELAYED',
 'DATE',
 'COVID',
 'Region']

In [34]:
# One-hot encode the carrier code column
brand_df = pd.get_dummies(data=df_dropped, columns=["BRANDED_CODE_SHARE"])


In [35]:
# One-hot encode the origin airport column
origin_df = pd.get_dummies(data=brand_df, columns=["ORIGIN"])

In [36]:
# One-hot encode the destination region column
final_df = pd.get_dummies(data=origin_df, columns=["Region"])

In [37]:
# Remove the DATE column from the encoded frame.
# `columns=` replaces the positional `axis` argument (`drop('DATE', 1)`),
# which is deprecated since pandas 1.3 and rejected by pandas 2.0+.
final_df = final_df.drop(columns="DATE")

  """Entry point for launching an IPython kernel.


## Loading and Preprocessing the Encoded Flight Delay Data

In [38]:
# Feature matrix: every column of the encoded frame except the DELAYED target
X = final_df.drop(labels="DELAYED", axis="columns")
X.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,CANCELLED,DIVERTED,COVID,BRANDED_CODE_SHARE_AA,BRANDED_CODE_SHARE_AS,BRANDED_CODE_SHARE_B6,...,BRANDED_CODE_SHARE_WN,ORIGIN_BWI,ORIGIN_DCA,ORIGIN_IAD,Region_midwest,Region_noncontinental,Region_northeast,Region_souteast,Region_southwest,Region_west
0,2018,1,1,1608,0.0,0.0,0.0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,2018,1,1,1644,0.0,0.0,0.0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,2018,1,1,1806,0.0,0.0,0.0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,2018,1,1,828,0.0,0.0,0.0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,2018,1,1,810,0.0,0.0,0.0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [39]:
# Target: the DELAYED column, reshaped into a single-column 2-D array
delayed_flags = final_df["DELAYED"].values
y = delayed_flags.reshape(-1, 1)

# Peek at the first five targets
y[:5]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [40]:
# Split features and target into train/test sets using the default split
# proportions; the seed is fixed so the same rows are selected on every run.
default_split = train_test_split(X, y, random_state=1000)
X_train, X_test, y_train, y_test = default_split

In [41]:
# Report the dimensions of each piece of the split
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(812203, 26)
(270735, 26)
(812203, 1)
(270735, 1)


In [42]:
# Second split of the same data with 80% of the rows in the training set
split_80_20 = train_test_split(X, y, random_state=1000, train_size=0.80)
X_train2, X_test2, y_train2, y_test2 = split_80_20

In [43]:
# Report the dimensions of each piece of the 80/20 split
for split_part in (X_train2, X_test2, y_train2, y_test2):
    print(split_part.shape)

(866350, 26)
(216588, 26)
(866350, 1)
(216588, 1)


In [44]:
# Scaler object (scikit-learn defaults spelled out: centre and scale each feature)
scaler = StandardScaler(with_mean=True, with_std=True)

In [45]:
# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X=X_train)

In [46]:
# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(feature_set) for feature_set in (X_train, X_test)
)


## Fitting the Decision Tree Model

In [47]:
# Creating the decision tree classifier instance.
# scikit-learn permutes the candidate features randomly at every split, so
# without a seed ties may be broken differently between runs and the metrics
# reported further down would not be reproducible. The seed matches the one
# already used for the train/test split.
model = tree.DecisionTreeClassifier(random_state=1000)

In [48]:
# Train the decision tree on the scaled training features and their targets
model = model.fit(X=X_train_scaled, y=y_train)

## Making Predictions Using the Tree Model

In [49]:
# Predict the target for every row of the scaled testing features
predictions = model.predict(X=X_test_scaled)

## Model Evaluation

In [50]:
# Accuracy of the predictions against the held-out targets
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix, wrapped in a labelled DataFrame for display
cm = confusion_matrix(y_test, predictions)
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

In [51]:
# Show the confusion matrix, the accuracy and the per-class report in turn
print("Confusion Matrix")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,206935,15558
Actual 1,39695,8547


Accuracy Score : 0.7959148244593421
Classification Report
              precision    recall  f1-score   support

         0.0       0.84      0.93      0.88    222493
         1.0       0.35      0.18      0.24     48242

    accuracy                           0.80    270735
   macro avg       0.60      0.55      0.56    270735
weighted avg       0.75      0.80      0.77    270735

