<hr style="border:2px solid gray">

# **NOTE:** Run all cells up to Step 1

### Import libraries and define the data path

In [None]:
# import the necessary libraries 

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
%%capture

!pip install category_encoders==2.*
from category_encoders import OrdinalEncoder

In [None]:
# Update the DATA_PATH variable so it points at the folder (or URL prefix)
# that contains the flight data files.

import sys

# The hosted copy is used when the `google.colab` module is loaded;
# otherwise the local placeholder path applies (edit it to match your machine).
DATA_PATH = (
  'https://raw.githubusercontent.com/bloominstituteoftechnology/ds_code_along_unit_2/main/data/flight/'
  if 'google.colab' in sys.modules
  else '..../data/'
)

### Wrangle the dataset

We are going to work on the Flight dataset today. Instead of predicting the price, we will convert the problem into a binary classification task and try to predict whether or not a flight is operated by a particular airline of interest.

The wrangle function below is very similar to the one in the previous code-along notebook, with a few differences. We will no longer keep the `Date_of_Journey` column, because we will be using cross-validation in today's notebook and want to avoid working with time-series cross-validation. This topic will be covered in Unit 4. Furthermore, we will be working with only the training file (`Data_Train.xlsx`) and not the test file.

In [None]:
def wrangle(filepath):
  """Load the flight spreadsheet and return a cleaned DataFrame.

  Steps, in order:
    1. read the Excel file at ``filepath``;
    2. drop every row that has a missing value;
    3. convert ``Duration`` to minutes with ``convert_duration``;
    4. convert ``Total_Stops`` to an integer number of stops;
    5. drop the columns that are not used as features.

  Parameters
  ----------
  filepath : str
      Path or URL handed directly to ``pd.read_excel``.

  Returns
  -------
  pandas.DataFrame
      The cleaned frame. The ``Airline`` column is still present in this
      starter version (see the ADD placeholders below).
  """
  # Drop row(s) where there is/are missing values
  df = pd.read_excel(filepath).dropna()

  # Duration is in a string format. Converting duration into minutes.
  df['Duration'] = df['Duration'].apply(convert_duration)

  # formatting the stops. The first word of each entry is either 'non-stop'
  # or the number of stops ('1 stop' -> '1', '2 stops' -> '2').
  # The result is assigned back to the column instead of calling
  # `df[col].replace(..., inplace=True)`: that chained in-place call acts on a
  # temporary Series and leaves `df` unchanged under pandas Copy-on-Write.
  stops = df['Total_Stops'].str.split(" ").str[0]

  # 'non-stop' becomes '0' so the whole column can be converted to int
  df['Total_Stops'] = stops.replace('non-stop', '0').astype(int)

  # drop the columns that are not needed
  df = df.drop(columns=['Date_of_Journey', 'Route','Dep_Time','Arrival_Time','Additional_Info'])

  # ADD: create new binary target column


  # ADD: drop airline column to prevent data leakage



  return df

def convert_duration(duration):
    """Convert a duration string such as '2h 50m' into total minutes.

    The string is split on whitespace. Each token is a whole number followed
    by a one-letter unit: 'h' for hours or 'm' for minutes.

    Parameters
    ----------
    duration : str
        For example '2h 50m', '19h' or '5m'.

    Returns
    -------
    int
        Total number of minutes.

    Raises
    ------
    ValueError
        If the string is empty, a token has an unknown unit, or the numeric
        part of a token is not an integer.
    """
    minutes_per_unit = {'h': 60, 'm': 1}

    tokens = duration.split()
    if not tokens:
        raise ValueError(f"empty duration string: {duration!r}")

    total_minutes = 0
    for token in tokens:
        # The unit suffix, not the token's position, decides the multiplier,
        # so a lone minutes value like '5m' is not mistaken for hours.
        unit = token[-1].lower()
        if unit not in minutes_per_unit:
            raise ValueError(f"unrecognised duration token {token!r} in {duration!r}")
        total_minutes += int(token[:-1]) * minutes_per_unit[unit]
    return total_minutes


# Load and clean the training spreadsheet; DATA_PATH is the folder/URL prefix chosen earlier.
df = wrangle(DATA_PATH + 'Data_Train.xlsx')

In [None]:
# Preview the first rows of the cleaned dataset returned by wrangle()
df.head()

In [None]:
# Do we have any null values left? Compare each column's non-null count with the row count,
# and check that Duration and Total_Stops are now numeric.
df.info()

<hr style="border:2px solid gray">

# **STEP: 1/4** - Convert multi-class target into binary target

In [None]:
# We will turn our multi-class problem into a binary problem:
# predict whether or not the flight is operated by Jet Airways.
# The counts below show how many rows each airline has.

df['Airline'].value_counts()

In [None]:
  # ADD the following lines to the wrangle function

  # # create new binary target column
  # df['Jet_Airways'] = 

  # # drop airline column to prevent data leakage




NOTE: Final wrangle function should look like this:

In [None]:
# final version of wrangle function

def wrangle(filepath):
  """Load the flight spreadsheet and return a cleaned, binary-target DataFrame.

  Steps, in order:
    1. read the Excel file at ``filepath``;
    2. drop every row that has a missing value;
    3. convert ``Duration`` to minutes with ``convert_duration``;
    4. convert ``Total_Stops`` to an integer number of stops;
    5. drop the columns that are not used as features;
    6. add the binary target ``Jet_Airways`` and drop ``Airline``.

  Parameters
  ----------
  filepath : str
      Path or URL handed directly to ``pd.read_excel``.

  Returns
  -------
  pandas.DataFrame
      Cleaned frame with a 0/1 ``Jet_Airways`` column and no ``Airline`` column.
  """
  # Drop row(s) where there is/are missing values
  df = pd.read_excel(filepath).dropna()

  # Duration is in a string format. Converting duration into minutes.
  df['Duration'] = df['Duration'].apply(convert_duration)

  # formatting the stops. The first word of each entry is either 'non-stop'
  # or the number of stops ('1 stop' -> '1', '2 stops' -> '2').
  # The result is assigned back to the column instead of calling
  # `df[col].replace(..., inplace=True)`: that chained in-place call acts on a
  # temporary Series and leaves `df` unchanged under pandas Copy-on-Write.
  stops = df['Total_Stops'].str.split(" ").str[0]

  # 'non-stop' becomes '0' so the whole column can be converted to int
  df['Total_Stops'] = stops.replace('non-stop', '0').astype(int)

  # drop the columns that are not needed
  df = df.drop(columns=['Date_of_Journey','Route','Dep_Time','Arrival_Time','Additional_Info'])

  # create new binary target column: 1 when the airline label is exactly
  # 'Jet Airways', 0 for every other label
  df['Jet_Airways'] = (df['Airline'] == 'Jet Airways').astype(int)

  # drop airline column to prevent data leakage (the target is derived from it)
  df = df.drop(columns='Airline')

  return df

def convert_duration(duration):
    """Convert a duration string such as '2h 50m' into total minutes.

    The string is split on whitespace. Each token is a whole number followed
    by a one-letter unit: 'h' for hours or 'm' for minutes.

    Parameters
    ----------
    duration : str
        For example '2h 50m', '19h' or '5m'.

    Returns
    -------
    int
        Total number of minutes.

    Raises
    ------
    ValueError
        If the string is empty, a token has an unknown unit, or the numeric
        part of a token is not an integer.
    """
    minutes_per_unit = {'h': 60, 'm': 1}

    tokens = duration.split()
    if not tokens:
        raise ValueError(f"empty duration string: {duration!r}")

    total_minutes = 0
    for token in tokens:
        # The unit suffix, not the token's position, decides the multiplier,
        # so a lone minutes value like '5m' is not mistaken for hours.
        unit = token[-1].lower()
        if unit not in minutes_per_unit:
            raise ValueError(f"unrecognised duration token {token!r} in {duration!r}")
        total_minutes += int(token[:-1]) * minutes_per_unit[unit]
    return total_minutes


# Re-run the load with the final wrangle() so df carries the Jet_Airways target column.
df = wrangle(DATA_PATH + 'Data_Train.xlsx')


In [None]:
# Preview the wrangled data; the target used below is the 'Jet_Airways' column.
df.head()

In [None]:
# Separate the label from the predictors:
# X keeps every column except the target, y is the target column on its own.

target = 'Jet_Airways'
X = df.drop(columns=target)
y = df[target]

<hr style="border:2px solid gray">

# **STEP: 2/4** - Randomized Search method

In [None]:
# Build model pipeline: an OrdinalEncoder step followed by a random forest classifier.
# random_state=42 makes the forest reproducible; n_jobs=-1 lets it use all available cores.

clf = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42, n_jobs=-1)
)

# Build dictionary with parameters names (str) as keys and distributions or lists of parameters to try.
# make_pipeline names each step after its lowercased class name, so forest parameters are
# addressed as 'randomforestclassifier__<param>', e.g. 'randomforestclassifier__n_estimators'.

param_dist = {
    # TODO: fill in the parameters to search over
}


In [None]:
# RandomizedSearchCV Method
# TODO: assign a RandomizedSearchCV built from `clf` and `param_dist`.
# The search is fitted on the full X, y; its scores come from cross-validation inside the search.

model_rf_random = 

model_rf_random.fit(X, y)

In [None]:
# Parameter setting that gave the best results on the hold out data.


In [None]:
# Mean cross-validated score of the best_estimator


<hr style="border:2px solid gray">

# **STEP: 3/4** - Re-train a new model and build confusion matrix

In [None]:
# Randomly split data into Train and Validation sets
# test_size=0.2 holds out 20% of the rows for validation; random_state=42 makes the split repeatable.

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Determine the baseline for our classification task:
# the share of the most frequent class in y_train, i.e. the accuracy of always predicting that class.
print('BASELINE accuracy', y_train.value_counts(normalize=True).max())

In [None]:
# Build a new Random Forest model with the Search Method results
# TODO: assign a model configured with the best parameters found in Step 2.
# It is fitted on the training split only, so X_val / y_val stay unseen for evaluation.

model_rf = 

model_rf.fit(X_train,y_train);


In [None]:
# Evaluate the model performance and compare it to the baseline.
# Accuracy is computed on both splits; a training score far above the validation score suggests overfitting.

print('RF training accuracy:', accuracy_score(y_train, model_rf.predict(X_train)))
print('RF validation accuracy:', accuracy_score(y_val, model_rf.predict(X_val)))

In [None]:
# Build confusion matrix
# TODO: pass the fitted model, the validation features and the validation labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use ConfusionMatrixDisplay.from_estimator instead (confirm the installed version).

plot_confusion_matrix(

); 

In [None]:
# precision = tp /(tp+fp)
# recall = tp /(tp+fn)
# accuracy = (tn+tp)/(tn+fp+fn+tp)
# f1-score = 2 * (precision * recall) / (precision + recall)

<hr style="border:2px solid gray">

# **STEP: 4/4** - Modify classification threshold

In [None]:
# Build classification report for the model's default predictions on the validation set.
# target_names are matched to the labels in sorted order,
# presumably 0 -> 'Other Airline' and 1 -> 'Jet Airways'; confirm the target is encoded 0/1.

print(classification_report(y_val,
                            model_rf.predict(X_val),
                            target_names=['Other Airline', 'Jet Airways'])) 

In [None]:
# Determine model predictions (made with the model's default decision rule)
y_pred = model_rf.predict(X_val)

# Select a new threshold value
# TODO: assign the probability cut-off to apply to y_pred_proba.
classification_threshold = 

# Determine the prediction probabilities
# [:, -1] keeps the last column of predict_proba, i.e. the probability of the last class
# in the model's class order (presumably Jet_Airways == 1; verify with model_rf.classes_).
y_pred_proba = model_rf.predict_proba(X_val)[:, -1]


In [None]:
# Start the new predictions from a copy, so the default predictions in y_pred stay unchanged
# when the new threshold is applied to y_pred_new.
y_pred = model_rf.predict(X_val)
y_pred_new = y_pred.copy()

In [None]:
# Apply the new threshold to get new predictions



In [None]:
# Check accuracy metric with new classification threshold
# (y_pred_new must already hold the re-thresholded predictions from the cell above).
print('RF validation accuracy -- new threshold:', accuracy_score(y_val, y_pred_new))

In [None]:
# Build classification report with new predictions, to compare precision and recall
# against the report produced with the default predictions.

print(classification_report(y_val,
                            y_pred_new,
                            target_names=['Other Airline', 'Jet Airways'])) 