In [45]:
%matplotlib inline

import datetime
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection as model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [46]:
# import warnings
# warnings.filterwarnings('ignore')

In [47]:
# Load the previously extracted flights data from its CSV export
flights = pd.read_csv(filepath_or_buffer="flights-2022-10-25-1326.csv")

In [48]:
# Work on an independent copy so the untouched `flights` frame stays available
# for measuring how much data is lost after clean-up
flights_df = flights.copy(deep=True)

## Random Forest

In [None]:
# Select the model inputs (X) and the prediction target (y)
## Add columns to X
X = flights_df.loc[:, ['columns......']]   # features
y = flights_df.loc[:, 'arr_delay']         # labels

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Preview the first three rows, then the (rows, columns) shape of the working frame
display(flights_df.head(3), flights_df.shape)

In [None]:
# Build a random-forest regressor. The target (arr_delay) is evaluated further
# down with MAE and R2, which are regression metrics, so a regressor rather than
# a classifier is the matching estimator. random_state pins the forest's
# randomness so re-running the notebook reproduces the same model.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training split only
model.fit(X_train, y_train)

# Save the fitted model; the context manager closes the file handle even if
# pickling raises
filename = 'Random_Forest.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Predict the target for the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Absolute error of every test prediction, in the same units as the target
errors = np.abs(y_pred - y_test)

# Print the mean absolute error (MAE).
# No unit suffix is printed: the error is in the units of arr_delay
# (presumably minutes -- TODO confirm against the data dictionary), not degrees.
print('Mean Absolute Error:', round(np.mean(errors), 2))

In [None]:
# Per-row absolute percentage error, the ingredient of MAPE.
# Rows whose actual value is exactly 0 are excluded because the ratio is
# undefined there (division by zero gives inf), and the denominator is taken in
# absolute value so negative actual values do not flip the sign of the error.
has_nonzero_actual = y_test != 0
mape = 100 * (errors[has_nonzero_actual] / np.abs(y_test[has_nonzero_actual]))

# "Accuracy" is reported as 100 - MAPE; it goes negative when the average
# error is larger than the average magnitude of the actual values.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Score the fitted model on both splits.
# r2_score is called explicitly for the training split too: model.score() only
# returns R2 for regressors (for classifiers it returns mean accuracy), so the
# printed label would otherwise depend on the estimator type.
train_r2 = r2_score(y_train, model.predict(X_train))
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)

print('R2 Score for train data:', round(train_r2, 3))
print('R2 Score for test data:', round(test_r2, 3))
print('MAE Score for test data: ', round(test_mae, 2))

# Pair each actual test value with its prediction for inspection and plotting
df = pd.DataFrame(
        {
            'actual': y_test,
            'predicted': y_pred
        }
)
# A bare expression in the middle of a cell is not rendered, so show a preview explicitly
display(df.head())

# Predicted-vs-actual scatter with a fitted regression line; the legend carries the test R2
sns.regplot(x='predicted', y='actual', data=df, label='R2:' + str(round(test_r2, 3)))
plt.legend(loc=0)
plt.show()

#### DRAFT

## Random Forest Grid Search

In [None]:
# Hyperparameters to tune.
# The criteria are split-quality measures for regression trees, matching the
# continuous arr_delay target (classification criteria such as 'gini' do not
# apply to a regressor).
param_grid = {
    'n_estimators': [5, 10, 15, 100, 150],
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [5, 8, 10, 12, 15],
}

# Run an exhaustive 5-fold cross-validated search over the grid, then predict
# the test split with the fitted search object. random_state keeps the
# candidate forests reproducible between runs.
model = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=param_grid, cv=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Tabulate the cross-validation results (one row per parameter combination)
df = pd.DataFrame(model.cv_results_)

In [None]:
# Show the best parameter combination found and its cross-validated score
display(model.best_params_, model.best_score_)

# Three best-ranked parameter combinations (rank 1 is best)
df.sort_values(by='rank_test_score').head(3)

In [None]:
def correlation(df):
    """
    Plot the correlation matrix as an annotated heatmap.

    Only the cells below the diagonal are drawn; the diagonal and the upper
    triangle are masked because they repeat information.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to correlate. Non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        The full (unmasked) pairwise correlation matrix of the numeric columns.
    """
    # Compute the matrix once, with numeric_only=True everywhere, so the mask,
    # the heatmap and the returned frame all describe the same set of columns.
    corr = df.corr(numeric_only=True)

    # Create a mask to exclude the redundant cells that make up half of the graph
    # (np.triu keeps the diagonal, so the diagonal is hidden as well).
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Create the heatmap with the mask and with annotation
    sns.heatmap(data=corr, mask=mask, annot=True)
    return corr