In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from imblearn.under_sampling import RandomUnderSampler

# Method that split input data into that 67% train and 33% test data set
def get_train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

def corss_validation(model, X, y):
    score = cross_val_score(model, X, y,scoring='accuracy', cv=3, n_jobs = -1)
    print('Cross Validation Accuracy: %.3f mean with a standard deviation of %.3f' % (score.mean(), score.std()))


# Method that generate metrics for precision & recall, F1 score, as well as conufsion matrix
def classification_metrics(test, predict):
    met = metrics.classification_report(test, predict)
    print("classification metrics:")
    print(met)
    print('\n')
    # compute confusion matrix
    conf_matrix = metrics.confusion_matrix(test, predict)
    ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g')
    ax.set_title('Confusion Matrix\n')
    ax.set_xlabel('Predicted value')
    ax.set_ylabel('Actual Values ')
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])
    plt.show()

def regression_metrics(model, X_train, y_train, kfold):
    scoring = "neg_mean_absolute_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print("Mean Absolute Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "neg_mean_squared_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print() 
    print("Mean Squared Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "r2"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print()
    print("R squared val: ", results.mean())
    print("Standard Deviation: ", results.std())

In [2]:
df = pd.read_csv('data/clean_data.csv')

# Data Transform

In [3]:
y = df['AvailableSpace']
features = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude']]
# ohe = OneHotEncoder()
# vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
# ct = make_column_transformer(
#     (vect,'BlockfaceName'),
#     remainder = 'passthrough'
# )
# X = ct.fit_transform(features)

In [4]:
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_resample(features, y)

# Regression on balanced data

In [5]:
X_train, X_test, y_train, y_test = get_train_test(X_resampled, y_resampled)

In [12]:
X_train.head()

Unnamed: 0,DayOfTheWeek,MinuteOfTheDay,Latitude,Longitude
82229,2,57720,-122.341183,47.620326
5245,3,40680,-122.34303,47.620961
59458,2,35280,-122.334445,47.621436
100470,4,32640,-122.332884,47.621398
47216,5,51960,-122.333446,47.625072


In [6]:
rfr = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
rfr = rfr.fit(X_train, y_train)

In [7]:
regression_metrics(rfr, X_train, y_train, 10)

Mean Absolute Error:  -1.1610726900125061
Standard Deviation:  0.009916983422415078

Mean Squared Error:  -2.6522454455626905
Standard Deviation:  0.061030722014674355

R squared val:  0.9647029037968784
Standard Deviation:  0.0009779668367292777


# Front-End Prediction

In [8]:
f = ['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude']
user_in = [5, 600, -122.33651591,47.62326784]
user_df = pd.DataFrame(user_in).T
user_df.columns = f

In [9]:
out = rfr.predict(user_df)
out

array([4.71231028])

In [10]:
from joblib import dump
dump(rfr, 'random-forest.joblib')

['random-forest.joblib']

# Tuning HyperParameters

In [14]:
# from sklearn.model_selection import GridSearchCV
# # dict of parameters settings that has been used in hyperparameters tuning
# param_grid = dict(
#     max_depth = [10, 16, 22],
#     random_state = [0, 42, None]
# )

# grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 0, n_jobs=-1) 

In [15]:
# # fitting the model for grid search 
# grid.fit(X_train, y_train) 
 
# # print best parameter after tuning 
# print(grid.best_params_) 

{'max_depth': 16, 'random_state': 0}
