In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from imblearn.under_sampling import RandomUnderSampler

# Method that split input data into that 67% train and 33% test data set
def get_train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

def corss_validation(model, X, y):
    score = cross_val_score(model, X, y,scoring='accuracy', cv=3, n_jobs = -1)
    print('Cross Validation Accuracy: %.3f mean with a standard deviation of %.3f' % (score.mean(), score.std()))


# Method that generate metrics for precision & recall, F1 score, as well as conufsion matrix
def classification_metrics(test, predict):
    met = metrics.classification_report(test, predict)
    print("classification metrics:")
    print(met)
    print('\n')
    # compute confusion matrix
    conf_matrix = metrics.confusion_matrix(test, predict)
    ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g')
    ax.set_title('Confusion Matrix\n')
    ax.set_xlabel('Predicted value')
    ax.set_ylabel('Actual Values ')
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])
    plt.show()

def regression_metrics(model, X_train, y_train, kfold):
    scoring = "neg_mean_absolute_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print("Mean Absolute Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "neg_mean_squared_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print() 
    print("Mean Squared Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "r2"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print()
    print("R squared val: ", results.mean())
    print("Standard Deviation: ", results.std())

In [2]:
df = pd.read_csv('data/clean_data.csv')

# Data Transform

In [4]:
y = df['AvailableSpace']
features = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude', 'ParkingTimeLimitCategory', 'BlockfaceName', 'ParkingSpaceCount']]
# ohe = OneHotEncoder()
vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
ct = make_column_transformer(
    (vect,'BlockfaceName'),
    remainder = 'passthrough'
)
X = ct.fit_transform(features)

In [5]:
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Regression on balanced data

In [6]:
X_train, X_test, y_train, y_test = get_train_test(X_resampled, y_resampled)

In [7]:
rfr = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
rfr = rfr.fit(X_train, y_train)

In [8]:
regression_metrics(rfr, X_train, y_train, 10)

Mean Absolute Error:  -1.1581731372527444
Standard Deviation:  0.011899747139419575

Mean Squared Error:  -2.6514704782154395
Standard Deviation:  0.07404408552371411

R squared val:  0.9647124495071442
Standard Deviation:  0.0011478135836000545


In [9]:
features2 = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude', 'BlockfaceName', 'ParkingSpaceCount']]
vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
X2 = ct.fit_transform(features)
X_resampled2, y_resampled2 = rus.fit_resample(X2, y)
X_train2, X_test2, y_train2, y_test2 = get_train_test(X_resampled2, y_resampled2)

In [10]:
rfr2 = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
rfr2 = rfr2.fit(X_train2, y_train2)

In [11]:
regression_metrics(rfr2, X_train2, y_train2, 10)

Mean Absolute Error:  -1.1581731372527444
Standard Deviation:  0.011899747139419575

Mean Squared Error:  -2.6514704782154395
Standard Deviation:  0.07404408552371411

R squared val:  0.9647124495071442
Standard Deviation:  0.0011478135836000545


In [23]:
features3 = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude', 'BlockfaceName','ParkingTimeLimitCategory']]
vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
ohe = OneHotEncoder()
vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
ct = make_column_transformer(
    (vect,'BlockfaceName'),
    (ohe, ['ParkingTimeLimitCategory']),
    remainder = 'passthrough'
)
X3 = ct.fit_transform(features)
X_resampled3, y_resampled3 = rus.fit_resample(X3, y)
X_train3, X_test3, y_train3, y_test3 = get_train_test(X_resampled3, y_resampled3)

In [42]:
rfr3 = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
rfr3 = rfr3.fit(X_train3, y_train3)

RandomForestRegressor(max_depth=16, n_jobs=-1, random_state=0)

In [43]:
regression_metrics(rfr3, X_train3, y_train3, 10)

Mean Absolute Error:  -1.1582323983565659
Standard Deviation:  0.011906212336118079

Mean Squared Error:  -2.6514012421051807
Standard Deviation:  0.07367194778906538

R squared val:  0.9647133185658721
Standard Deviation:  0.001145400674094431


In [None]:
features4 = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude']]
X_resampled4, y_resampled4 = rus.fit_resample(features4, y)
X_train4, X_test4, y_train4, y_test4 = get_train_test(X_resampled4, y_resampled4)

In [None]:
rfr4 = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
rfr4 = rfr4.fit(X_train4, y_train4)

In [None]:
regression_metrics(rfr4, X_train4, y_train4, 10)

# Front-End Prediction

In [41]:
f = ['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude', 'BlockfaceName','ParkingTimeLimitCategory']
user_in = [5, 600, -122.33651591,47.62326784,'REPUBLICAN ST BETWEEN TERRY AVE N AND BOREN AVE N', 8]
user_df = pd.DataFrame(user_in).T
user_df.columns = f

transformed_user_in = ct.fit_transform(user_df)
transformed_user_in 


array([[0.7071067811865475, 0.35355339059327373, 0.35355339059327373,
        0.35355339059327373, 0.35355339059327373, 1.0, 5, 600,
        -122.33651591, 47.62326784]], dtype=object)

In [44]:
# out = rfr.predict(transformed_user_in)
# out

ValueError: X has 10 features, but DecisionTreeRegressor is expecting 56 features as input.

# Tuning HyperParameters

In [14]:
# from sklearn.model_selection import GridSearchCV
# # dict of parameters settings that has been used in hyperparameters tuning
# param_grid = dict(
#     max_depth = [10, 16, 22],
#     random_state = [0, 42, None]
# )

# grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 0, n_jobs=-1) 

In [15]:
# # fitting the model for grid search 
# grid.fit(X_train, y_train) 
 
# # print best parameter after tuning 
# print(grid.best_params_) 

{'max_depth': 16, 'random_state': 0}
