In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from imblearn.under_sampling import RandomUnderSampler

# Method that split input data into that 67% train and 33% test data set
def get_train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

def corss_validation(model, X, y):
    score = cross_val_score(model, X, y,scoring='accuracy', cv=3, n_jobs = -1)
    print('Cross Validation Accuracy: %.3f mean with a standard deviation of %.3f' % (score.mean(), score.std()))


# Method that generate metrics for precision & recall, F1 score, as well as conufsion matrix
def classification_metrics(test, predict):
    met = metrics.classification_report(test, predict)
    print("classification metrics:")
    print(met)
    print('\n')
    # compute confusion matrix
    conf_matrix = metrics.confusion_matrix(test, predict)
    ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g')
    ax.set_title('Confusion Matrix\n')
    ax.set_xlabel('Predicted value')
    ax.set_ylabel('Actual Values ')
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])
    plt.show()

def regression_metrics(model, X_train, y_train, kfold):
    scoring = "neg_mean_absolute_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print("Mean Absolute Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "neg_mean_squared_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print() 
    print("Mean Squared Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "r2"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print()
    print("R squared val: ", results.mean())
    print("Standard Deviation: ", results.std())

In [3]:
df = pd.read_csv('data/clean_data.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName',
       'SideOfStreet', 'SourceElementKey', 'ParkingTimeLimitCategory',
       'ParkingSpaceCount', 'PaidParkingArea', 'PaidParkingSubArea',
       'ParkingCategory', 'Location', 'OccupancyPercentage', 'HasAvailability',
       'AvailableSpace', 'Latitude', 'Longitude', 'Holiday', 'DayOfTheWeek',
       'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'MinuteOfTheDay'],
      dtype='object')

# Data Transform

In [5]:
y = df['AvailableSpace']
features = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude', 'ParkingTimeLimitCategory', 'BlockfaceName']]
ohe = OneHotEncoder()
vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
ct = make_column_transformer(
    (ohe, ['ParkingTimeLimitCategory']),
    (vect,'BlockfaceName'),
    remainder = 'passthrough'
)
X = ct.fit_transform(features)

In [6]:
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Regression on balanced data

In [7]:
X_train, X_test, y_train, y_test = get_train_test(X_resampled, y_resampled)

In [8]:
rfr = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
rfr.fit(X_train, y_train)
regression_metrics(rfr, X_train, y_train, 10)

Mean Absolute Error:  -1.1985444580809466
Standard Deviation:  0.012883728879169684

Mean Squared Error:  -2.7606383662658005
Standard Deviation:  0.07808518083370353

R squared val:  0.9632590516339441
Standard Deviation:  0.001225719852069759


# Tuning HyperParameters

In [12]:
from sklearn.model_selection import GridSearchCV
# dict of parameters settings that has been used in hyperparameters tuning
param_grid = dict(
    max_depth = [10, 16, 22],
    random_state = [0, 42, None]
)

grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 0, n_jobs=-1) 

In [13]:
# fitting the model for grid search 
grid.fit(X_train, y_train) 
 
# print best parameter after tuning 
print(grid.best_params_) 

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backen

KeyboardInterrupt: 

# Front-End Prediction