In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Method that split input data into that 67% train and 33% test data set
def get_train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

def corss_validation(model, X, y):
    score = cross_val_score(model, X, y,scoring='accuracy', cv=3, n_jobs = -1)
    print('Cross Validation Accuracy: %.3f mean with a standard deviation of %.3f' % (score.mean(), score.std()))


# Method that generate metrics for precision & recall, F1 score, as well as conufsion matrix
def classification_metrics(test, predict):
    met = metrics.classification_report(test, predict)
    print("classification metrics:")
    print(met)
    print('\n')
    # compute confusion matrix
    conf_matrix = metrics.confusion_matrix(test, predict)
    ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g')
    ax.set_title('Confusion Matrix\n')
    ax.set_xlabel('Predicted value')
    ax.set_ylabel('Actual Values ')
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])
    plt.show()

def regression_metrics(model, X_train, y_train, kfold):
    scoring = "neg_mean_absolute_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print("Mean Absolute Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "neg_mean_squared_error"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print() 
    print("Mean Squared Error: ", results.mean())
    print("Standard Deviation: ", results.std())

    scoring = "r2"
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    print()
    print("R squared val: ", results.mean())
    print("Standard Deviation: ", results.std())

In [2]:
df = pd.read_csv('data/clean_data.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName',
       'SideOfStreet', 'SourceElementKey', 'ParkingTimeLimitCategory',
       'ParkingSpaceCount', 'PaidParkingArea', 'PaidParkingSubArea',
       'ParkingCategory', 'Location', 'OccupancyPercentage', 'HasAvailability',
       'AvailableSpace', 'Latitude', 'Longitude', 'Holiday', 'DayOfTheWeek',
       'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'MinuteOfTheDay'],
      dtype='object')

# Data Transform

In [7]:
y = df['AvailableSpace']
features = df[['DayOfTheWeek', 'MinuteOfTheDay', 'Latitude', 'Longitude', 'ParkingTimeLimitCategory', 'BlockfaceName']]
ohe = OneHotEncoder()
vect = TfidfVectorizer(analyzer='word', lowercase = True, stop_words='english')
ct = make_column_transformer(
    (ohe, ['ParkingTimeLimitCategory']),
    (vect,'BlockfaceName'),
    remainder = 'passthrough'
)
X = ct.fit_transform(features)

In [9]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Regression on balanced data

In [10]:
X_train, X_test, y_train, y_test = get_train_test(X_resampled, y_resampled)
model = RandomForestRegressor(max_depth=16, random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=16, n_jobs=-1, random_state=0)

In [11]:
regression_metrics(model, X_train, y_train, 3)

Mean Absolute Error:  -1.7555723213611742
Standard Deviation:  0.05648922398365396

Mean Squared Error:  -6.173576259301595
Standard Deviation:  0.2538566908087089

R squared val:  0.948682505535515
Standard Deviation:  0.002535482622250676


# Tuning HyperParameters

# Front-End Prediction