## DS 5220 - Project : Modelling (Non - Linear Regression, KNN, SVM)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

#### Import processed data

In [2]:
d = pd.read_csv("C:/Users/aditi/OneDrive/Desktop/NEU/NEU Summer 2021/DS 5220/Project/data.csv")

#### Modelling

In [41]:
Y = d[["TotalTimeStopped_p20", "TotalTimeStopped_p50", "TotalTimeStopped_p80", "DistanceToFirstStop_p20","DistanceToFirstStop_p50", "DistanceToFirstStop_p80" ]]
X = d[[ "UNIIntersection", 'Latitude', 'Longitude',
         'EntryHeading', 'ExitHeading', 'Hour', 'Weekend',
       'Month', 'EncodedEntryAddress', 'EncodedExitAddress', 'IfEntryEqualsExit',
       'Atlanta', 'Boston', 'Chicago', 'Philadelphia', 'Peak_Hours', 'vacationornot']]


In [42]:
from sklearn.model_selection import train_test_split
X_train, X_validate, y_train, y_validate = train_test_split(X, Y, test_size = 0.2, random_state = 1)

#### Non linear regression

In [43]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as MSE
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [44]:
X_train.head(5)

Unnamed: 0,Intersection,Latitude,Longitude,EntryHeading,ExitHeading,Hour,Weekend,Month,EntryAdressEncoded,ExitAddressEncoded,IfEntryExitSame,Atlanta,Boston,Chicago,Philadelphia,Peak_Hours,IsVacation
179120,1354,42.339052,-71.057133,1.0,1.0,1,1,7,1.0,1.0,True,0,1,0,0,0,1
48689,1218,33.774239,-84.40234,1.75,1.25,14,0,12,4.0,4.0,False,1,0,0,0,0,1
808951,2026,39.997006,-75.117383,1.5,1.0,13,0,10,1.0,0.0,False,0,0,0,1,0,0
355120,4509,41.866034,-87.734972,0.0,0.0,8,0,7,1.0,1.0,True,0,0,1,0,0,1
770206,1533,40.026178,-75.09697,0.5,0.5,22,0,6,2.0,2.0,True,0,0,0,1,0,1


#### Without regularisation (lasso)

In [102]:
poly_features = PolynomialFeatures(degree = 3, include_bias = False)

In [103]:
X_poly = poly_features.fit_transform(X_train)
X_poly_vali = poly_features.fit_transform(X_validate)

In [104]:
reg = LinearRegression()
reg.fit(X_poly, y_train)
y_pred = reg.predict(X_poly)

In [105]:
y_pred_vali = reg.predict(X_poly_vali)

#### Degree - 2

In [91]:
rmse = np.sqrt(MSE(y_train, y_pred))
rmse

73.52781437953914

In [92]:
mse = MSE(y_train, y_pred)
mse

5406.339487431962

In [94]:
r2_score(y_train, y_pred)

0.05081096356872039

In [99]:
rmse = np.sqrt(MSE(y_pred_vali, y_validate))
rmse

73.92075510484706

In [100]:
mse = MSE(y_pred_vali, y_validate)
mse

5464.278035270773

In [101]:
r2_score(y_pred_vali, y_validate)

-24.766888798579572

#### Degree - 3

In [68]:
rmse = np.sqrt(MSE(y_train, y_pred))
rmse

72.54692074596993

In [69]:
mse = MSE(y_train, y_pred)
mse

5263.055709722042

In [70]:
r2_score(y_train, y_pred)

0.07914117884777616

In [106]:
rmse = np.sqrt(MSE(y_pred_vali, y_validate))
rmse

73.09800881454407

In [107]:
mse = MSE(y_pred_vali, y_validate)
mse

5343.318892651164

In [108]:
r2_score(y_pred_vali, y_validate)

-13.702297468719271

#### With regularisation (lasso)

In [45]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor

In [46]:
poly_features = PolynomialFeatures(degree = 3, include_bias = False)

In [47]:
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)

In [49]:
X_poly = poly_features.fit_transform(X_train_scaled)

In [1]:
# Using GD

In [50]:
reg = SGDRegressor(alpha = 1, penalty = 'l1', tol = 0.01, early_stopping = True)
clf = MultiOutputRegressor(reg)
clf.fit(X_poly, y_train)

MultiOutputRegressor(estimator=SGDRegressor(alpha=1, early_stopping=True,
                                            penalty='l1', tol=0.01))

In [51]:
y_train_predict = clf.predict(X_poly)

#### alpha - 1

In [55]:
rmse = np.sqrt(MSE(y_train, y_train_predict))

In [56]:
rmse

6178206240597.008

In [57]:
mse = MSE(y_train, y_train_predict)
mse

3.8170232351351816e+25

In [58]:
clf.score(X_poly, y_train)

-2.6755865759247167e+23

In [59]:
r2_score(y_train, y_train_predict)

-2.6755865759247167e+23

#### alpha - 0.1

In [76]:
rmse = np.sqrt(MSE(y_train, y_train_predict))
rmse

3.459216294821604e+30

In [77]:
mse = MSE(y_train, y_train_predict)
mse

1.1966177374359305e+61

In [78]:
r2_score(y_train, y_train_predict)

-3.5376433531781697e+58

#### Using lasso with grid search

In [45]:
X_train_10, X_validate_10, y_train_10, y_validate_10 = train_test_split(X, Y, test_size = 0.9, random_state = 1)

In [46]:
from sklearn.pipeline import Pipeline

def PolynomialRegression(degree = 2, alpha = 1):
    return Pipeline([('polyfeatures', PolynomialFeatures(degree)), 
                    ('lasso', Lasso(alpha))])

In [47]:
param_grid = {
    'polyfeatures__degree': np.arange(1, 6),
    'lasso__alpha': np.logspace(-3, 2, num = 6)
}

In [49]:
grid = GridSearchCV(PolynomialRegression(), param_grid, n_jobs = -1)

In [None]:
grid.fit(X_train_10, y_train_10)

### KNN

In [109]:
#import required packages
from sklearn import neighbors

In [110]:
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)

In [112]:
knn = neighbors.KNeighborsRegressor(n_neighbors = 50)

knn.fit(X_train_scaled, y_train) 
y_pred = knn.predict(X_train_scaled)

In [114]:
rmse = np.sqrt(MSE(y_train, y_pred))
rmse

70.68708925270451

In [115]:
mse = MSE(y_train, y_pred)
mse

4996.664587019813

In [116]:
r2_score(y_train, y_pred)

0.12143018327415633

In [117]:
X_vali_scaled = scalar.fit_transform(X_validate)

In [123]:
y_vali_pred = knn.predict(X_vali_scaled)

In [120]:
rmse = np.sqrt(MSE(y_validate, y_vali_pred))
rmse

72.5639252360339

In [121]:
mse = MSE(y_validate, y_vali_pred)
mse

5265.523245660716

In [122]:
r2_score(y_validate, y_vali_pred)

0.08502069329911321

### SVM

In [None]:
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline

#### Linear 

In [None]:
regr = make_pipeline(StandardScaler(),
                     LinearSVR(dual = False))
regr.fit(X_train, y_train)
y_pred = regr.predict(X_train)
y_pred_vali = regr.predict(X_validate)

In [None]:
rmse = np.sqrt(MSE(y_train, y_pred))
rmse

In [None]:
mse = MSE(y_train, y_pred)
mse

In [None]:
r2_score(y_train, y_pred)

In [None]:
rmse = np.sqrt(MSE(y_validate, y_pred_vali))
rmse

In [None]:
mse = MSE(y_validate, y_pred_vali)
mse

In [None]:
r2_score(y_validate, y_pred_vali)

#### Non - Linear

In [None]:
from sklearn.svm import SVR

In [None]:
regressor = make_pipeline(StandardScaler(), SVR(kernel = 'rbf'))
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
y_pred_vali = regressor.predict(X_validate)

In [None]:
rmse = np.sqrt(MSE(y_train, y_pred))
rmse

In [None]:
mse = MSE(y_train, y_pred)
mse

In [None]:
r2_score(y_train, y_pred)

In [None]:
rmse = np.sqrt(MSE(y_validate, y_pred_vali))
rmse

In [None]:
mse = MSE(y_validate, y_pred_vali)
mse

In [None]:
r2_score(y_validate, y_pred_vali)