In [None]:
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
import plotly.express as px

# Seed for the train/test split so the metrics reported below are reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Load the dataset
df = pd.read_csv('AB_NYC_2019.csv')

# Cast the categorical columns to explicit categorical dtypes.
# `neighbourhood` takes its categories from the values present in the data;
# the other two use a fixed, hand-written category list.
neighbourhood_c = pd.CategoricalDtype(categories=list(df['neighbourhood'].unique()))
df["neighbourhood"] = df["neighbourhood"].astype(neighbourhood_c)

neighbourhood_group_c = pd.CategoricalDtype(categories=['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'])
df["neighbourhood_group"] = df["neighbourhood_group"].astype(neighbourhood_group_c)

room_type_c = pd.CategoricalDtype(categories=['Private room', 'Entire home/apt', 'Shared room'])
df["room_type"] = df["room_type"].astype(room_type_c)

# Drop the unimportant features (identifiers, free text and review-date columns)
df = df.drop(columns=['name', 'id', 'host_name', 'host_id', 'last_review', 'reviews_per_month'])

# Remove the extreme values: keep only rows strictly below mean + 3*std.
# Only the upper tail is trimmed, and the filter is applied one feature at a
# time, so each threshold is computed on the rows that survived the previous
# features.
outlier_features = ['price', 'number_of_reviews', 'calculated_host_listings_count',
                    'availability_365', 'minimum_nights', 'longitude', 'latitude']
for feature in outlier_features:
  df = df[df[feature] < (3*df[feature].std() + df[feature].mean())]

# One-hot encode the categorical variables the models will use
df_final = pd.get_dummies(df, columns=["room_type", "neighbourhood_group"], prefix='is')

numeric_features = ['latitude', 'longitude', 'number_of_reviews', 'minimum_nights','calculated_host_listings_count','availability_365']

# Everything except the target and the non-encoded columns is a model input.
# Built in DataFrame column order (not via a set) so the feature order -- and
# therefore the order of the fitted coefficients -- is the same on every run.
excluded_columns = {'price', 'host_id', 'neighbourhood', 'neighbourhood_group'}
train_features = [col for col in df_final.columns if col not in excluded_columns]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(df_final, test_size=0.2, random_state=RANDOM_STATE)

X_train = df_train[train_features]
y_train = df_train['price']

X_test = df_test[train_features]
y_test = df_test['price']

**Implementing a linear surface (ordinary least-squares linear regression) for our dataset.**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from scipy import stats

# Ordinary least-squares baseline: fit on the training split and predict the
# held-out test split.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
yhat_test = linreg.predict(X_test)

# Compute every test-set metric up front so each is evaluated exactly once.
pearson_coef, p_value = stats.pearsonr(yhat_test, y_test)
test_mae = metrics.mean_absolute_error(y_test, yhat_test)
test_mse = metrics.mean_squared_error(y_test, yhat_test)
test_rmse = np.sqrt(test_mse)
test_r2 = metrics.r2_score(y_test, yhat_test)

# Report, in the same order and wording as the other model cells.
print('Pearson Correlation: {}, p_value {}'.format(pearson_coef, p_value))
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', test_rmse)
print('Coefficient of determination: {}'.format(test_r2))

Pearson Correlation: 0.597305987650946, p_value 0.0
Mean Absolute Error: 53.61240130258792
Mean Squared Error: 7619.490834394869
Root Mean Squared Error: 87.28969489232317
Coefficient of determination: 0.356558828840138


**Using scikit-learn’s `sklearn.linear_model.Ridge` to implement linear least squares with L2 regularization for the dataset, using the default parameters.**

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with scikit-learn's default parameters: fit on the training
# split and predict the held-out test split.
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)
yhat_test = ridge_model.predict(X_test)

# Compute every test-set metric up front so each is evaluated exactly once.
pearson_coef, p_value = stats.pearsonr(yhat_test, y_test)
test_mae = metrics.mean_absolute_error(y_test, yhat_test)
test_mse = metrics.mean_squared_error(y_test, yhat_test)
test_rmse = np.sqrt(test_mse)
test_r2 = metrics.r2_score(y_test, yhat_test)

# Report, in the same order and wording as the other model cells.
print('Pearson Correlation: {}, p_value {}'.format(pearson_coef, p_value))
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', test_rmse)
print('Coefficient of determination: {}'.format(test_r2))

Pearson Correlation: 0.5972238607983219, p_value 0.0
Mean Absolute Error: 53.58696009538456
Mean Squared Error: 7621.040141560634
Root Mean Squared Error: 87.2985689548267
Coefficient of determination: 0.3564279948987509


**Using scikit-learn’s `sklearn.linear_model.Ridge` to implement linear least squares with L2 regularization for the dataset, tweaking the value of alpha and reporting the findings.**


In [None]:
# Sweep the Ridge regularization strength over alpha = 0.1, 0.2, ..., 1.0 and
# report the test-set RMSE and R^2 for each value.
#
# alpha = 0 is deliberately not part of the sweep: it is mathematically plain
# least squares, and scikit-learn advises against using Ridge with alpha = 0
# for numerical reasons (use LinearRegression for the unregularized fit).
for a in range(1, 11):
  alpha = a / 10
  ridge_model = Ridge(alpha=alpha)
  ridge_model.fit(X_train, y_train)

  yhat_test = ridge_model.predict(X_test)
  print('Ridge alpha: {}'.format(alpha))
  print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
  print('Coefficient of determination: {}'.format(metrics.r2_score(y_test, yhat_test)))


Ridge alpha: 0.0
Root Mean Squared Error: 87.61602276414047
Coefficient of determination: 0.35173889507082023
Ridge alpha: 0.1
Root Mean Squared Error: 87.2904200129717
Coefficient of determination: 0.3565481385881485
Ridge alpha: 0.2
Root Mean Squared Error: 87.2911848923749
Coefficient of determination: 0.35653686208765833
Ridge alpha: 0.3
Root Mean Squared Error: 87.29198805503555
Coefficient of determination: 0.3565250210765857
Ridge alpha: 0.4
Root Mean Squared Error: 87.29282807926528
Coefficient of determination: 0.35651263649957254
Ridge alpha: 0.5
Root Mean Squared Error: 87.29370359500867
Coefficient of determination: 0.3564997285401199
Ridge alpha: 0.6
Root Mean Squared Error: 87.29461328176725
Coefficient of determination: 0.35648631665124075
Ridge alpha: 0.7
Root Mean Squared Error: 87.29555586661543
Coefficient of determination: 0.3564724195847554
Ridge alpha: 0.8
Root Mean Squared Error: 87.29653012230517
Coefficient of determination: 0.3564580554192702
Ridge alpha: 0.9


**Using scikit-learn’s `sklearn.linear_model.Lasso` to implement linear least squares with L1 regularization for the dataset, using the default parameters.**


In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with scikit-learn's default parameters: fit on the training
# split and predict the held-out test split.
lasso_model = Lasso()
lasso_model.fit(X_train, y_train)
yhat_test = lasso_model.predict(X_test)

# Compute every test-set metric up front so each is evaluated exactly once.
pearson_coef, p_value = stats.pearsonr(yhat_test, y_test)
test_mae = metrics.mean_absolute_error(y_test, yhat_test)
test_mse = metrics.mean_squared_error(y_test, yhat_test)
test_rmse = np.sqrt(test_mse)
test_r2 = metrics.r2_score(y_test, yhat_test)

# Report, in the same order and wording as the other model cells.
print('Pearson Correlation: {}, p_value {}'.format(pearson_coef, p_value))
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', test_rmse)
print('Coefficient of determination: {}'.format(test_r2))

Pearson Correlation: 0.5785052437855621, p_value 0.0
Mean Absolute Error: 54.454598877219965
Mean Squared Error: 7896.822471468587
Root Mean Squared Error: 88.8640673808519
Coefficient of determination: 0.33313907583605595


**Using scikit-learn’s `sklearn.linear_model.Lasso` to implement linear least squares with L1 regularization for the dataset, tweaking the value of alpha and reporting the findings.**

In [None]:
# Sweep the Lasso regularization strength over alpha = 0.1, 0.2, ..., 1.0 and
# report the test-set RMSE and R^2 for each value.
for a in range(1, 11):
  alpha = a / 10
  lasso_model = Lasso(alpha=alpha)
  lasso_model.fit(X_train, y_train)

  yhat_test = lasso_model.predict(X_test)
  # The label names the model actually being fitted (it previously said "Ridge").
  print('Lasso alpha: {}'.format(alpha))
  print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
  print('Coefficient of determination: {}'.format(metrics.r2_score(y_test, yhat_test)))

Ridge alpha: 0.1
Root Mean Squared Error: 87.44874131432643
Coefficient of determination: 0.3542119252821455
Ridge alpha: 0.2
Root Mean Squared Error: 87.80025644141826
Coefficient of determination: 0.3490097802325979
Ridge alpha: 0.3
Root Mean Squared Error: 88.3213241567805
Coefficient of determination: 0.34125999556957254
Ridge alpha: 0.4
Root Mean Squared Error: 88.64141071624945
Coefficient of determination: 0.3364766456192123
Ridge alpha: 0.5
Root Mean Squared Error: 88.67690539407303
Coefficient of determination: 0.33594514994750213
Ridge alpha: 0.6
Root Mean Squared Error: 88.72647012596595
Coefficient of determination: 0.33520261391090966
Ridge alpha: 0.7
Root Mean Squared Error: 88.77752078123001
Coefficient of determination: 0.3344373832936709
Ridge alpha: 0.8
Root Mean Squared Error: 88.81778822747849
Coefficient of determination: 0.33383347850550427
Ridge alpha: 0.9
Root Mean Squared Error: 88.84744891808529
Coefficient of determination: 0.33338847173980635
Ridge alpha: 1.

Comparison table of the performance of all of the ML models using the different losses.


In [None]:
from sklearn import metrics

# Comparison table: fit each model on the training split and print one row of
# test-set metrics (MAE, MSE, RMSE, R^2) per model.
linreg = LinearRegression()
ridge_model = Ridge(alpha=0.1)
lasso_model = Lasso(alpha=0.1)

# (row label, estimator) pairs, in the order the rows are printed.
models = [
  ("sklearn.linear_model.LinearRegression", linreg),
  ("sklearn.linear_model.Ridge", ridge_model),
  ("sklearn.linear_model.Lasso", lasso_model),
]

# Header and rows share the same column widths (38 / 9 / 11 / 11 / 10 chars).
print(f"{'Algorithm':<38}{'|MAE':<9}{'|MSE':<11}{'|RMSE':<11}|R2 SCORE|")
print("-"*77)

for label, model in models:
  model.fit(X_train, y_train)
  yhat_test = model.predict(X_test)

  mae = metrics.mean_absolute_error(y_test, yhat_test)
  mse = metrics.mean_squared_error(y_test, yhat_test)
  rmse = np.sqrt(mse)  # derived from the unrounded MSE
  r2 = metrics.r2_score(y_test, yhat_test)

  # Fixed-precision format specs keep trailing zeros, so the columns stay
  # aligned even when a value rounds to fewer digits (e.g. 53.610).
  print(f"{label:<38}| {mae:.3f} | {mse:.3f} | {rmse:.4f}  |  {r2:.3f} |")

Algorithm                             |MAE     |MSE       |RMSE      |R2 SCORE|
-----------------------------------------------------------------------------
sklearn.linear_model.LinearRegression | 53.612 | 7619.491 | 87.2897  |  0.357 |
sklearn.linear_model.Ridge            | 53.609 | 7619.617 | 87.2904  |  0.357 |
sklearn.linear_model.Lasso            | 53.521 | 7647.282 | 87.4487  |  0.354 |
