In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [3]:
path_train = r"Data\Polynomial Features Train.csv"
path_test = r"Data\Polynomial Features Test.csv"

In [4]:
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)

In [5]:
X_train = train[['ExterQual', 'AllFlrsSF', 'GrLivArea', 'SimplOverallCond', 'GarageArea', 'TotRmsAbvGrd', 'LotFrontage']]
y_train = train['SalePrice']
X_test = test[['ExterQual', 'AllFlrsSF', 'GrLivArea', 'SimplOverallCond', 'GarageArea', 'TotRmsAbvGrd', 'LotFrontage']]
y_test = test['SalePrice']

In [6]:
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [7]:
X_train['GarageScore-2'] = X_train.GarageScore**2
X_train['GarageScore-3'] = X_train.GarageScore**3
X_test['GarageScore-2'] = X_test.GarageScore**2
X_test['GarageScore-3'] = X_test.GarageScore**3

AttributeError: 'DataFrame' object has no attribute 'GarageScore'

In [8]:
print(X_train.head())
print(X_test.head())

   ExterQual  AllFlrsSF  GrLivArea  SimplOverallCond  GarageArea  \
0  -0.680315   1.237289   1.228274         -0.351898   -0.125832   
1  -0.680315  -0.387131  -0.402470         -0.351898   -0.031478   
2  -0.680315   0.709286   0.698216         -0.351898   -2.206341   
3   2.551393   0.564660   0.553026         -0.351898    1.501777   
4   1.098222   0.930414   0.920204         -0.351898    0.945088   

   TotRmsAbvGrd  LotFrontage  
0      1.916946    -1.719611  
1     -0.241480     0.466850  
2      1.916946     0.676510  
3      0.396191     1.155734  
4      1.461798     0.856219  
   ExterQual  AllFlrsSF  GrLivArea  SimplOverallCond  GarageArea  \
0   1.098222  -0.141918  -0.156303         -0.351898    0.029853   
1   1.098222   0.709286   0.698216         -0.351898    1.105490   
2   1.098222   2.073298   2.067537         -0.351898    1.615002   
3   1.098222  -0.028399  -0.042342         -0.351898    0.468599   
4   2.551393   0.267408   0.254617         -0.351898    0.416705 

In [9]:
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
preds = model.predict(X_test)

In [11]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse

0.19319984236387952

In [12]:
new_X_train = X_train.iloc[:,:100]
new_X_test = X_test.iloc[:,:100]
model_2 = LinearRegression()
model_2.fit(new_X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
pred_2 = model_2.predict(new_X_test)
rmse_2 = np.sqrt(mean_squared_error(y_test,pred_2))
rmse_2

0.19319984236387952

In [14]:
lasso = Lasso()
lasso.fit(X_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
lasso_pred = lasso.predict(X_test)

In [16]:
lasso_rmse = mean_squared_error(y_test, lasso_pred)**0.5
lasso_rmse

0.3954951681887703

In [17]:
zero_features = lasso.coef_==0
zero_features = len(zero_features[lasso.coef_==0])
zero_features

7

In [18]:
ridge = Ridge()

In [19]:
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

In [20]:
ridge_rmse = round(np.sqrt(mean_squared_error(y_test, ridge_pred)),2)
print(ridge_rmse)

0.19


In [21]:
zero_features = ridge.coef_==0
zero_features = len(zero_features[lasso.coef_==0])
zero_features

7

# Holdout Method

In [22]:
import math
from sklearn.model_selection import train_test_split

In [23]:
train_path = "Data\Holdout Method Train.csv"
test_path = "Data\Holdout Method Test.csv"

In [24]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [25]:
y_train = train.SalePrice
X_train = train[train.columns[train.columns != 'SalePrice']]
y_test = test.SalePrice
X_test = test[test.columns[test.columns != 'SalePrice']]

In [26]:
print(train.shape)
print(test.shape)
print(X_train.shape)
print(y_train.shape)

(1019, 290)
(437, 290)
(1019, 289)
(1019,)


In [27]:
train_feat, test_feat, train_tar, test_tar = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [28]:
print(train_feat.shape)
print(test_feat.shape)
print(train_tar.shape)
print(test_tar.shape)

(764, 289)
(255, 289)
(764,)
(255,)


In [29]:
l1 = Lasso()
l2 = Ridge()

In [30]:
l1.fit(train_feat,train_tar)
l2.fit(train_feat,train_tar)

  positive)


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [31]:
pred_l1 = l1.predict(test_feat)
pred_l2 = l2.predict(test_feat)

In [32]:
rmse_l1 = np.sqrt(mean_squared_error(test_tar,pred_l1))
rmse_l2 = np.sqrt(mean_squared_error(test_tar,pred_l2))
print(rmse_l1)
print(rmse_l2)

30125.61981780325
27364.939271900137


In [33]:
selected_model = l1 if rmse_l1<rmse_l2 else l2

In [34]:
selected_model_pred = selected_model.predict(X_test)
rmse_selected_model_pred = round(np.sqrt(mean_squared_error(y_test, selected_model_pred)),2)
print(rmse_selected_model_pred)

26978.56


In [35]:
print(np.sqrt(mean_squared_error(y_test, selected_model_pred)))
print(round(np.sqrt(mean_squared_error(y_test, selected_model_pred)),2))
print(np.round(np.sqrt(mean_squared_error(y_test, selected_model_pred)),2))

26978.558998724795
26978.56
26978.56


# CrossValidation

In [36]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [37]:
scorer = make_scorer(mean_squared_error, greater_is_better = False)

In [38]:
L1 = Lasso()
L2 = Ridge()

In [39]:
rmse_L1 = np.sqrt(-np.mean(cross_val_score(L1, X_train, y_train, scoring=scorer, cv=10)))
rmse_L2 = np.sqrt(-np.mean(cross_val_score(L2, X_train, y_train, scoring=scorer, cv=10)))

In [40]:
Model = L1 if rmse_L1<rmse_L2 else L2

In [41]:
Model.fit(X_train, y_train)
Pred = Model.predict(X_test)

In [42]:
Error = np.round(np.sqrt(mean_squared_error(y_test, Pred)),2)
print(Error)

25798.3


# Hyperparamter Tuning

In [43]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [44]:
ridge_lambdas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
lasso_lambdas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]

In [45]:
ridge_model   = Ridge()
lasso_model  = Lasso()

In [46]:
ridge_grid = GridSearchCV(ridge_model, param_grid=dict(alpha=ridge_lambdas))
ridge_grid.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10,
                                   30, 60]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [47]:
lasso_grid = GridSearchCV(lasso_model, param_grid=dict(alpha=lasso_lambdas))
lasso_grid.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                                   0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [48]:
lasso_pred = lasso_grid.predict(X_test)
ridge_pred = ridge_grid.predict(X_test)

In [49]:
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))

In [50]:
best_model = "LASSO" if lasso_rmse<ridge_rmse else "RIDGE"
print(best_model)

RIDGE


# Assignment

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [52]:
path = r"Data\melbourne_housing.csv"

In [53]:
df = pd.read_csv(path)
df.shape

(6830, 16)

In [54]:
df.head()

Unnamed: 0,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Longtitude,Regionname,Propertycount
0,2,0,1035000,1,23,2.5,3067,1,0,156,79.0,1900,31,144.9934,2,4019
1,3,0,1465000,3,23,2.5,3067,2,0,134,150.0,1900,31,144.9944,2,4019
2,4,0,1600000,4,155,2.5,3067,1,2,120,142.0,2014,31,144.9941,2,4019
3,3,0,1876000,1,155,2.5,3067,2,0,245,210.0,1910,31,144.9993,2,4019
4,2,0,1636000,1,155,2.5,3067,1,2,256,107.0,1890,31,144.9954,2,4019


In [55]:
X = df[df.columns[df.columns != 'Price']]
y = df['Price']

In [56]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=6)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(4781, 15) (2049, 15) (4781,) (2049,)


In [57]:
corr = X_train.corr()
print(corr)

                  Rooms      Type    Method   SellerG  Distance  Postcode  \
Rooms          1.000000 -0.590817 -0.053525 -0.029506  0.289699  0.050753   
Type          -0.590817  1.000000  0.088308  0.026955 -0.242188  0.018737   
Method        -0.053525  0.088308  1.000000 -0.008889 -0.064479 -0.034679   
SellerG       -0.029506  0.026955 -0.008889  1.000000  0.024476 -0.001913   
Distance       0.289699 -0.242188 -0.064479  0.024476  1.000000  0.414110   
Postcode       0.050753  0.018737 -0.034679 -0.001913  0.414110  1.000000   
Bathroom       0.607276 -0.276890 -0.013801 -0.034939  0.114934  0.088003   
Car            0.420030 -0.278868 -0.019285 -0.000271  0.257264  0.024650   
Landsize       0.092030 -0.043433 -0.021601 -0.028470  0.063563  0.030973   
BuildingArea   0.595395 -0.386768 -0.048287 -0.042596  0.145823  0.062701   
YearBuilt     -0.056373  0.304479  0.021274 -0.000680  0.252163  0.027168   
CouncilArea   -0.207234  0.159736  0.018771 -0.002784 -0.280674 -0.059066   

In [58]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [59]:
r2 = r2_score(y_test, y_pred)
print(r2)

0.610875922874349


In [60]:
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)

In [61]:
r2_lasso = r2_score(y_test, lasso_pred)
print(r2_lasso)

0.6108776228380524


In [62]:
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

In [63]:
r2_ridge = r2_score(y_test, ridge_pred)
print(r2_ridge)

0.6112942316771472


In [64]:
regressor = LinearRegression()
score = cross_val_score(regressor, X=X_train, y=y_train, cv=10)
mean_score = np.mean(score)
print(mean_score)

0.6101198231670764


In [65]:
model = make_pipeline(PolynomialFeatures(2), LinearRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_poly = r2_score(y_test, y_pred)
print(r2_poly)

0.7137059670522334
