# Multiple Linear Regression Lab

## Import the relevant libraries

In [120]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import f_regression
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from itertools import chain, combinations

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [93]:
conda install tqdm

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/Bryan/opt/anaconda3

  added / updated specs:
    - tqdm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.8.3                |           py37_0         2.8 MB
    ------------------------------------------------------------
                                           Total:         2.8 MB

The following packages will be UPDATED:

  conda                                        4.8.2-py37_0 --> 4.8.3-py37_0



Downloading and Extracting Packages
conda-4.8.3          | 2.8 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [121]:
%%capture

from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange
tqdm().pandas()

## Load the data

In [122]:
df = pd.read_csv('kc_house_data_train.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,2591820310,20141006T000000,365000.0,4,2.25,2070,8893,2.0,0,...,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,1,7974200820,20140821T000000,865000.0,5,3.0,2900,6730,1.0,0,...,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,2,7701450110,20140815T000000,1038000.0,4,2.5,3770,10893,2.0,0,...,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,3,9522300010,20150331T000000,1490000.0,3,3.5,4560,14608,2.0,0,...,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,4,9510861140,20140714T000000,711000.0,3,2.5,2550,5376,2.0,0,...,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050
5,5,1761300310,20140827T000000,211000.0,4,2.0,1710,8288,1.5,0,...,7,1710,0,1970,0,98031,47.3947,-122.174,1710,7200
6,6,7732410120,20140819T000000,790000.0,4,2.5,2690,8036,2.0,0,...,9,2690,0,1987,0,98007,47.6596,-122.144,2420,8087
7,7,7010701383,20141017T000000,680000.0,3,2.5,1800,4400,1.0,0,...,7,1350,450,1970,0,98199,47.6599,-122.396,1920,4400
8,8,291310170,20140804T000000,384500.0,3,2.5,1600,2610,2.0,0,...,8,1600,0,2005,0,98027,47.5344,-122.068,1445,1288
9,9,4232901990,20140516T000000,605000.0,2,1.0,910,3600,1.0,0,...,7,910,0,1909,0,98119,47.6341,-122.361,1720,3600


In [123]:
df.shape

(17290, 22)

In [124]:
df.columns

Index(['Unnamed: 0', 'id', 'date', 'price', 'bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition',
       'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

## List all combinations of features

In [125]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view' , 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [126]:
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))

In [127]:
feature_list = list(powerset(features))
feature_list

[(),
 ('bedrooms',),
 ('bathrooms',),
 ('sqft_living',),
 ('sqft_lot',),
 ('floors',),
 ('waterfront',),
 ('view',),
 ('condition',),
 ('grade',),
 ('sqft_above',),
 ('sqft_basement',),
 ('yr_built',),
 ('yr_renovated',),
 ('zipcode',),
 ('lat',),
 ('long',),
 ('sqft_living15',),
 ('sqft_lot15',),
 ('bedrooms', 'bathrooms'),
 ('bedrooms', 'sqft_living'),
 ('bedrooms', 'sqft_lot'),
 ('bedrooms', 'floors'),
 ('bedrooms', 'waterfront'),
 ('bedrooms', 'view'),
 ('bedrooms', 'condition'),
 ('bedrooms', 'grade'),
 ('bedrooms', 'sqft_above'),
 ('bedrooms', 'sqft_basement'),
 ('bedrooms', 'yr_built'),
 ('bedrooms', 'yr_renovated'),
 ('bedrooms', 'zipcode'),
 ('bedrooms', 'lat'),
 ('bedrooms', 'long'),
 ('bedrooms', 'sqft_living15'),
 ('bedrooms', 'sqft_lot15'),
 ('bathrooms', 'sqft_living'),
 ('bathrooms', 'sqft_lot'),
 ('bathrooms', 'floors'),
 ('bathrooms', 'waterfront'),
 ('bathrooms', 'view'),
 ('bathrooms', 'condition'),
 ('bathrooms', 'grade'),
 ('bathrooms', 'sqft_above'),
 ('bathrooms'

In [128]:
feature_list.pop(0)

()

In [129]:
len(feature_list)

262143

In [130]:
feature_list = [list(elem) for elem in feature_list]
feature_list

[['bedrooms'],
 ['bathrooms'],
 ['sqft_living'],
 ['sqft_lot'],
 ['floors'],
 ['waterfront'],
 ['view'],
 ['condition'],
 ['grade'],
 ['sqft_above'],
 ['sqft_basement'],
 ['yr_built'],
 ['yr_renovated'],
 ['zipcode'],
 ['lat'],
 ['long'],
 ['sqft_living15'],
 ['sqft_lot15'],
 ['bedrooms', 'bathrooms'],
 ['bedrooms', 'sqft_living'],
 ['bedrooms', 'sqft_lot'],
 ['bedrooms', 'floors'],
 ['bedrooms', 'waterfront'],
 ['bedrooms', 'view'],
 ['bedrooms', 'condition'],
 ['bedrooms', 'grade'],
 ['bedrooms', 'sqft_above'],
 ['bedrooms', 'sqft_basement'],
 ['bedrooms', 'yr_built'],
 ['bedrooms', 'yr_renovated'],
 ['bedrooms', 'zipcode'],
 ['bedrooms', 'lat'],
 ['bedrooms', 'long'],
 ['bedrooms', 'sqft_living15'],
 ['bedrooms', 'sqft_lot15'],
 ['bathrooms', 'sqft_living'],
 ['bathrooms', 'sqft_lot'],
 ['bathrooms', 'floors'],
 ['bathrooms', 'waterfront'],
 ['bathrooms', 'view'],
 ['bathrooms', 'condition'],
 ['bathrooms', 'grade'],
 ['bathrooms', 'sqft_above'],
 ['bathrooms', 'sqft_basement'],
 ['

## Declare the dependent and independent variables

In [131]:
summary_df = pd.DataFrame(columns=['Model', 'Description','Root Mean Squared Error (RMSE)', 'R-squared (training)', 'Adjusted R-squared (training)','R-squared (test)', 'Adjusted R-squared (test)'])

target = df['price']
i = 0
for feature in tqdm(feature_list):
    features = df[feature]

    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42,test_size=0.2)
    
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    
    r2_train = reg.score(X_train, y_train)
    
    n = X_train.shape[0]
    p = X_train.shape[1]

    adjusted_r2_train = 1-(1-r2_train)*(n-1)/(n-p-1)
    
    y_train_pred = reg.predict(X_train)
    
    train_mae = metrics.mean_absolute_error(y_train, y_train_pred)
    train_mse = metrics.mean_squared_error(y_train, y_train_pred)
    train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
    
    r2_test = reg.score(X_test, y_test)
    n = X_test.shape[0]
    p = X_test.shape[1]

    adjusted_r2_test = 1-(1-r2_test)*(n-1)/(n-p-1)
    
    y_test_pred = reg.predict(X_test)
    
    test_mae = metrics.mean_absolute_error(y_test, y_test_pred)
    test_mse = metrics.mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    
    summary_df = summary_df.append({'Model' : i, 'Description' : feature , 'Root Mean Squared Error (RMSE)' : train_rmse, 'R-squared (training)' : r2_train, 'Adjusted R-squared (training)': adjusted_r2_train, 'R-squared (test)' : r2_test, 'Adjusted R-squared (test)' : adjusted_r2_test} , ignore_index=True)
    i += 1

HBox(children=(IntProgress(value=0, max=262143), HTML(value='')))




In [132]:
summary_df

Unnamed: 0,Model,Description,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test)
0,0,[bedrooms],357141.084823,0.094856,0.094791,0.085027,0.084763
1,1,[bathrooms],319415.269058,0.275982,0.275930,0.270027,0.269816
2,2,[sqft_living],266555.890933,0.495787,0.495750,0.491225,0.491078
3,3,[sqft_lot],374017.940805,0.007289,0.007217,0.007894,0.007607
4,4,[floors],362744.015518,0.066233,0.066166,0.064638,0.064367
...,...,...,...,...,...,...,...
262138,262138,"[bedrooms, bathrooms, sqft_living, floors, wat...",206664.215624,0.696912,0.696539,0.702825,0.701357
262139,262139,"[bedrooms, bathrooms, sqft_lot, floors, waterf...",206631.857848,0.697007,0.696634,0.702779,0.701310
262140,262140,"[bedrooms, sqft_living, sqft_lot, floors, wate...",207378.434214,0.694814,0.694438,0.700696,0.699217
262141,262141,"[bathrooms, sqft_living, sqft_lot, floors, wat...",208294.351277,0.692112,0.691733,0.696691,0.695192


In [135]:
summary_df.sort_values(['Root Mean Squared Error (RMSE)'])

Unnamed: 0,Model,Description,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test)
262142,262142,"[bedrooms, bathrooms, sqft_living, sqft_lot, f...",206631.857848,0.697007,0.696612,0.702779,0.701223
262132,262132,"[bedrooms, bathrooms, sqft_living, sqft_lot, f...",206631.857848,0.697007,0.696634,0.702779,0.701310
262131,262131,"[bedrooms, bathrooms, sqft_living, sqft_lot, f...",206631.857848,0.697007,0.696634,0.702779,0.701310
262139,262139,"[bedrooms, bathrooms, sqft_lot, floors, waterf...",206631.857848,0.697007,0.696634,0.702779,0.701310
262137,262137,"[bedrooms, bathrooms, sqft_living, sqft_lot, w...",206641.834824,0.696978,0.696605,0.702732,0.701263
...,...,...,...,...,...,...,...
162,162,"[zipcode, long]",374830.981371,0.002969,0.002824,-0.000293,-0.000872
13,13,[zipcode],374850.323805,0.002866,0.002794,-0.000380,-0.000670
123,123,"[condition, long]",374887.847893,0.002666,0.002522,-0.002737,-0.003317
7,7,[condition],375026.389268,0.001929,0.001857,-0.002376,-0.002667


In [133]:
summary_df.to_csv (r'multi_linear_reg_summary.csv', index = False, header=True)