In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.preprocessing import PolynomialFeatures,StandardScaler

from sklearn.linear_model import LinearRegression

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them to predict the expected sale value of properties in their State and you will get slice of their additional sales commission 💸

# Data Understanding

In [2]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = data['target']
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Data preparation

### Split your X data in train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
print(len(X_train), len(X_test), len(y_train), len(y_test))

16512 4128 16512 4128


### Split your train data in train and validation datasets

In [6]:
X_train_v, X_val, y_train_v, y_val = train_test_split(X_train,y_train,test_size=.2, random_state=123)

In [7]:
scaler = StandardScaler()

In [8]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()

In [9]:
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

In [11]:
train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

In [12]:
from sklearn.metrics import mean_squared_error
test_residuals = y_hat_test - y_test

test_mse = mean_squared_error(y_test, y_hat_test)
test_mse

0.5558915986952438

In [13]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

cv_5_results = cross_val_score(linreg, X, y, cv=5, scoring='neg_mean_squared_error')

In [14]:
cv_5_results


array([-0.48485857, -0.62249739, -0.64621047, -0.5431996 , -0.49468484])

### Scale the 3 datasets using StandardScaler

In [15]:
poly = PolynomialFeatures(3)

In [16]:

poly.fit(X);

In [17]:
X_poly = poly.transform(X)

In [18]:
X_poly


array([[ 1.00000000e+00,  8.32520000e+00,  4.10000000e+01, ...,
        -1.75387143e+05,  5.65933749e+05, -1.82613733e+06],
       [ 1.00000000e+00,  8.30140000e+00,  2.10000000e+01, ...,
        -1.75187655e+05,  5.65542397e+05, -1.82568917e+06],
       [ 1.00000000e+00,  7.25740000e+00,  5.20000000e+01, ...,
        -1.75123774e+05,  5.65578076e+05, -1.82658558e+06],
       ...,
       [ 1.00000000e+00,  1.70000000e+00,  1.70000000e+01, ...,
        -1.88463752e+05,  5.79395792e+05, -1.78124164e+06],
       [ 1.00000000e+00,  1.86720000e+00,  1.80000000e+01, ...,
        -1.88619225e+05,  5.80352127e+05, -1.78565356e+06],
       [ 1.00000000e+00,  2.38860000e+00,  1.60000000e+01, ...,
        -1.87921624e+05,  5.78705047e+05, -1.78212344e+06]])

In [19]:
X_poly = pd.DataFrame(X_poly)
X_poly.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,155,156,157,158,159,160,161,162,163,164
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,16.689986,247.389136,-798.267531,3666.952356,-11832.407244,38180.441856,54353.799872,-175387.142512,565933.749452,-1826137.0
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,9.391819,168.531236,-544.054085,3024.204235,-9762.76391,31516.24419,54267.751656,-175187.654712,565542.397224,-1825689.0
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,22.005195,297.223199,-959.909216,4014.580565,-12965.451209,41873.097907,54224.761625,-175123.7744,565578.07616,-1826586.0
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,16.541323,245.723138,-793.650028,3650.24363,-11789.756507,38079.200342,54224.761625,-175138.100625,565670.615625,-1827034.0
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,10.381164,180.120545,-581.763186,3125.218967,-10094.003137,32602.163369,54224.761625,-175138.100625,565670.615625,-1827034.0


In [20]:
poly3 = PolynomialFeatures(3)
X_poly3 = poly3.fit_transform(X)
X_poly3 = pd.DataFrame(X_poly3,columns = poly3.get_feature_names(X.columns))
X_poly3.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup^3,AveOccup^2 Latitude,AveOccup^2 Longitude,AveOccup Latitude^2,AveOccup Latitude Longitude,AveOccup Longitude^2,Latitude^3,Latitude^2 Longitude,Latitude Longitude^2,Longitude^3
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,16.689986,247.389136,-798.267531,3666.952356,-11832.407244,38180.441856,54353.799872,-175387.142512,565933.749452,-1826137.0
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,9.391819,168.531236,-544.054085,3024.204235,-9762.76391,31516.24419,54267.751656,-175187.654712,565542.397224,-1825689.0
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,22.005195,297.223199,-959.909216,4014.580565,-12965.451209,41873.097907,54224.761625,-175123.7744,565578.07616,-1826586.0
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,16.541323,245.723138,-793.650028,3650.24363,-11789.756507,38079.200342,54224.761625,-175138.100625,565670.615625,-1827034.0
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,10.381164,180.120545,-581.763186,3125.218967,-10094.003137,32602.163369,54224.761625,-175138.100625,565670.615625,-1827034.0


In [21]:
from sklearn.preprocessing import StandardScaler 

scaler.fit(X_train_v)

X_train_v = scaler.transform(X_train_v)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [22]:
X_train_v

array([[-1.34353989,  1.69153665, -0.6084429 , ..., -0.05072367,
         0.77174082, -0.45249926],
       [ 1.19224495,  0.26395304,  0.41319107, ..., -0.05081309,
         1.06150452, -1.45397857],
       [-0.7730445 ,  0.97774485, -0.38384179, ..., -0.03682194,
         1.02878926, -1.34436392],
       ...,
       [ 0.85396256, -1.08432037,  1.08314896, ..., -0.0589126 ,
         1.58494863, -0.73650086],
       [ 1.89107958,  0.50188364,  0.37537012, ..., -0.06986339,
        -0.82663316,  0.58883993],
       [ 1.1573764 ,  0.42257344,  0.39381752, ..., -0.01184927,
        -0.65838327,  0.54399757]])

# Modelling and Model Evaluation

### Train a linear regression model

In [29]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [30]:
lin_reg.fit(X_train_v,y_train_v);

In [31]:
lin_reg.score(X_train_v,y_train_v)

0.6141494360849022

In [32]:

lin_reg.score(X_val,y_val)

0.6054381835441691

### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [39]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

print("Mean Absolute Error:", mean_absolute_error(y_test, y_hat_test))

print("Mean squared error:", mean_squared_error(y_test, y_hat_test))

print("r squared score:", r2_score(y_test, y_hat_test))


Mean Absolute Error: 0.5332001304956564
Mean squared error: 0.5558915986952438
r squared score: 0.5757877060324512


### Train a LASSO model

In [33]:
from sklearn.linear_model import Lasso

lasso = Lasso(.01)
lasso.fit(X_train_v,y_train_v)
lasso.score(X_train_v,y_train_v)

0.6099921680220002

### Measure the R-squared, MSE and MAE of your model

In [41]:
lasso_ypred = lasso.predict(X_train_v)
print("Lasso Mean Absolute Error:", mean_absolute_error(y_test, lasso_ypred))

print("Lasso Mean squared error:", mean_squared_error(y_test, lasso_ypred))

print("Lasso r squared score:", r2_score(y_test, lasso_ypred))


ValueError: Found input variables with inconsistent numbers of samples: [4128, 13209]

# Interprete your winning model

### What can you tell your business partner by looking at the coefficients?

In [27]:
# 
# 
# 