# Feature scaling with sklearn - Exercise

## Import the relevant libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler

## Load the data

In [4]:
# Read the dataset from a CSV file located relative to the working directory.
data = pd.read_csv('real_estate_price_size_year.csv')

In [5]:
# Display the loaded DataFrame to inspect its columns (price, size, year).
data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


## Create the regression

### Declare the dependent and the independent variables

In [9]:
# Dependent variable (target): the price column.
y = data["price"]
# Independent variables (features): size and year, kept together as a DataFrame.
x = data[["size", "year"]]

### Scale the inputs

In [10]:
# Fit the scaler on the features (fit returns the scaler itself), then
# apply the fitted transformation to obtain the standardized inputs.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

### Regression

In [11]:
# Fit an ordinary least squares model on the standardized features.
reg = LinearRegression().fit(x_scaled, y)
# Echo the fitted estimator so the cell still displays its configuration.
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Find the intercept

In [12]:
# Intercept of the model fitted on the standardized inputs.
reg.intercept_

292289.4701599997

### Find the coefficients

In [13]:
# Coefficients in the column order of x (size, year). The model was fitted on
# standardized inputs, so each value is the change in price per one standard
# deviation of the corresponding feature.
reg.coef_

array([67501.57614152, 13724.39708231])

### Calculate the R-squared

In [14]:
# R-squared of the fitted model, evaluated on the same data it was trained on.
reg.score(x_scaled,y)

0.7764803683276793

### Calculate the Adjusted R-squared

In [15]:
# Define a helper function that computes the adjusted R-squared
def adj_r2(x, y, model=None):
    """Return the adjusted R-squared of a fitted model evaluated on ``(x, y)``.

    The adjusted R-squared penalizes the plain R-squared for the number of
    predictors: ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where ``n`` is the
    number of observations and ``p`` the number of features.

    Parameters
    ----------
    x : 2-D array-like with a ``shape`` attribute
        Feature matrix; ``x.shape[0]`` is taken as ``n`` and ``x.shape[1]``
        as ``p``. It must be in the form the model was fitted on.
    y : array-like
        Target values, passed through to ``model.score``.
    model : object with a ``score(x, y)`` method, optional
        Fitted estimator to evaluate. When omitted, the notebook-level
        ``reg`` is used, which keeps existing ``adj_r2(x, y)`` calls working.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``, where the statistic is undefined.
    """
    if model is None:
        # Fall back to the regression fitted earlier in the notebook.
        model = reg

    n = x.shape[0]
    p = x.shape[1]
    dof = n - p - 1
    if dof <= 0:
        # Without this guard the formula divides by zero (dof == 0) or
        # silently returns a meaningless value (dof < 0).
        raise ValueError(
            "Adjusted R-squared is undefined when n - p - 1 <= 0 "
            "(n={}, p={}).".format(n, p)
        )

    r2 = model.score(x, y)
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / dof
    return adjusted_r2

In [16]:
# Adjusted R-squared on the standardized training inputs.
adj_r2(x_scaled,y)

0.77187171612825

### Making predictions

Find the predicted price of a 750 sq.ft. apartment from 2009.

In [20]:
# Build the new observation as a DataFrame that reuses the training column
# names, so the (size, year) order is explicit and the scaler receives input in
# the same form it was fitted on (recent scikit-learn versions warn when a
# scaler fitted on named columns is given unnamed data).
new_data = pd.DataFrame([[750, 2009]], columns=x.columns)
# Apply the already-fitted scaler; do NOT refit it on the new observation.
new_data_scaled = scaler.transform(new_data)

In [21]:
# Predict from the scaled observation, since the model was trained on scaled inputs.
reg.predict(new_data_scaled)

array([258330.34465995])

### Calculate the univariate p-values of the variables

In [22]:
# f_regression returns a tuple of two arrays with one entry per feature:
# the F-statistics first, then the corresponding p-values.
f_regression(x_scaled,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [23]:
# The second element of the f_regression result holds the univariate p-values.
p_values = f_regression(x_scaled,y)[1]
# Show them rounded to three decimals for readability.
p_values.round(3)

array([0.   , 0.357])

### Create a summary table with your findings

In [25]:
# Collect, per feature, its name, the coefficient from the model fitted on
# standardized inputs, and its univariate p-value rounded to three decimals.
reg_summary = pd.DataFrame(
    {
        "Features": x.columns.values,
        "Coefficients": reg.coef_,
        "p-values": p_values.round(3),
    }
)
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,67501.576142,0.0
1,year,13724.397082,0.357
