In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() #overwrite matplotlib charts with sns

from sklearn.linear_model import LinearRegression

In [2]:
# Load the dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv('1.02. Multiple linear regression.csv')
# Number of non-null values in the first column.
# `data.count()` is a Series indexed by column name, so the first entry is
# taken with explicit positional access (`.iloc[0]`) instead of a bare `[0]`,
# which depends on the deprecated integer-key-as-position fallback.
data.count().iloc[0]

84

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


In [16]:
# Predictors: the SAT score and the 'Rand 1,2,3' column. Response: GPA.
x = data.loc[:, ['SAT', 'Rand 1,2,3']]
y = data.loc[:, 'GPA']

## multiple linear regression

In [5]:
# Fit a linear regression of y on both columns of x; `fit` returns the
# estimator itself, so displaying `reg` shows the same repr.
reg = LinearRegression().fit(x, y)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
# Fitted coefficients, one per column of x (in x's column order).
reg.coef_

array([ 0.00165354, -0.00826982])

In [7]:
# Fitted intercept (constant term) of the model.
reg.intercept_

0.29603261264909486

In [8]:
# R^2 (coefficient of determination) of the model on the data it was fitted to.
reg.score(x,y)

0.4066811952814285

### Adjusted R^2 formula
$R^2_{adj.} = 1 - (1-R^2)\times\frac{n-1}{n-p-1}$

In [9]:
# Adjusted R^2: n is the number of observations (rows of x) and p the number
# of predictors (columns of x).
n, p = x.shape
r_squared = reg.score(x, y)
r_squared_adj = 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))
r_squared_adj

0.39203134825134023

### feature selection

In [10]:
from sklearn.feature_selection import f_regression
# Returns a tuple of two arrays: (F-statistics, p-values), each with one entry
# per column of x. These are univariate tests: every feature is tested against
# y on its own, not jointly within the multiple regression fitted above.
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [11]:
# Keep only the p-values (second element of the tuple) and show them rounded
# to three decimals; `p_values` itself stays unrounded.
p_values = f_regression(x, y)[1]
np.round(p_values, 3)

array([0.   , 0.676])

### creating summary table

In [12]:
# Start the summary table with one row per predictor name.
reg_summary = pd.DataFrame(x.columns.to_numpy(), columns=['features'])
reg_summary

Unnamed: 0,features
0,SAT
1,"Rand 1,2,3"


In [15]:
# Attach each feature's coefficient and its rounded p-value to the table.
# 'p-values' is not a valid keyword name, so it is passed through a dict.
reg_summary = reg_summary.assign(
    coefficient=reg.coef_,
    **{'p-values': p_values.round(3)},
)
reg_summary

Unnamed: 0,features,coefficient,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


## Standardization - makes the coefficients of different features comparable

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
# Create an unfitted scaler with default settings.
scaler = StandardScaler()

In [21]:
# Learn the per-column mean and standard deviation of x; nothing is transformed yet.
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [22]:
x_scaled = scaler.transform(x) # for each feature, subtract the mean and divide by the standard deviation

In [25]:
# Peek at the first three standardized rows.
x_scaled[:3]

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974]])

In [26]:
# Refit the same model on the standardized predictors; `fit` returns the
# estimator itself, so displaying `reg_scaled` shows the same repr.
reg_scaled = LinearRegression().fit(x_scaled, y)
reg_scaled

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [30]:
# Collect the pieces of the standardized fit: weights, intercept,
# univariate p-values (rounded to 3 decimals) and R^2 on the fitted data.
coef_scaled, intercept_scaled = reg_scaled.coef_, reg_scaled.intercept_
p_values_scaled = np.round(f_regression(x_scaled, y)[1], 3)
score_scaled = reg_scaled.score(x_scaled, y)
score_scaled

0.4066811952814283

In [31]:
# Summary table for the standardized model: feature name, weight, p-value.
reg_summary_scaled = (
    pd.DataFrame(x.columns.to_numpy(), columns=['features'])
    .assign(coef=coef_scaled, p_values=p_values_scaled)
)
reg_summary_scaled

Unnamed: 0,features,coef,p_values
0,SAT,0.171814,0.0
1,"Rand 1,2,3",-0.00703,0.676


In [32]:
# Weights table for the standardized model: the intercept first, then the two
# feature coefficients in the order they were fitted.
# Note: this rebinds `reg_summary_scaled`, replacing the coefficient/p-value
# table built in the previous cell.
reg_summary_scaled = pd.DataFrame([['Intercept'], ['SAT'], ['Rand']], columns=['Features'])
reg_summary_scaled['weights'] = intercept_scaled, coef_scaled[0], coef_scaled[1]
reg_summary_scaled

Unnamed: 0,Features,weights
0,Intercep,3.330238
1,SAT,0.171814
2,Rand,-0.00703


In [33]:
# Two new observations given as (SAT, 'Rand 1,2,3') pairs, standardized with
# the mean/std the scaler learned from x.
# The frame reuses x's column labels: the scaler was fitted on a DataFrame, and
# recent scikit-learn releases check that the feature names passed to
# `transform` match those seen during `fit` (a mismatch such as 'Rand' vs
# 'Rand 1,2,3' is rejected).
new_data_scaled = scaler.transform(pd.DataFrame(data=[[1700, 2], [1640, 1]], columns=x.columns))
new_data_scaled


array([[-1.39811928, -0.07002087],
       [-1.97556099, -1.24637147]])

In [39]:
# Predictions of the standardized model for the two scaled observations.
reg_scaled.predict(new_data_scaled)

array([3.09051403, 2.99957134])

## remove rand 1,2,3

In [40]:
reg_simple = LinearRegression()
x_simple = x_scaled[:,0].reshape(-1,1)
reg_simple.fit(x_simple, y)
reg_simple.predict(new_data_scaled[:,0].reshape(-1,1))

array([3.08970998, 2.9903687 ])

### train test split

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
a = np.arange(1,101) # array with the values 1..100 (np.arange returns an ndarray; the built-in range returns a lazy range object, not a list)

In [43]:
# Second array with the values 501..600, same length as `a`.
b = np.arange(501,601)

In [50]:
# Shuffle `a` and hold out 20% of it as the test part.
# random_state pins the shuffle so that re-running the notebook yields the same
# split; pass shuffle=False instead to keep the original order.
a_train, a_test = train_test_split(a, test_size=0.2, random_state=42)

In [51]:
# Sizes of the two parts: with test_size=0.2, 20 of the 100 values go to the test set.
a_train.shape, a_test.shape

((80,), (20,))

In [52]:
# Split both arrays in one call: the same shuffled indices are applied to `a`
# and `b`, so corresponding elements stay paired across the train/test parts.
# random_state pins the shuffle so that re-running the notebook yields the same
# split; pass shuffle=False instead to keep the original order.
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=42)

In [53]:
# Show the training part of `b` to inspect the shuffled order.
b_train

array([592, 531, 543, 561, 508, 532, 523, 502, 552, 570, 534, 511, 596,
       586, 573, 553, 572, 591, 587, 517, 582, 515, 522, 581, 585, 540,
       550, 558, 537, 576, 595, 597, 514, 527, 598, 549, 589, 548, 557,
       556, 516, 555, 579, 529, 539, 575, 509, 560, 578, 562, 525, 535,
       519, 600, 513, 571, 512, 544, 565, 510, 524, 546, 542, 574, 563,
       583, 568, 566, 547, 505, 538, 518, 559, 530, 526, 584, 503, 521,
       545, 504])