# Multiple Linear Regression - Scikit-learn

## Load Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from scipy import stats
import seaborn as sns
 
# Render floats in DataFrame output with thousands separators and one decimal place.
pd.set_option('display.float_format', '{:,.1f}'.format)


## Load and Verify Data

In [2]:
# Read the academic-performance dataset; the relative path resolves against
# the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer="data/academicperformance.csv")

In [3]:
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,GPA,FamilyIncome,Sleep,District,StudyTime,Grade
0,2.9,82461,6.5,4,47,77
1,3.7,61113,6.2,6,47,94
2,2.8,63632,6.2,5,39,69
3,2.0,66854,7.2,5,49,81
4,2.8,82721,5.5,5,49,78


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,GPA,FamilyIncome,Sleep,District,StudyTime,Grade
count,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0
mean,2.7,75282.6,6.0,5.0,39.1,69.2
std,0.8,15264.4,1.0,1.0,9.4,14.5
min,0.2,27952.0,2.5,1.0,7.0,19.0
25%,2.2,65073.0,5.3,4.0,32.0,60.0
50%,2.8,75040.0,6.0,5.0,39.0,70.0
75%,3.3,85492.0,6.7,6.0,46.0,79.0
max,4.0,128655.0,10.1,8.0,70.0,100.0


## Scikit-learn

#### Regression without Categorical Variable

In [5]:
# Predictors: the three numeric columns used in this first model (District is left out).
X = df.loc[:, ['GPA', 'Sleep', 'StudyTime']]
# Response: the grade to be predicted.
y = df.loc[:, 'Grade']

In [6]:
# Fit a linear regression on every row (no hold-out set at this stage).
mlr = (
    LinearRegression()
    .fit(X, y)
)

In [7]:
# Slopes (in the column order of X) followed by the intercept, one per line.
print(mlr.coef_, mlr.intercept_, sep='\n')

[9.0991645  7.20697182 1.057971  ]
-39.70982467889576


In [8]:
# In-sample predictions: the model is applied to the same rows it was fitted on.
y_predict = mlr.predict(X=X)

In [9]:
# Mean squared error, then R^2, of the predictions against the observed grades.
print(
    mean_squared_error(y_true=y, y_pred=y_predict),
    r2_score(y_true=y, y_pred=y_predict),
    sep='\n',
)

22.736315002179442
0.8910750996529091


#### Test and Train

In [10]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)
# Refit using the training portion only.
mlr = LinearRegression().fit(X=X_train, y=y_train)

In [11]:
# Predict the held-out rows, then report mean squared error and R^2 on them.
y_predict = mlr.predict(X=X_test)
print(
    mean_squared_error(y_true=y_test, y_pred=y_predict),
    r2_score(y_true=y_test, y_pred=y_predict),
    sep='\n',
)


23.418210308086437
0.8941025820707256


### Cross Validation

In [12]:
# Average of the five per-fold scores. No `scoring` is passed, so the
# estimator's default scorer is used (R^2 for LinearRegression).
print(
    cross_val_score(estimator=mlr, X=X, y=y, cv=5)
    .mean()
)

0.890264748413107


## Regression with Categorical Variables

In [13]:
# Same predictors as before plus District, which is categorical and still
# needs to be encoded before fitting.
X = df.loc[:, ['GPA', 'Sleep', 'StudyTime', 'District']]
# Response is unchanged.
y = df.loc[:, 'Grade']

In [14]:
# One-hot encode District, dropping the first level so it acts as the baseline
# and the indicator columns are not collinear with the intercept.
#
# The frame is rebuilt from df instead of from X so this cell can be re-run:
# encoding X in place raises KeyError on a second execution, because the
# 'District' column has already been replaced by its indicators.
#
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.x would otherwise produce booleans).
X = pd.get_dummies(
    df[['GPA', 'Sleep', 'StudyTime', 'District']],
    columns=['District'],
    drop_first=True,
    dtype=int,
)

In [15]:
# Fit on the full dataset using the design matrix that includes the District indicators.
mlr = (
    LinearRegression()
    .fit(X, y)
)

In [16]:
# Inspect the first five rows of the encoded design matrix.
X.head(n=5)

Unnamed: 0,GPA,Sleep,StudyTime,District_2,District_3,District_4,District_5,District_6,District_7,District_8
0,2.9,6.5,47,0,0,1,0,0,0,0
1,3.7,6.2,47,0,0,0,0,1,0,0
2,2.8,6.2,39,0,0,0,1,0,0,0
3,2.0,7.2,49,0,0,0,1,0,0,0
4,2.8,5.5,49,0,0,0,1,0,0,0


In [17]:
# Average of the five per-fold scores for the model with District indicators
# (default scorer, i.e. R^2 for LinearRegression).
print(
    cross_val_score(estimator=mlr, X=X, y=y, cv=5)
    .mean()
)

0.9860280440554534


In [18]:
# Slopes in the column order of X (GPA, Sleep, StudyTime, then the District
# indicators), followed by the intercept.
print(mlr.coef_, mlr.intercept_, sep='\n')

[ 9.43493348  7.24373634  1.0780224   5.0052977   9.41837909 13.76475462
 18.15891903 22.52454188 26.97795838 31.23146742]
-59.68410373541681


## Standardized Regression Coefficients

In [19]:
# Standardize every column except District (predictors and Grade alike) to
# zero mean and unit variance, so the fitted slopes are on a common scale.
scaler = StandardScaler()
df_s = df.drop(columns=['District'])
# fit_transform returns a bare array; wrap it to restore the column labels.
df_s = pd.DataFrame(
    scaler.fit_transform(df_s),
    columns=df_s.columns,
)

In [20]:
# Standardized predictors.
X = df_s.loc[:, ['GPA', 'Sleep', 'StudyTime']]
# Standardized response.
y = df_s.loc[:, 'Grade']

In [21]:
# Fit on the standardized data.
mlr = (
    LinearRegression()
    .fit(X, y)
)

In [22]:
# Standardized slopes (GPA, Sleep, StudyTime) and the intercept; with both
# sides centred the intercept is zero up to floating-point error.
print(mlr.coef_, mlr.intercept_, sep='\n')

[0.48655645 0.50495972 0.69139578]
-3.750854393150304e-16
