# GPA Predictions Using Sklearn

## Import necessary libraries

In [1]:
import matplotlib.pyplot
import numpy as np
import pandas as pd
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression


## Load the dataset

In [3]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
csv_path = "1.02. Multiple linear regression.csv"
dataset = pd.read_csv(csv_path)

# Preview the first four rows to check which columns were loaded.
dataset.head(n=4)

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.4,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3


In [4]:
# Predictors are the SAT column plus the 'Rand 1,2,3' column; the target is GPA.
feature_columns = ['SAT', 'Rand 1,2,3']
target_column = 'GPA'

X = dataset[feature_columns]   # 2-D frame of predictors
y = dataset[target_column]     # 1-D series with the target

## Fit a LinearRegression model to the dataset


In [6]:
# Fit a linear regression of GPA (y) on the two predictor columns in X.
# LinearRegression is imported in the notebook's first cell together with the
# other imports, so that all dependencies are visible in one place.
regressor = LinearRegression()

# fit() returns the fitted estimator, so this cell displays its repr.
regressor.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Getting the coefficients of the regression: one fitted weight per column
# of X, in column order ('SAT' first, then 'Rand 1,2,3').
regressor.coef_

array([ 0.00165354, -0.00826982])

In [9]:
# Getting the intercept (constant term) of the regression; the model was
# created with the default settings, which fit an intercept.
regressor.intercept_

0.29603261264909353

In [11]:
# Getting the R-squared of the regression.
# Note: score() is evaluated on the same X and y the model was fitted on,
# so this is an in-sample value, not a measure of out-of-sample accuracy.
regressor.score(X,y)

0.40668119528142815

### Formula for Adjusted R^2

$R^2_{adj.} = 1 - (1-R^2)\times\frac{n-1}{n-p-1}$

In [18]:
# Get the shape of X as (number of observations, number of features);
# both values are needed to compute the Adjusted R^2 metric.
X.shape

(84, 2)

In [20]:
# Adjusted R-squared is derived from three quantities: the plain R-squared,
# the number of observations (n) and the number of features/predictors (p).

# X is two-dimensional: axis 0 holds the observations, axis 1 the features.
n, p = X.shape
r2 = regressor.score(X, y)

# Formula: R^2_adj = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# The multiplication is done before the division, as in the formula above.
adjusted_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
adjusted_r2

0.39203134825134

## To make predictions

### Predictions will not be accurate, as random numbers were used in the dataset

In [13]:
# Predict GPA for every row of X. These are in-sample predictions: X is the
# same data the regressor was fitted on.
y_pred = regressor.predict(X)
# Display the full array of predicted values.
y_pred

array([3.12193344, 3.0227167 , 3.18145672, 3.05744108, 3.07893924,
       3.0491776 , 3.19634071, 3.20461053, 3.24263988, 3.33027548,
       3.14011817, 3.21452966, 3.15665782, 3.10208671, 3.21122258,
       3.37492322, 3.17318901, 3.04752194, 3.31704714, 3.2244488 ,
       3.18311026, 3.11862213, 3.02106316, 3.07728781, 3.55185431,
       3.30713012, 3.24264199, 3.28232277, 3.62791723, 3.24594696,
       3.21287824, 3.48571263, 3.22279949, 3.33854319, 3.38815155,
       3.33689176, 3.26082672, 3.50224382, 3.20956693, 3.30712801,
       3.3550786 , 3.34681512, 3.52208633, 3.10209094, 3.57831097,
       3.46256094, 3.29555321, 3.52209055, 3.35838992, 3.54358237,
       3.25090547, 3.35508283, 3.43279718, 2.98138027, 3.39476783,
       3.40799617, 3.14012028, 3.51712993, 3.22445092, 3.40634051,
       3.53531889, 3.26082672, 3.40137778, 3.64610196, 3.41791742,
       3.30878155, 3.33027548, 3.47744281, 3.35673426, 3.46421236,
       3.46751945, 3.22776011, 3.55350785, 3.46917299, 3.61303

## Feature Selection

In [15]:
# Univariate F-test of each feature in X against the target y.
# f_regression is imported in the notebook's first cell with the other imports.
# The result is a pair of arrays, (F-statistics, p-values), with one entry
# per feature column.
f_regression(X, y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [16]:
# f_regression returns (F-statistics, p-values); unpack the pair and
# display only the p-values, one per feature column of X.
f_statistics, p_values = f_regression(X, y)
p_values

array([7.19951844e-11, 6.76291372e-01])