In [1]:
# http://roycekimmons.com/system/generate_data.php?dataset=exams&n=100

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
data_exams = pd.read_csv('exams.csv')
data_exams.sample(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
37,male,group B,associate's degree,standard,none,74,62,65
20,female,group C,some college,free/reduced,completed,74,87,94
73,female,group C,high school,free/reduced,completed,71,85,88
93,male,group A,bachelor's degree,standard,none,56,62,57
11,male,group E,some college,free/reduced,none,69,62,62


In [3]:
data_exams.describe()

Unnamed: 0,math score,reading score,writing score
count,100.0,100.0,100.0
mean,66.7,70.15,69.44
std,13.170598,14.081563,15.042244
min,36.0,36.0,32.0
25%,57.0,61.0,59.75
50%,68.0,71.0,69.0
75%,74.0,81.0,80.25
max,97.0,100.0,100.0


In [4]:
# preprocessing data 
# standardize values'math score','reading score','writing score ' ( 0 mean and unit variance ) 
# allows to compare values across diffrent distribution

from sklearn import preprocessing
data_exams[['math score']] = preprocessing.scale(data_exams[['math score']].astype('float64'))
data_exams[['reading score']] = preprocessing.scale(data_exams[['reading score']].astype('float64'))
data_exams[['writing score']] = preprocessing.scale(data_exams[['writing score']].astype('float64'))

In [5]:
data_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group C,some college,standard,completed,0.32813,0.988511,1.17326
1,female,group C,some high school,free/reduced,completed,0.022893,-0.082078,0.171045
2,female,group C,some high school,standard,completed,0.480748,1.131256,1.17326
3,male,group E,master's degree,standard,none,1.930623,0.917138,0.70556
4,male,group C,associate's degree,free/reduced,none,-1.045436,-1.509531,-1.566129


In [6]:
data_exams.describe()

Unnamed: 0,math score,reading score,writing score
count,100.0,100.0,100.0
mean,-1.931788e-16,-4.063416e-16,1.287859e-16
std,1.005038,1.005038,1.005038
min,-2.342693,-2.437374,-2.501529
25%,-0.7401993,-0.6530593,-0.6474311
50%,0.09920196,0.06066672,-0.02939832
75%,0.5570572,0.7743928,0.7222632
max,2.312169,2.130472,2.041847


In [7]:
data_exams['parental level of education'].unique()

array(['some college', 'some high school', "master's degree",
       "associate's degree", 'high school', "bachelor's degree"],
      dtype=object)

In [8]:
# to encode values of  'parental level of education' 
# we know there is an intrinsic ordering in the level of education categorical field
# we put the values in the right order in order to encode them later 

parent_level_of_education = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree"
]

In [9]:
label_encoding  = preprocessing.LabelEncoder()
label_encoding = label_encoding.fit(parent_level_of_education)

In [10]:
# transform column 'parental level of education ' into encoded values
data_exams['parental level of education'] = label_encoding.transform(data_exams['parental level of education'].astype(str))

In [11]:
label_encoding.classes_

array(["associate's degree", "bachelor's degree", 'high school',
       "master's degree", 'some college', 'some high school'],
      dtype='<U18')

In [12]:
# we use one hot encoding for categorical values with no intrinsic order ''race/ethnicity','gender','lunch','test preparation course'
data_exams = pd.get_dummies(data_exams, columns = ['race/ethnicity','gender','lunch','test preparation course'])
data_exams.sample(5)

Unnamed: 0,parental level of education,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,gender_female,gender_male,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
4,0,-1.045436,-1.509531,-1.566129,0,0,1,0,0,0,1,1,0,0,1
13,5,-1.350673,-0.93855,-0.964799,0,1,0,0,0,1,0,1,0,0,1
64,2,0.099202,-0.082078,-0.563913,0,0,1,0,0,0,1,0,1,0,1
52,2,-0.434962,-1.652276,-1.298871,0,0,1,0,0,0,1,0,1,0,1
96,5,-1.808528,-2.437374,-2.501529,0,1,0,0,0,0,1,1,0,1,0


In [14]:
# set up trainig data and test data 
from sklearn.model_selection import train_test_split 
X = data_exams.drop('math score', axis = 1)
Y = data_exams['math score']
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2)

In [16]:
# fit a linear model to our Data 
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression(fit_intercept = True).fit(x_train, y_train) #(fit_intercept = True ) might cause perfect collinearity dummu variable trap 


In [17]:
print("training score :", linear_model.score(x_train, y_train))

training score : 0.8554628687662025


In [19]:
y_pred = linear_model.predict(x_test)

In [21]:
from sklearn.metrics import r2_score
print("test score :", r2_score(y_test, y_pred))

test score : 0.8923277571699865


In [22]:
linear_model = LinearRegression(fit_intercept = False).fit(x_train, y_train) #(fit_intercept = True ) might cause perfect collinearity dummu variable trap 


In [23]:
print("training score :", linear_model.score(x_train, y_train))

training score : 0.8554628687662025


In [24]:
y_pred = linear_model.predict(x_test)
print("test score :", r2_score(y_test, y_pred))

test score : 0.8923277571699857


In [26]:
prd_actl = pd.DataFrame({'predicted':y_pred,'actual': y_test})
prd_actl.sample(5)

Unnamed: 0,predicted,actual
73,0.351236,0.32813
11,0.001442,0.175511
56,0.15778,-0.282344
3,1.527126,1.930623
54,-0.55648,-0.816508


In [31]:
prd_actl.sample(10)

Unnamed: 0,predicted,actual
50,-0.885264,-1.5796
27,-1.003021,-1.5796
58,-0.966538,-0.740199
73,0.351236,0.32813
51,-0.330469,-0.816508
11,0.001442,0.175511
72,-1.394766,-1.198054
57,0.727821,0.25182
66,0.297716,0.25182
68,0.901946,0.404439
