## Import Libraries

In [117]:
# import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import statsmodels.api as sm
import statsmodels.formula.api as smf

## Load Data

In [118]:
# we are loading a csv file and creating a dataframe
df = pd.read_csv("data/StudentsPerformance.csv")

In [119]:
# examining the first five records
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [120]:
# shorten the names
df.columns=['gender', 'race', 'parentaled', 'lunch', 'testprep', 'math', 'reading', 'writing']

In [121]:
#create new columns
df = df.assign(totalscore = lambda x: x['math'] + x['reading'] + x['writing'])
df = df.assign(avgscore = lambda x: (x['totalscore'])/3)

In [122]:
#examine the dataframe
df.head()

Unnamed: 0,gender,race,parentaled,lunch,testprep,math,reading,writing,totalscore,avgscore
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


## Trial and Error -- Trying to find the regression model with the best r-squared

### Attempt 1

In [123]:
# define formula that will be used for the regression
formula = "reading ~ parentaled"

# fit the model using ordinary least squares and store results in object called 'slr'
slr = smf.ols(formula,df).fit()

In [124]:
slr.summary()

0,1,2,3
Dep. Variable:,reading,R-squared:,0.045
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,9.289
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,1.17e-08
Time:,12:26:03,Log-Likelihood:,-4076.6
No. Observations:,1000,AIC:,8165.0
Df Residuals:,994,BIC:,8195.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,70.9279,0.960,73.869,0.000,69.044,72.812
parentaled[T.bachelor's degree],2.0721,1.630,1.271,0.204,-1.126,5.270
parentaled[T.high school],-6.2238,1.402,-4.439,0.000,-8.975,-3.472
parentaled[T.master's degree],4.4450,2.095,2.121,0.034,0.333,8.557
parentaled[T.some college],-1.4678,1.352,-1.086,0.278,-4.121,1.185
parentaled[T.some high school],-3.9894,1.437,-2.776,0.006,-6.810,-1.169

0,1,2,3
Omnibus:,12.012,Durbin-Watson:,1.993
Prob(Omnibus):,0.002,Jarque-Bera (JB):,12.173
Skew:,-0.256,Prob(JB):,0.00227
Kurtosis:,2.825,Cond. No.,6.33


### Attempt 2

In [125]:
# define formula that will be used for the regression
formula = "avgscore ~ race + gender + lunch + testprep + parentaled"

# fit the model using ordinary least squares and store results in object called 'slr'
slr = smf.ols(formula,df).fit()

In [126]:
slr.summary()

0,1,2,3
Dep. Variable:,avgscore,R-squared:,0.242
Model:,OLS,Adj. R-squared:,0.233
Method:,Least Squares,F-statistic:,26.3
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,7.55e-52
Time:,12:26:03,Log-Likelihood:,-3937.0
No. Observations:,1000,AIC:,7900.0
Df Residuals:,987,BIC:,7964.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,66.9408,1.775,37.714,0.000,63.458,70.424
race[T.group B],1.5290,1.612,0.949,0.343,-1.634,4.691
race[T.group C],2.3855,1.509,1.581,0.114,-0.576,5.347
race[T.group D],5.1258,1.540,3.329,0.001,2.104,8.147
race[T.group E],6.9285,1.708,4.056,0.000,3.577,10.280
gender[T.male],-3.7242,0.795,-4.682,0.000,-5.285,-2.163
lunch[T.standard],8.7751,0.827,10.605,0.000,7.151,10.399
testprep[T.none],-7.6386,0.830,-9.201,0.000,-9.268,-6.009
parentaled[T.bachelor's degree],2.5356,1.424,1.781,0.075,-0.259,5.330

0,1,2,3
Omnibus:,12.572,Durbin-Watson:,2.043
Prob(Omnibus):,0.002,Jarque-Bera (JB):,12.915
Skew:,-0.274,Prob(JB):,0.00157
Kurtosis:,2.897,Cond. No.,12.9


### Attempt 3

In [127]:
# define formula that will be used for the regression
formula = "avgscore ~ gender + lunch + testprep"

# fit the model using ordinary least squares and store results in object called 'slr'
slr = smf.ols(formula,df).fit()

In [128]:
slr.summary()

0,1,2,3
Dep. Variable:,avgscore,R-squared:,0.172
Model:,OLS,Adj. R-squared:,0.169
Method:,Least Squares,F-statistic:,68.91
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,1.7099999999999998e-40
Time:,12:26:03,Log-Likelihood:,-3981.4
No. Observations:,1000,AIC:,7971.0
Df Residuals:,996,BIC:,7990.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,68.9756,0.959,71.960,0.000,67.095,70.857
gender[T.male],-3.9585,0.823,-4.813,0.000,-5.573,-2.344
lunch[T.standard],8.8598,0.859,10.314,0.000,7.174,10.546
testprep[T.none],-7.8061,0.857,-9.106,0.000,-9.488,-6.124

0,1,2,3
Omnibus:,10.169,Durbin-Watson:,2.095
Prob(Omnibus):,0.006,Jarque-Bera (JB):,10.271
Skew:,-0.248,Prob(JB):,0.00588
Kurtosis:,3.022,Cond. No.,4.48


### Attempt 4 -- This ended up being the highest r squared

In [129]:
# define formula that will be used for the regression
formula = "writing ~ gender + lunch + testprep + race + parentaled"

# fit the model using ordinary least squares and store results in object called 'slr'
slr = smf.ols(formula,df).fit()

In [130]:
slr.summary()

0,1,2,3
Dep. Variable:,writing,R-squared:,0.334
Model:,OLS,Adj. R-squared:,0.326
Method:,Least Squares,F-statistic:,41.25
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,8.17e-79
Time:,12:26:03,Log-Likelihood:,-3936.2
No. Observations:,1000,AIC:,7898.0
Df Residuals:,987,BIC:,7962.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,71.9142,1.774,40.548,0.000,68.434,75.395
gender[T.male],-9.0965,0.795,-11.444,0.000,-10.656,-7.537
lunch[T.standard],8.2028,0.827,9.921,0.000,6.580,9.825
testprep[T.none],-10.0587,0.830,-12.125,0.000,-11.687,-8.431
race[T.group B],1.2201,1.610,0.758,0.449,-1.940,4.380
race[T.group C],2.4126,1.508,1.600,0.110,-0.547,5.372
race[T.group D],5.9307,1.539,3.855,0.000,2.911,8.950
race[T.group E],5.1373,1.707,3.010,0.003,1.788,8.487
parentaled[T.bachelor's degree],3.4849,1.423,2.449,0.014,0.693,6.277

0,1,2,3
Omnibus:,16.647,Durbin-Watson:,2.038
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.222
Skew:,-0.321,Prob(JB):,0.000182
Kurtosis:,2.982,Cond. No.,12.9


In [131]:
# generate model predictions
y_predict = slr.predict()

### Attempt 5

In [132]:
#We tried to use the Polynomial Features package from SciKitLearn 
#to see if reshaping our linear model as a polynomial equation (x^2, etc.) 
#would increase R-Squared. It did not. 

#preparatory to attempt 5, we encoded the parental education and test prep values as ordinal numbers
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
df[["race","parental level of education","lunch","test preparation course"]] = ord_enc.fit_transform(df[["race","parentaled","lunch","testprep"]])

In [133]:
df.head()

Unnamed: 0,gender,race,parentaled,lunch,testprep,math,reading,writing,totalscore,avgscore,parental level of education,test preparation course
0,female,1.0,bachelor's degree,1.0,none,72,72,74,218,72.666667,1.0,1.0
1,female,2.0,some college,1.0,completed,69,90,88,247,82.333333,4.0,0.0
2,female,1.0,master's degree,1.0,none,90,95,93,278,92.666667,3.0,1.0
3,male,0.0,associate's degree,0.0,none,47,57,44,148,49.333333,0.0,1.0
4,male,2.0,some college,1.0,none,76,78,75,229,76.333333,4.0,1.0


In [134]:
#running Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
polynomial_features= PolynomialFeatures(degree=2)
x=df[["race","parental level of education","lunch","test preparation course"]].values.reshape(-1,4)
xp = polynomial_features.fit_transform(x)
y=df["avgscore"].values.reshape(-1,1)

In [135]:
xp.shape, x.shape

((1000, 15), (1000, 4))

In [136]:
xp.shape

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

In [137]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.19
Model:,OLS,Adj. R-squared:,0.181
Method:,Least Squares,F-statistic:,19.35
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,3.44e-38
Time:,12:26:03,Log-Likelihood:,-3970.1
No. Observations:,1000,AIC:,7966.0
Df Residuals:,987,BIC:,8030.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.4857,2.841,24.107,0.000,62.911,74.061
x1,0.5008,1.504,0.333,0.739,-2.450,3.452
x2,-1.4170,1.065,-1.330,0.184,-3.507,0.674
x3,3.0264,1.256,2.409,0.016,0.561,5.491
x4,-5.0452,1.221,-4.133,0.000,-7.441,-2.649
x5,0.1494,0.279,0.536,0.592,-0.397,0.696
x6,0.1661,0.194,0.858,0.391,-0.214,0.546
x7,0.1147,0.746,0.154,0.878,-1.349,1.579
x8,0.6032,0.733,0.822,0.411,-0.836,2.042

0,1,2,3
Omnibus:,8.773,Durbin-Watson:,2.031
Prob(Omnibus):,0.012,Jarque-Bera (JB):,8.931
Skew:,-0.23,Prob(JB):,0.0115
Kurtosis:,2.94,Cond. No.,2.9e+17


In [138]:
#Redefine formula and slr based on SLR and move on to the next step.
# define formula that will be used for the regression
formula = "writing ~ gender + lunch + testprep + race + parentaled"

# fit the model using ordinary least squares and store results in object called 'slr'
slr = smf.ols(formula,df).fit()

##Our regressional model cannot be graphed because the x axis has 5 dimensions. 
#The graph would be in 6-D space, and only 2-D and 3-D is easily representable on a computer.

## K Nearest Neighbors