In [10]:
import matplotlib.colors as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


# Read in and split data into training and validation sets 
df = pd.read_csv("./data/y1314_df.csv")
df = df.drop(['intervention', 'latitude', 'longitude'], axis = 1)

dta = pd.read_csv("./data/Preds.csv")
dta = pd.merge(dta, df, on = 'school_id')
dta.head()

Unnamed: 0,school_id,prob_fail,prob_pass,mean
0,300897,35.8694,64.1306,29.388333
1,107371,70.903023,29.096975,33.152
2,305290,70.519974,29.480024,26.05
3,105374,51.682751,48.317249,37.872
4,106480,36.727261,63.272739,30.442


In [11]:
# Train Test split the data
msk = np.random.rand(len(dta)) < 0.8

train = dta[msk]
test = dta[~msk]

y_train = train.pop("mean")
x_train = train

y_test = test.pop("mean")
x_test = test

In [12]:
svr_regression = SVR()
svr_regressionFit = svr_regression.fit(x_train, y_train)

DT_regression = tree.DecisionTreeRegressor(random_state = 1693, max_depth = 3)
DT_regressionFit = DT_regression.fit(x_train, y_train)

RF_regression = RandomForestRegressor(n_estimators = 100, random_state = 1693)
RF_regressionFit = RF_regression.fit(x_train, y_train)

neigh = KNeighborsRegressor(n_neighbors=2)
neighFit = neigh.fit(x_train, y_train)

mlp = MLPRegressor()
mlpFit = mlp.fit(x_train, y_train)

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regrFit = regr.fit(x_train, y_train)

clfRidge = Ridge(alpha=1.0)
clfRidgeFit = clfRidge.fit(x_train, y_train)

clfBayesian = linear_model.BayesianRidge()
clfBayesianFit = clfBayesian.fit(x_train, y_train)

reg = linear_model.LassoLars(alpha=0.01)
regFit = reg.fit(x_train, y_train)

bag = BaggingRegressor()
bagFit = bag.fit(x_train, y_train)



In [13]:
DT_MAD = mean_absolute_error(y_test, DT_regressionFit.predict(x_test))
SVR_MAD = mean_absolute_error(y_test, svr_regressionFit.predict(x_test))
KNN_MAD = mean_absolute_error(y_test, neighFit.predict(x_test))
MLP_MAD = mean_absolute_error(y_test, mlpFit.predict(x_test))
regr_MAD = mean_absolute_error(y_test, regrFit.predict(x_test))
clfRidge_MAD = mean_absolute_error(y_test, clfRidgeFit.predict(x_test))
clfBayesion_MAD = mean_absolute_error(y_test, clfBayesianFit.predict(x_test))
reg_MAD = mean_absolute_error(y_test, regFit.predict(x_test))
bag_MAD = mean_absolute_error(y_test, bagFit.predict(x_test))


print('Regression Tree MAD: ' + str(DT_MAD))
print('Support Vector Regression MAD ' + str(SVR_MAD))
print('KNN MAD ' + str(KNN_MAD))
print('MLP MAD ' + str(MLP_MAD))
print('AdaBoost MAD ' + str(regr_MAD))
print('CLF Ridge MAD ' + str(clfRidge_MAD))
print('CLF Bayesion MAD ' + str(clfBayesion_MAD))
print('Reg MAD ' + str(reg_MAD))
print('Bag MAD ' + str(bag_MAD))

Regression Tree MAD: 3.706792096501902
Support Vector Regression MAD 4.911191358024689
KNN MAD 2.859814814814819
MLP MAD 2679.6807402710783
AdaBoost MAD 2.961223210258522
CLF Ridge MAD 3.9938774139707016
CLF Bayesion MAD 4.046353091786808
Reg MAD 3.9993825356186283
Bag MAD 3.656138888888893


In [16]:
to_pred = dta.drop(['mean'], axis = 1)
preds = RF_regressionFit.predict(to_pred)

df = pd.read_csv("./data/y1314_df.csv")
df = df.drop(['intervention', 'latitude', 'longitude'], axis = 1)

dta = pd.read_csv("./data/Preds.csv")
dta = pd.merge(dta, df, on = 'school_id')
dta.shape
dta.head()

final_df = pd.DataFrame()
final_df['school_id'] = dta['school_id']
final_df['actual_mean'] = dta['mean']
final_df['predicted_mean'] = preds.tolist()
final_df['error'] = abs(final_df['actual_mean'] - final_df['predicted_mean'])

final_df.head()

Unnamed: 0,school_id,actual_mean,predicted_mean,error
0,300897,29.388333,29.806803,0.41847
1,107371,33.152,27.18674,5.96526
2,305290,26.05,24.91005,1.13995
3,105374,37.872,32.35034,5.52166
4,106480,30.442,31.77224,1.33024


In [28]:
print("Mean Average Error: " + str(final_df['error'].mean()))
print("Mean Average Error Standard Deviation: " + str(final_df['error'].std()))
print("Overall True Mean: " + str(final_df['actual_mean'].mean()))
print("Predicted Overall Mean: " + str(final_df['predicted_mean'].mean()))
print("Overall Mean Standard Deviation: " + str(final_df['actual_mean'].std()))
print("Predicted Mean Standard Deviation: " + str(final_df['predicted_mean'].std()))

Mean Average Error: 1.993688099999998
Mean Average Error Standard Deviation: 1.6156571906158192
Overall True Mean: 28.136056666666672
Predicted Overall Mean: 27.86777630000001
Overall Mean Standard Deviation: 5.811106040528331
Predicted Mean Standard Deviation: 4.399466930960937


In [None]:
final_df.to_csv("./data/PredictedScores.csv")