In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import math
import requests
import json

url ='http://127.0.0.1:8000'

In [3]:
df = pd.read_csv("../data/test_scores.csv")
df.drop(['school', 'classroom', 'student_id', 'gender'], axis=1, inplace=True)
df

Unnamed: 0,school_setting,school_type,teaching_method,n_student,lunch,pretest,posttest
0,Urban,Non-public,Standard,20.0,Does not qualify,62.0,72.0
1,Urban,Non-public,Standard,20.0,Does not qualify,66.0,79.0
2,Urban,Non-public,Standard,20.0,Does not qualify,64.0,76.0
3,Urban,Non-public,Standard,20.0,Does not qualify,61.0,77.0
4,Urban,Non-public,Standard,20.0,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...
2128,Urban,Public,Standard,30.0,Does not qualify,39.0,55.0
2129,Urban,Public,Standard,30.0,Qualifies for reduced/free lunch,38.0,46.0
2130,Urban,Public,Standard,30.0,Qualifies for reduced/free lunch,45.0,51.0
2131,Urban,Public,Standard,30.0,Qualifies for reduced/free lunch,46.0,53.0


In [4]:
y = df['posttest'].values
y

array([72., 79., 76., ..., 51., 53., 48.])

In [6]:
x = df[['school_setting', 'school_type', 'teaching_method', 'n_student',
       'lunch']].values
x

array([['Urban', 'Non-public', 'Standard', 20.0, 'Does not qualify'],
       ['Urban', 'Non-public', 'Standard', 20.0, 'Does not qualify'],
       ['Urban', 'Non-public', 'Standard', 20.0, 'Does not qualify'],
       ...,
       ['Urban', 'Public', 'Standard', 30.0,
        'Qualifies for reduced/free lunch'],
       ['Urban', 'Public', 'Standard', 30.0,
        'Qualifies for reduced/free lunch'],
       ['Urban', 'Public', 'Standard', 30.0,
        'Qualifies for reduced/free lunch']], dtype=object)

In [7]:
enconding = OneHotEncoder()

In [8]:
enconding.fit(x)
enconding.categories_

[array(['Rural', 'Suburban', 'Urban'], dtype=object),
 array(['Non-public', 'Public'], dtype=object),
 array(['Experimental', 'Standard'], dtype=object),
 array([14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0,
        25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0], dtype=object),
 array(['Does not qualify', 'Qualifies for reduced/free lunch'],
       dtype=object)]

In [9]:
x = enconding.transform(x).toarray()
x

array([[0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.]])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [11]:
svmR = svm.SVR()
svmR.fit(X_train, y_train)

score = svmR.score(X_test, y_test)
rmse = math.sqrt(metrics.mean_squared_error(y_test, svmR.predict(X_test)))
mae = metrics.mean_absolute_error(y_test, svmR.predict(X_test))
rmsle = np.log(np.sqrt(metrics.mean_squared_error(y_test, svmR.predict(X_test))))

print('Support Vector Machine')
print('Score: ', score)
print('Root mean squared error: ', rmse)
print('Mean absolute error: ', mae)
print("Root mean squared log error", rmsle)

data = { "svm": {
    "score": score,
    "rmse": rmse,
    "mae": mae,
    "rmsle": rmsle
    }
}

Support Vector Machine
Score:  0.7826118625563451
Root mean squared error:  6.591481267328312
Mean absolute error:  5.032668405653811
Root mean squared log error 1.8857780982681813


In [12]:
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train, y_train)

score = knn.score(X_test, y_test)
rmse = math.sqrt(metrics.mean_squared_error(y_test, knn.predict(X_test)))
mae = metrics.mean_absolute_error(y_test, knn.predict(X_test))
rmsle = np.log(np.sqrt(metrics.mean_squared_error(y_test, knn.predict(X_test))))

print('K-Nearest Neighbors Regressor')
print('Score: ', score)
print('Root mean squared error: ', rmse)
print('Mean absolute error: ', mae)
print("Root mean squared log error", rmsle)

data["knn"] = {
    "score": score,
    "rmse": rmse,
    "mae": mae,
    "rmsle": rmsle
}

K-Nearest Neighbors Regressor
Score:  0.8429196590465041
Root mean squared error:  5.60306962338908
Mean absolute error:  4.149857954545454
Root mean squared log error 1.7233145945971597


In [13]:
gpr = GaussianProcessRegressor()
gpr.fit(X_train, y_train)

score = gpr.score(X_test, y_test)
rmse = math.sqrt(metrics.mean_squared_error(y_test, gpr.predict(X_test)))
mae = metrics.mean_absolute_error(y_test, gpr.predict(X_test))
rmsle = np.log(np.sqrt(metrics.mean_squared_error(y_test, gpr.predict(X_test))))

print('Gaussian Process Regressor')
print('Score: ', score)
print('Root mean squared error: ', rmse)
print('Mean absolute error: ', mae)
print("Root mean squared log error", rmsle)

data["gpr"] = {
    "score": score,
    "rmse": rmse,
    "mae": mae,
    "rmsle": rmsle
}

Gaussian Process Regressor
Score:  0.8627561407325227
Root mean squared error:  5.2373495722204355
Mean absolute error:  3.8992105939171533
Root mean squared log error 1.6558155635413643


In [14]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

score = rfr.score(X_test, y_test)
rmse = math.sqrt(metrics.mean_squared_error(y_test, rfr.predict(X_test)))
mae = metrics.mean_absolute_error(y_test, rfr.predict(X_test))
rmsle = np.log(np.sqrt(metrics.mean_squared_error(y_test, rfr.predict(X_test))))

print('Random Forest Regressor')
print('Score: ', score)
print('Root mean squared error: ', rmse)
print('Mean absolute error: ', mae)
print("Root mean squared log error", rmsle)


data["rfr"] = {
    "score": score,
    "rmse": rmse,
    "mae": mae,
    "rmsle": rmsle
}
data = json.dumps(data)
headers = {
        "accept": "application/json",
        "Content-Type": "application/json" 
}
print(requests.post(url + "/test-scores-of-students/models", data=data, headers=headers).text)

Random Forest Regressor
Score:  0.8585431321303344
Root mean squared error:  5.317128060894607
Mean absolute error:  3.9630040507205395
Root mean squared log error 1.6709333194107565
{"message":"Insert datas on test-scores-of-students -> models works!"}
