In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone

# Read the two feature tables and the high-schools table in a single pass.
# The order of the paths below decides which file is bound to which name.
X_scaler_df_TOTdrop, X_scaler1, high_schools = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_scaler_df_TOTdrop.csv',
        '../data/X_scaler1.csv',
        '../data/high_schools.csv',
    )
)

In [None]:
# Regression target: the 'ACT' column of the high-schools table.
y = high_schools['ACT']

# Every row and column of both feature tables is used exactly as loaded, so no
# re-slicing is needed here. What does matter is that each feature table has one
# row per target value; fail early with a readable message if that is not the case.
# NOTE(review): this only checks row counts - row *order* is assumed to match; confirm.
assert len(X_scaler_df_TOTdrop) == len(y), "X_scaler_df_TOTdrop and y have different row counts"
assert len(X_scaler1) == len(y), "X_scaler1 and y have different row counts"

In [None]:
from sklearn.model_selection import train_test_split

# Feature set 1 (X_scaler_df_TOTdrop): hold out 20% as the test split, then carve
# 25% of the remaining rows off as a validation split (roughly 60/20/20 overall).
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X_scaler_df_TOTdrop,
    y,
    test_size=0.2,
    random_state=1,
)
X1_train, X1_val, y1_train, y1_val = train_test_split(
    X1_train,
    y1_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Feature set 2 (X_scaler1): same two-stage split as for feature set 1 - 20% test,
# then 25% of the remainder as validation - with the same seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X_scaler1,
    y,
    test_size=0.2,
    random_state=1,
)
X2_train, X2_val, y2_train, y2_val = train_test_split(
    X2_train,
    y2_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Base estimator handed to the grid search. The grid explores splitter="random"
# and max_features subsets, both of which draw random numbers while growing the
# tree, so the seed is pinned to make the search repeatable after a kernel restart.
reg_tree = DecisionTreeRegressor(random_state=1)

# Hyperparameter grid for the decision-tree regressor search.
spaceTree = {
    "splitter": ["best", "random"],
    "max_depth": [1, 3, 5, 7, 9, 11, 12],
    "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    # scikit-learn only accepts fractions in [0.0, 0.5]; larger values make every
    # fit for that candidate fail, so they only waste search time.
    "min_weight_fraction_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
    # "auto" is left out: for regressor trees it meant "use all features" (the same
    # as None) and the option was removed in scikit-learn 1.3.
    "max_features": ["log2", "sqrt", None],
    "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over spaceTree: candidates are ranked by negated mean absolute
# error using 5-fold cross-validation, run on all available cores.
treeSearch = GridSearchCV(
    estimator=reg_tree,
    param_grid=spaceTree,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# `fit` returns the search object itself. Fitting `treeSearch` twice and keeping
# both return values would therefore give two names for one object that only
# remembers the second fit. The first search runs on an unfitted clone with
# identical settings so that each result keeps its own best score and parameters.
# NOTE(review): both searches are fitted on the validation splits, not the training
# splits - confirm that this is intended.
treeResult_TOT = clone(treeSearch).fit(X1_val, y1_val)
treeResult = treeSearch.fit(X2_val, y2_val)

# best_score_ is the negated MAE (scoring='neg_mean_absolute_error'): closer to 0 is better.
print('Best Score TOT: %s' % treeResult_TOT.best_score_)
print('Best Hyperparameters TOT: %s' % treeResult_TOT.best_params_)

print('Best Score: %s' % treeResult.best_score_)
print('Best Hyperparameters: %s' % treeResult.best_params_)

In [None]:
import matplotlib.pyplot as plt

# Hard-coded hyperparameters - presumably copied from an earlier grid-search run;
# TODO confirm they still match the current best_params_.
# max_features='sqrt' samples candidate features at random for each split, so the
# seed is pinned to keep the fitted tree (and every metric derived from it) repeatable.
tuned_tree = DecisionTreeRegressor(
    max_depth=5,
    max_features='sqrt',
    max_leaf_nodes=90,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.1,
    splitter='best',
    random_state=1,
)

# Fit on the training split of feature set 1 and predict its held-out test split.
tuned_tree.fit(X1_train, y1_train)
tuned_tree_pred1 = tuned_tree.predict(X1_test)

# Actual vs. predicted target on the test split; labelled so the figure stands alone.
fig1, ax1 = plt.subplots()
ax1.scatter(y1_test, tuned_tree_pred1)
ax1.set(
    xlabel='Actual ACT',
    ylabel='Predicted ACT',
    title='Tuned decision tree - X_scaler_df_TOTdrop features',
);

In [None]:
# Refit the same estimator object on feature set 2; from here on `tuned_tree`
# holds the model trained on X2_train. Predictions are stored separately.
tuned_tree.fit(X2_train, y2_train)
tuned_tree_pred2 = tuned_tree.predict(X2_test)

# Actual vs. predicted target on the test split, labelled so it can be told apart
# from the feature-set-1 plot.
fig2, ax2 = plt.subplots()
ax2.scatter(y2_test, tuned_tree_pred2)
ax2.set(
    xlabel='Actual ACT',
    ylabel='Predicted ACT',
    title='Tuned decision tree - X_scaler1 features',
);

In [None]:
from sklearn import metrics

# Test-set metrics for the tuned tree on feature set 1.
# The residual sum of squares is the *sum* of squared errors; taking the mean
# would just repeat the MSE that is reported below.
print("Residual sum of squares: %.2f" % np.sum((tuned_tree_pred1 - y1_test) ** 2))
print('R2:', metrics.r2_score(y1_test, tuned_tree_pred1))
print('MAE:', metrics.mean_absolute_error(y1_test, tuned_tree_pred1))
tree_mse1 = metrics.mean_squared_error(y1_test, tuned_tree_pred1)
print('MSE:', tree_mse1)
print('RMSE:', np.sqrt(tree_mse1))

In [None]:
# Test-set metrics for the tuned tree on feature set 2.
# The residual sum of squares is the *sum* of squared errors; taking the mean
# would just repeat the MSE that is reported below.
print("Residual sum of squares: %.2f" % np.sum((tuned_tree_pred2 - y2_test) ** 2))
print('R2:', metrics.r2_score(y2_test, tuned_tree_pred2))
print('MAE:', metrics.mean_absolute_error(y2_test, tuned_tree_pred2))
tree_mse2 = metrics.mean_squared_error(y2_test, tuned_tree_pred2)
print('MSE:', tree_mse2)
print('RMSE:', np.sqrt(tree_mse2))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned tree on each training split.
# The error metric is scored with 'neg_mean_absolute_error', so after flipping the
# sign it is a mean absolute error - the labels say MAE accordingly (not MSE).
for _cv_tag, _cv_X, _cv_y in (
    ("tuned_Tree1", X1_train, y1_train),
    ("tuned_Tree2", X2_train, y2_train),
):
    _cv_mae = -1 * cross_val_score(
        tuned_tree, _cv_X, _cv_y, cv=5, scoring='neg_mean_absolute_error'
    ).mean()
    _cv_r2 = cross_val_score(tuned_tree, _cv_X, _cv_y, cv=5, scoring='r2').mean()
    print("Cross validation mean MAE score using %s is %s" % (_cv_tag, _cv_mae))
    print("Cross validation mean R2 score using %s is %s" % (_cv_tag, _cv_r2))
    if _cv_tag == "tuned_Tree1":
        print()  # blank line between the two feature sets

In [None]:
from sklearn.svm import SVR
from scipy.stats import uniform as sp_randFloat

# Base support-vector regressor and the space sampled by the randomized search.
SVR_model = SVR()
spaceSVR = dict(
    kernel=['linear', 'rbf'],
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. C is drawn from [0, 10].
    C=sp_randFloat(0, 10),
    gamma=["auto", "scale"],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: 500 draws from spaceSVR, ranked by negated mean absolute
# error with 3-fold cross-validation, seeded for repeatable sampling.
SVRSearch = RandomizedSearchCV(
    estimator=SVR_model,
    param_distributions=spaceSVR,
    n_iter=500,
    scoring='neg_mean_absolute_error',
    cv=3,
    random_state=0,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# `fit` returns the search object itself. Fitting `SVRSearch` twice and keeping
# both return values would therefore give two names for one object that only
# remembers the second fit. The first search runs on an unfitted clone with
# identical settings (including random_state) so each result keeps its own best
# score and parameters.
# NOTE(review): both searches are fitted on the validation splits, not the training
# splits - confirm that this is intended.
SVR_result_tot = clone(SVRSearch).fit(X1_val, y1_val)
SVR_result = SVRSearch.fit(X2_val, y2_val)

# best_score_ is the negated MAE (scoring='neg_mean_absolute_error'): closer to 0 is better.
print('Best Score TOT: %s' % SVR_result_tot.best_score_)
print('Best Hyperparameters TOT: %s' % SVR_result_tot.best_params_)

print('Best Score: %s' % SVR_result.best_score_)
print('Best Hyperparameters: %s' % SVR_result.best_params_)

In [None]:
# Final SVR configuration with hard-coded hyperparameters.
# NOTE(review): presumably taken from an earlier randomized-search run - confirm
# against the current best_params_.
tuned_SVR = SVR(
    C=5.7714,
    kernel='rbf',
    gamma='auto',
)

In [None]:
# Feature set 1: fit on the training split, evaluate on the held-out test split.
tuned_SVR.fit(X1_train, y1_train)
SVR_pred1 = tuned_SVR.predict(X1_test)

# Residual sum of squares is the *sum* of squared errors (a mean would be the MSE).
print("Residual sum of squares: %.2f" % np.sum((SVR_pred1 - y1_test) ** 2))
# For scikit-learn regressors `score` returns the R^2 coefficient of determination.
print('Variance score: %.2f' % tuned_SVR.score(X1_test, y1_test))

# Feature set 2: the same estimator object is refitted, so from here on
# `tuned_SVR` holds the model trained on X2_train.
tuned_SVR.fit(X2_train, y2_train)
SVR_pred2 = tuned_SVR.predict(X2_test)

print("Residual sum of squares: %.2f" % np.sum((SVR_pred2 - y2_test) ** 2))
print('Variance score: %.2f' % tuned_SVR.score(X2_test, y2_test))

In [None]:
# 5-fold cross-validation of the tuned SVR on each training split.
# The error metric is scored with 'neg_mean_absolute_error', so after flipping the
# sign it is a mean absolute error - the labels say MAE accordingly (not MSE).
for _cv_tag, _cv_X, _cv_y in (
    ("SVR1", X1_train, y1_train),
    ("SVR2", X2_train, y2_train),
):
    _cv_mae = -1 * cross_val_score(
        tuned_SVR, _cv_X, _cv_y, cv=5, scoring='neg_mean_absolute_error'
    ).mean()
    _cv_r2 = cross_val_score(tuned_SVR, _cv_X, _cv_y, cv=5, scoring='r2').mean()
    print("Cross validation mean MAE score using %s is %s" % (_cv_tag, _cv_mae))
    print("Cross validation mean R2 score using %s is %s" % (_cv_tag, _cv_r2))
    if _cv_tag == "SVR1":
        print()  # blank line between the two feature sets