In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, svm, tree
from sklearn.ensemble import RandomForestRegressor
from  sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [3]:
# Load the raw dataset from the working directory for inspection.
# Note: prepare_data() below reads the same file again on its own, so the
# modelling cells shown here do not depend on this variable.
data = pd.read_csv("Caschool.csv")

In [4]:
# Display the loaded frame (the rendered output abbreviates the middle rows).
data

Unnamed: 0.1,Unnamed: 0,distcod,county,district,grspan,enrltot,teachers,calwpct,mealpct,computer,testscr,compstu,expnstu,str,avginc,elpct,readscr,mathscr
0,1,75119,Alameda,Sunol Glen Unified,KK-08,195,10.900000,0.510200,2.040800,67,690.799988,0.343590,6384.911133,17.889910,22.690001,0.000000,691.599976,690.000000
1,2,61499,Butte,Manzanita Elementary,KK-08,240,11.150000,15.416700,47.916698,101,661.200012,0.420833,5099.380859,21.524664,9.824000,4.583333,660.500000,661.900024
2,3,61549,Butte,Thermalito Union Elementary,KK-08,1550,82.900002,55.032299,76.322601,169,643.599976,0.109032,5501.954590,18.697226,8.978000,30.000002,636.299988,650.900024
3,4,61457,Butte,Golden Feather Union Elementary,KK-08,243,14.000000,36.475399,77.049202,85,647.700012,0.349794,7101.831055,17.357143,8.978000,0.000000,651.900024,643.500000
4,5,61523,Butte,Palermo Union Elementary,KK-08,1335,71.500000,33.108601,78.427002,171,640.849976,0.128090,5235.987793,18.671329,9.080333,13.857677,641.799988,639.900024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,416,68957,San Mateo,Las Lomitas Elementary,KK-08,984,59.730000,0.101600,3.556900,195,704.300049,0.198171,7290.338867,16.474134,28.716999,5.995935,700.900024,707.700012
416,417,69518,Santa Clara,Los Altos Elementary,KK-08,3724,208.479996,1.074100,1.503800,721,706.750000,0.193609,5741.462891,17.862625,41.734108,4.726101,704.000000,709.500000
417,418,72611,Ventura,Somis Union Elementary,KK-08,441,20.150000,3.563500,37.193802,45,645.000000,0.102041,4402.831543,21.885857,23.733000,24.263039,648.299988,641.700012
418,419,72744,Yuba,Plumas Elementary,KK-08,101,5.000000,11.881200,59.405899,14,672.200012,0.138614,4776.336426,20.200001,9.952000,2.970297,667.900024,676.500000


In [9]:
def prepare_data(path="Caschool.csv", test_size=0.33, random_state=42):
    """Load the school dataset and split it into train and test parts.

    ``testscr`` is the regression target. Identifier/text columns are removed,
    and so are ``readscr`` and ``mathscr``: in the rows displayed in this
    notebook ``testscr`` equals the mean of those two, so keeping them as
    features would leak the target.

    Parameters
    ----------
    path : str or path-like, default "Caschool.csv"
        CSV file to read.
    test_size : float or int, default 0.33
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; fixed by default so that the split
        is the same on every run.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    target = "testscr"
    non_features = ["Unnamed: 0", "distcod", "county", "district", "grspan",
                    "readscr", "mathscr"]

    data = pd.read_csv(path)

    # Everything that is neither the target nor an excluded column is a feature.
    x = data.drop(columns=non_features + [target])
    y = data[target]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test

In [10]:
def scale_data(x_train, x_test, scaler=StandardScaler):
    """Scale train and test features with a scaler fitted on the training data.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames; their index and column labels are carried over to the
        scaled frames.
    scaler : callable, default StandardScaler
        Scaler class (or any zero-argument factory). It is instantiated here,
        so pass the class itself rather than an instance.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_scaled, x_test_scaled)``.
    """
    fitted_scaler = scaler()
    # Fit on the training split only so no test statistics enter the scaling.
    fitted_scaler.fit(x_train)

    x_train_scaled = pd.DataFrame(
        fitted_scaler.transform(x_train),
        index=x_train.index,
        columns=x_train.columns,
    )
    x_test_scaled = pd.DataFrame(
        fitted_scaler.transform(x_test),
        index=x_test.index,
        columns=x_test.columns,
    )

    # Impute any remaining NaNs with the TRAINING column means for both
    # splits. Filling the test frame with its own means would use test-set
    # statistics during preprocessing and be inconsistent with the fit above.
    train_means = x_train_scaled.mean()
    x_train_scaled = x_train_scaled.fillna(train_means)
    x_test_scaled = x_test_scaled.fillna(train_means)

    return x_train_scaled, x_test_scaled



In [11]:
def _regression_metrics(y_true, y_pred):
    """Return [RMSE, MSE, MAE, max error, R^2] for one set of predictions."""
    mse = metrics.mean_squared_error(y_true, y_pred)
    return [
        # RMSE is derived from the MSE instead of passing ``squared=False``,
        # which newer scikit-learn releases no longer accept.
        float(np.sqrt(mse)),
        mse,
        metrics.mean_absolute_error(y_true, y_pred),
        metrics.max_error(y_true, y_pred),
        metrics.r2_score(y_true, y_pred),
    ]


def evaluate_regression(y_true_train, y_pred_train, y_true_test, y_pred_test, model_name=""):
    """Build a one-row DataFrame of train and test regression metrics.

    Intended for single-target regression (RMSE is taken as the square root
    of the overall MSE).

    Parameters
    ----------
    y_true_train, y_pred_train : array-like
        True and predicted targets for the training split.
    y_true_test, y_pred_test : array-like
        True and predicted targets for the test split.
    model_name : hashable, default ""
        Label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``model_name`` with the columns ``RMSE``,
        ``mean_squared_error``, ``mean_absolute_error``, ``max_error`` and
        ``r2_score``, each suffixed with ``_train`` and then ``_test``.
    """
    metric_names = ['RMSE', 'mean_squared_error', 'mean_absolute_error',
                    'max_error', 'r2_score']
    # Train columns first, then test columns, matching the order of the row.
    columns = [f"{name}_{split}"
               for split in ("train", "test")
               for name in metric_names]

    row = (_regression_metrics(y_true_train, y_pred_train)
           + _regression_metrics(y_true_test, y_pred_test))

    return pd.DataFrame.from_records([row], index=[model_name], columns=columns)


In [12]:
# Split the raw data first, then standardise the features; the scaled frames
# are what the modelling cells below consume as x_train / x_test.
x_train_raw, x_test_raw, y_train, y_test = prepare_data()
x_train, x_test = scale_data(x_train_raw, x_test_raw, scaler=StandardScaler)

In [24]:
# Fit each candidate regressor on the scaled training split and collect one
# row of train/test metrics per model.
# NOTE(review): DecisionTreeRegressor is created without a random_state, so
# its scores may differ slightly between runs - confirm whether to pin a seed.
models = [
    linear_model.LinearRegression(),
    linear_model.RidgeCV(),
    linear_model.LassoCV(),
    svm.SVR(kernel="linear"),
    svm.SVR(kernel="rbf"),
    svm.SVR(kernel="poly"),
    tree.DecisionTreeRegressor(),
]

result_frames = []
for model in models:
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Label the row with the estimator's text representation rather than the
    # estimator object itself, so the index holds plain strings.
    result_frames.append(
        evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                            model_name=str(model)))

# DataFrame.append was removed in pandas 2.0; concatenating once at the end
# also avoids re-copying the growing frame on every iteration.
results = pd.concat(result_frames)


In [25]:
# Rank the models from lowest to highest test-set RMSE.
results.sort_values(by="RMSE_test", ascending=True)

Unnamed: 0,RMSE_train,mean_squared_error_train,mean_absolute_error_train,max_error_train,r2_score_train,RMSE_test,mean_squared_error_test,mean_absolute_error_test,max_error_test,r2_score_test
LinearRegression(),7.960069,63.362696,6.27489,27.359264,0.798807,9.265519,85.849847,6.994132,31.901941,0.812076
"RidgeCV(alphas=array([ 0.1, 1. , 10. ]))",7.97181,63.54975,6.296469,26.374902,0.798213,9.349598,87.414988,7.068692,33.241578,0.80865
LassoCV(),8.016672,64.267037,6.315642,26.994668,0.795936,9.383675,88.053364,7.035004,31.494123,0.807253
SVR(kernel='linear'),8.043761,64.702088,6.208559,27.423716,0.794554,9.459345,89.47921,7.165713,30.515253,0.804132
DecisionTreeRegressor(),0.0,0.0,0.0,0.0,1.0,12.67908,160.75906,9.966904,39.899963,0.648101
SVR(kernel='poly'),11.093526,123.066322,8.843384,33.019295,0.609233,14.448829,208.76865,11.97217,47.305124,0.543009
SVR(),11.383172,129.576597,8.498866,42.221546,0.588561,14.569708,212.276394,10.860044,41.090381,0.535331


In [28]:
# Test-set RMSE of the top-ranked (lowest RMSE) model.
best_row = results.sort_values(by="RMSE_test").iloc[0]
best_row["RMSE_test"]

9.265519242459682