In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model, tree, neighbors, ensemble
from sklearn import svm
from collections import Counter

In [2]:
#open and read data

In [3]:
address = '/Users/armenhakobyan/Boston_housing_project/dataset/boston_new.csv'
col_names = ['CRIM', 'ZN', 'INDUS', 'CHAS','NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = pd.read_csv(address, delim_whitespace=True, header=None, names=col_names)

In [4]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
X = data.drop('MEDV', axis=1)
new_X = X.drop(['CRIM', 'ZN', 'INDUS', 'CHAS','NOX', 'RAD', 'B',  'DIS', 'TAX'], axis=1)
y = data['MEDV']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [11]:
classifiers = [
    svm.SVR(),
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression(),
    tree.DecisionTreeRegressor(),
    tree.ExtraTreeRegressor(),
    neighbors.KNeighborsRegressor(),
    ensemble.RandomForestRegressor()]

In [189]:
#try with all features

In [190]:
models = {}
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train, y_train)
    the_score = clf.score(X_test, y_test)
    models[item] = the_score
    print(the_score,'\n')
print(f'The best model is {max(models, key=models.get)}, and it has {max(models.values())} score!!!')

SVR()
0.6159965828718288 

SGDRegressor()
0.6401877652777109 

BayesianRidge()
0.6716708849740622 

LassoLars()
-0.023340500652033302 

ARDRegression()
0.6631474191046512 

PassiveAggressiveRegressor()
0.634883024585069 

TheilSenRegressor(max_subpopulation=10000)
0.6212301368661326 

LinearRegression()
0.668759493535632 

DecisionTreeRegressor()
0.8334234816741452 

ExtraTreeRegressor()
0.5496417999323376 

KNeighborsRegressor()
0.7047836290403378 

RandomForestRegressor()
0.8834185272770425 

The best model is RandomForestRegressor(), and it has 0.8834185272770425 score!!!


In [191]:
#try with 4 features

In [192]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(new_X, y, test_size=0.2, random_state = 42)

In [193]:
models = {}
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train_new, y_train_new)
    the_score = clf.score(X_test_new, y_test_new)
    models[item] = the_score
    print(the_score,'\n')
print(f'The best model is {max(models, key=models.get)}, and it has {max(models.values())} score!!!')

SVR()
0.64964255832141 

SGDRegressor()
0.6597678978232618 

BayesianRidge()
0.6669074104032202 

LassoLars()
-0.023340500652033302 

ARDRegression()
0.6631430331242688 

PassiveAggressiveRegressor()
-0.31450946885657083 

TheilSenRegressor(max_subpopulation=10000)
0.5188152800032431 

LinearRegression()
0.6687594935356318 

DecisionTreeRegressor()
0.6900807780512199 

ExtraTreeRegressor()
0.6805942052258567 

KNeighborsRegressor()
0.7183109257429846 

RandomForestRegressor()
0.8817288356362709 

The best model is RandomForestRegressor(), and it has 0.8817288356362709 score!!!


In [194]:
#scale 4 features with minmax

In [7]:
from sklearn import preprocessing


In [196]:
mm_scaler = preprocessing.MinMaxScaler()

new_X = mm_scaler.fit_transform(new_X)
new_X_scaled = pd.DataFrame(new_X)
new_X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,1.0,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.0,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.0,0.099338


In [197]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(new_X_scaled, y, test_size=0.2, random_state = 42)

In [198]:
models = {}
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train_scaled, y_train_scaled)
    the_score = clf.score(X_test_scaled, y_test_scaled)
    models[item] = the_score
    print(the_score,'\n')
print(f'The best model is {max(models, key=models.get)}, and it has {max(models.values())} score!!!')



SVR()
0.6159965828718288 

SGDRegressor()
0.6405529068787609 

BayesianRidge()
0.6716708849740621 

LassoLars()
-0.023340500652033302 

ARDRegression()
0.663147419104652 

PassiveAggressiveRegressor()
0.43723664283785935 

TheilSenRegressor(max_subpopulation=10000)
0.6299084179396455 

LinearRegression()
0.668759493535632 

DecisionTreeRegressor()
0.8644794409093748 

ExtraTreeRegressor()
0.7910480731558898 

KNeighborsRegressor()
0.7047836290403378 

RandomForestRegressor()
0.882416710026472 

The best model is RandomForestRegressor(), and it has 0.882416710026472 score!!!


In [199]:
#scale 4 features with std

In [13]:
std_scaler = preprocessing.StandardScaler()

new_X = std_scaler.fit_transform(new_X)
new_X_scaled = pd.DataFrame(new_X)
new_X_scaled.head()

Unnamed: 0,0,1,2,3
0,0.413672,-0.120013,-1.459,-1.075562
1,0.194274,0.367166,-0.303094,-0.492439
2,1.282714,-0.265812,-0.303094,-1.208727
3,1.016303,-0.809889,0.113032,-1.361517
4,1.228577,-0.51118,0.113032,-1.026501


In [14]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(new_X_scaled, y, test_size=0.2, random_state = 42)

In [15]:
models = {}
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train_scaled, y_train_scaled)
    the_score = clf.score(X_test_scaled, y_test_scaled)
    models[item] = the_score
    print(the_score,'\n')
print(f'The best model is {max(models, key=models.get)}, and it has {max(models.values())} score!!!')


SVR()
0.7124347544567338 

SGDRegressor()
0.6296564544086191 

BayesianRidge()
0.6305708962512246 

LassoLars()
-0.023340500652033302 

ARDRegression()
0.6305927513540971 

PassiveAggressiveRegressor()
0.5824445638418285 

TheilSenRegressor(max_subpopulation=10000)
0.6155814022153663 

LinearRegression()
0.6298973868568118 

DecisionTreeRegressor()
0.5464640119382722 

ExtraTreeRegressor()
0.5341164940951133 

KNeighborsRegressor()
0.8016670989278298 

RandomForestRegressor()
0.8095693909631572 

The best model is RandomForestRegressor(), and it has 0.8095693909631572 score!!!


In [203]:
#try scaling all features

In [204]:
#minmax

In [205]:
X = mm_scaler.fit_transform(X)
X_scaled = pd.DataFrame(X)
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,1.0,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.0,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.0,0.099338


In [206]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size=0.2, random_state = 42)

In [207]:
models = {}
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train_scaled, y_train_scaled)
    the_score = clf.score(X_test_scaled, y_test_scaled)
    models[item] = the_score
    print(the_score,'\n')
print(f'The best model is {max(models, key=models.get)}, and it has {max(models.values())} score!!!')


SVR()
0.6159965828718288 

SGDRegressor()
0.641645092847593 

BayesianRidge()
0.6716708849740627 

LassoLars()
-0.023340500652033302 

ARDRegression()
0.6631474191046507 

PassiveAggressiveRegressor()
0.5929442880744036 

TheilSenRegressor(max_subpopulation=10000)
0.6145932562592684 

LinearRegression()
0.6687594935356318 

DecisionTreeRegressor()
0.8574794838471421 

ExtraTreeRegressor()
0.668872351994066 

KNeighborsRegressor()
0.7047836290403378 

RandomForestRegressor()
0.8923766177826213 

The best model is RandomForestRegressor(), and it has 0.8923766177826213 score!!!


In [208]:
#standard

In [13]:
std_scaler = preprocessing.StandardScaler()

new_X = std_scaler.fit_transform(X)
new_X_scaled = pd.DataFrame(new_X)
new_X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


In [14]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(new_X_scaled, y, test_size=0.2, random_state = 42)

In [16]:
models = {}
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train_scaled, y_train_scaled)
    the_score = clf.score(X_test_scaled, y_test_scaled)
    models[item] = the_score
    print(the_score,'\n')

print(f'The best model is {max(models, key=models.get)}, and it has {max(models.values())} score!!!')


SVR()
0.6496425583214102 

SGDRegressor()
0.6594383523834899 

BayesianRidge()
0.66690741040322 

LassoLars()
-0.023340500652033302 

ARDRegression()
0.6631430331242701 

PassiveAggressiveRegressor()
0.554598890475891 

TheilSenRegressor(max_subpopulation=10000)
0.5143386978905067 

LinearRegression()
0.6687594935356318 

DecisionTreeRegressor()
0.7154415850547631 

ExtraTreeRegressor()
0.6600020855487045 

KNeighborsRegressor()
0.7183109257429846 

RandomForestRegressor()
0.8689828865472514 

The best model is RandomForestRegressor(), and it has 0.8689828865472514 score!!!
