# Práctica 26-11-2022
**Nava del Río José Antonio**  
**Ojeda Contreras Braulio Melquisedec**  
**Suárez Pérez Juan Pablo**

In [1]:
# Import the libraries needed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Create a data.frame with the file given.
df = pd.read_csv('./cal_housing.csv', sep=',', engine='python')

In [3]:
# Corpus without tags.
X = df.drop('medianHouseValue', axis=1).values
# Tags.
y = df['medianHouseValue'].values

In [4]:
# Generate Data Test and Data Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [5]:
class validation_set:
    
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

In [6]:
validation_sets = []
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X_train):
    X_train_v, X_test_v = X_train[train_index], X_train[test_index]
    y_train_v, y_test_v = y_train[train_index], y_train[test_index]
    validation_sets.append(validation_set(X_train_v, y_train_v, X_test_v, y_test_v))

In [7]:
regression_type_list = list()
scale_type_list = list()
learning_rate_list = list()
eta0_list = list()
iterations_list = list()
mse_list = list()
r2_list = list()

In [8]:
def linear_regression(X_train, X_test, y_train, y_test, eta0 = 0.001, learning_rate = 'constant', iterations = 10000, standar_scale = False, robust_scale = False):
    if standar_scale:
        X_train = preprocessing.StandardScaler().fit_transform(X_train)
        X_test = preprocessing.StandardScaler().fit_transform(X_test)
    if robust_scale:
        X_train = preprocessing.RobustScaler().fit_transform(X_train)
        X_test = preprocessing.RobustScaler().fit_transform(X_test)
    regr = SGDRegressor(learning_rate = learning_rate, eta0 = eta0, max_iter = iterations)
    regr.fit(X_train, y_train)
    y_test_pred = regr.predict(X_test)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    return [mse, r2]

In [9]:
def polynomial_regression(X_train, X_test, y_train, y_test, degree = 2, eta0 = 0.001, learning_rate = 'constant', iterations = 10000, standar_scale = False, robust_scale = False):
    polynomial_features= PolynomialFeatures(degree = degree)
    X_train = polynomial_features.fit_transform(X_train)
    X_test = polynomial_features.fit_transform(X_test)
    if standar_scale:
        X_train = preprocessing.StandardScaler().fit_transform(X_train)
        X_test = preprocessing.StandardScaler().fit_transform(X_test)
    if robust_scale:
        X_train = preprocessing.RobustScaler().fit_transform(X_train)
        X_test = preprocessing.RobustScaler().fit_transform(X_test)
    regr = SGDRegressor(learning_rate = learning_rate, eta0 = eta0, max_iter = iterations)
    regr.fit(X_train, y_train)
    y_test_pred = regr.predict(X_test)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    return [mse, r2]

In [21]:
def get_best_params(validation_sets, eta0, learnin_rate, iterations, type_regression, scale_type):
    c = 1
    mses = list()
    r2s = list()
    n = len(validation_sets)
    for validation_set in validation_sets:
        if scale_type == 'none':
            if type_regression == 1:
                mse, r2 = linear_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, eta0, learning_rate, iterations)
            else:
                mse, r2 = polynomial_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, type_regression, eta0, learning_rate, iterations)
        elif scale_type == 'std':
            if type_regression == 1:
                mse, r2 = linear_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, eta0, learning_rate, iterations, standar_scale = True)
            else:
                mse, r2 = polynomial_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, type_regression, eta0, learning_rate, iterations, standar_scale = True)
        elif scale_type == 'robust':
            if type_regression == 1:
                mse, r2 = linear_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, eta0, learning_rate, iterations, robust_scale = True)
            else:
                mse, r2 = polynomial_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, type_regression, eta0, learning_rate, iterations, robust_scale = True)
        print('kfold:', c)
        print('\tmse:', mse)
        print('\tr2:', r2)
        c = c + 1
        mses.append(mse)
        r2s.append(r2)
    mses_mean = sum(mses) / n
    r2s_mean = sum(r2s) / n
    return [mses_mean, r2s_mean]

In [11]:
scale_types = ['none', 'std', 'robust']
type_resgressions = [1, 2, 3]
# learning_rates = ['constant', 'optimal', 'invscaling', 'adaptive']
learning_rates = ['constant', 'adaptive']
eta0s = [0.0001, 0.00001]
iterationss = [100000, 200000]

In [12]:
for scale_type in scale_types:
    for type_resgression in type_resgressions:
        for learning_rate in learning_rates:
            for eta0 in eta0s:
                for iterations in iterationss:
                    print(scale_type)
                    print('\t', type_resgression)
                    print('\t\t', learning_rate)
                    print('\t\t\t', eta0)
                    print('\t\t\t\t', iterations)
                    mses_mean, r2s_mean = get_best_params(validation_sets, eta0, learning_rate, iterations, type_resgression, scale_type)
                    scale_type_list.append(scale_type)
                    regression_type_list.append(type_resgression)
                    learning_rate_list.append(learning_rate)
                    eta0_list.append(eta0)
                    iterations_list.append(iterations)
                    mse_list.append(mses_mean)
                    r2_list.append(r2s_mean)
                    print("")
                    print("")

none
	 1
		 constant
			 0.0001
				 100000
kfold: 1
	mse: 2.4059430624143483e+29
	r2: -1.7331055307828695e+19
kfold: 2
	mse: 1.5959685838485258e+30
	r2: -1.1880601027749347e+20
kfold: 3
	mse: 7.98207845258883e+29
	r2: -5.925104750670047e+19
kfold: 4
	mse: 4.501432936408796e+29
	r2: -3.3279854627121676e+19
kfold: 5
	mse: 1.0002934464282536e+30
	r2: -7.323480672691337e+19
kfold: 6
	mse: 2.0544456552995623e+30
	r2: -1.4982830076001514e+20
kfold: 7
	mse: 1.5733597775274287e+30
	r2: -1.2334184570782358e+20
kfold: 8
	mse: 4.6899720985515426e+28
	r2: -3.497584871378543e+18
kfold: 9
	mse: 2.4701092164362472e+29
	r2: -1.9525542571003507e+19
kfold: 10
	mse: 2.2819083354811933e+30
	r2: -1.7172137152230556e+20


none
	 1
		 constant
			 0.0001
				 200000
kfold: 1
	mse: 8.936466965378732e+29
	r2: -6.437326204974419e+19
kfold: 2
	mse: 5.535283436129087e+28
	r2: -4.120538132497521e+18
kfold: 3
	mse: 2.1829109822465804e+29
	r2: -1.6203769867739339e+19
kfold: 4
	mse: 9.223792280364837e+29
	r2: -6.819

kfold: 10
	mse: 5.737893761782941e+44
	r2: -4.3179604154207635e+34


none
	 2
		 adaptive
			 0.0001
				 100000
kfold: 1
	mse: 1.2270788879740749e+44
	r2: -8.839183439863601e+33
kfold: 2
	mse: 6.826820309292708e+39
	r2: -5.081975247110562e+29
kfold: 3
	mse: 3.0282127026184315e+40
	r2: -2.2478452920372593e+30
kfold: 4
	mse: 8.640130292374037e+42
	r2: -6.387794379071589e+32
kfold: 5
	mse: 1.0180105426118482e+43
	r2: -7.453193420425584e+32
kfold: 6
	mse: 2.225932235494292e+41
	r2: -1.623346149803229e+31
kfold: 7
	mse: 2.6159630500242183e+43
	r2: -2.0507560667434802e+33
kfold: 8
	mse: 1.2978759014422371e+42
	r2: -9.679015189052237e+31
kfold: 9
	mse: 5.710185750361764e+42
	r2: -4.513746769379089e+32
kfold: 10
	mse: 7.7093895576235e+43
	r2: -5.801578125861395e+33


none
	 2
		 adaptive
			 0.0001
				 200000
kfold: 1
	mse: 2.153458130005129e+41
	r2: -1.5512296420165557e+31
kfold: 2
	mse: 6.444919191673882e+41
	r2: -4.797683008754605e+31
kfold: 3
	mse: 3.490772486925784e+40
	r2: -2.591203878

kfold: 7
	mse: 1.5029439690110776e+62
	r2: -1.178216742165551e+52
kfold: 8
	mse: 2.8963842689322245e+57
	r2: -2.160002146674779e+47
kfold: 9
	mse: 2.239139078911867e+61
	r2: -1.769978635631703e+51
kfold: 10
	mse: 3.6035272748586926e+58
	r2: -2.711776964635531e+48


std
	 1
		 constant
			 0.0001
				 100000
kfold: 1
	mse: 5097848145.197456
	r2: 0.6327798045782957
kfold: 2
	mse: 4486231207.312395
	r2: 0.6660390208697524
kfold: 3
	mse: 5016463933.119972
	r2: 0.6276273847940694
kfold: 4
	mse: 4608161093.499088
	r2: 0.659310860660392
kfold: 5
	mse: 5451240472.090398
	r2: 0.6008965730796654
kfold: 6
	mse: 4824980494.014352
	r2: 0.6481198581458809
kfold: 7
	mse: 4619646950.0841255
	r2: 0.637847751365971
kfold: 8
	mse: 4626595607.397095
	r2: 0.6549678662818454
kfold: 9
	mse: 5106669757.545335
	r2: 0.5963316233811515
kfold: 10
	mse: 4400934878.059574
	r2: 0.6688146664395367


std
	 1
		 constant
			 0.0001
				 200000
kfold: 1
	mse: 5099018474.854782
	r2: 0.632695500638039
kfold: 2
	mse: 44805

kfold: 10
	mse: 3890129427.4063873
	r2: 0.7072545157548413


std
	 2
		 adaptive
			 0.0001
				 200000
kfold: 1
	mse: 4548476915.474729
	r2: 0.6723534059473157
kfold: 2
	mse: 4178412999.8802543
	r2: 0.6889534149786865
kfold: 3
	mse: 4629241854.840863
	r2: 0.6563709180630486
kfold: 4
	mse: 4280555147.0196567
	r2: 0.6835313220730953
kfold: 5
	mse: 5142454597.922449
	r2: 0.6235038128805838
kfold: 6
	mse: 4327408253.853321
	r2: 0.6844072153005344
kfold: 7
	mse: 4181464857.075613
	r2: 0.6721985647525455
kfold: 8
	mse: 4333545002.3407345
	r2: 0.6768223537128049
kfold: 9
	mse: 4589692554.921667
	r2: 0.6371972673409365
kfold: 10
	mse: 3880205726.7204204
	r2: 0.7080013080189625


std
	 2
		 adaptive
			 1e-05
				 100000
kfold: 1
	mse: 4517344721.86397
	r2: 0.6745959933873554
kfold: 2
	mse: 4178091991.305679
	r2: 0.6889773112811551
kfold: 3
	mse: 4607968604.794089
	r2: 0.6579500335235517
kfold: 4
	mse: 4258988141.3719654
	r2: 0.6851258072577842
kfold: 5
	mse: 5143114500.373922
	r2: 0.623455499

kfold: 1
	mse: 5145230798.047893
	r2: 0.6293666258127164
kfold: 2
	mse: 4620364417.645896
	r2: 0.6560539674503377
kfold: 3
	mse: 5041373415.8752575
	r2: 0.6257783514190316
kfold: 4
	mse: 4631520919.069672
	r2: 0.6575838292681968
kfold: 5
	mse: 5513476481.578637
	r2: 0.5963400680434678
kfold: 6
	mse: 4894810046.212113
	r2: 0.6430272711885988
kfold: 7
	mse: 4620609435.221405
	r2: 0.6377722983799509
kfold: 8
	mse: 4635095689.908009
	r2: 0.6543339657090718
kfold: 9
	mse: 5442266690.845088
	r2: 0.5698035971536453
kfold: 10
	mse: 4450727832.280802
	r2: 0.6650675771029209


robust
	 1
		 constant
			 1e-05
				 200000
kfold: 1
	mse: 5155267020.740279
	r2: 0.6286436729994072
kfold: 2
	mse: 4624316704.398634
	r2: 0.6557597540019544
kfold: 3
	mse: 5038756209.211239
	r2: 0.6259726269292332
kfold: 4
	mse: 4633184666.638806
	r2: 0.6574608255979895
kfold: 5
	mse: 5522637445.497378
	r2: 0.5956693634373178
kfold: 6
	mse: 4896010199.361614
	r2: 0.6429397454336208
kfold: 7
	mse: 4621643906.994319
	r2: 0

kfold: 2
	mse: 4568852454.3167305
	r2: 0.659888586067918
kfold: 3
	mse: 4877725772.421532
	r2: 0.6379259322204849
kfold: 4
	mse: 4607101218.158467
	r2: 0.6593892190359516
kfold: 5
	mse: 5313755535.975396
	r2: 0.6109623020516948
kfold: 6
	mse: 4518574469.584421
	r2: 0.6704656884502966
kfold: 7
	mse: 4339787567.98831
	r2: 0.6597870262981136
kfold: 8
	mse: 4460940396.135839
	r2: 0.6673217385138641
kfold: 9
	mse: 8482738175.262161
	r2: 0.32946258304027176
kfold: 10
	mse: 4151245113.245197
	r2: 0.6876046713675678


robust
	 3
		 constant
			 0.0001
				 100000
kfold: 1
	mse: 1.9449085696680035e+24
	r2: -140100231448373.72
kfold: 2
	mse: 7.664210592074613e+24
	r2: -570533963879899.6
kfold: 3
	mse: 8.254312210040481e+22
	r2: -6127184138782.101
kfold: 4
	mse: 4.297710318385489e+27
	r2: -3.177369887453121e+17
kfold: 5
	mse: 5.996024059572915e+23
	r2: -43898884342458.88
kfold: 6
	mse: 2.8950110592022055e+24
	r2: -211129745176184.5
kfold: 7
	mse: 4.060183513253872e+26
	r2: -3.1829371488332572e+16

In [15]:
data = {'scale type': scale_type_list,
        'regression type': regression_type_list,
        'learning rate': learning_rate_list,
        'eta0': eta0_list,
        'iterations': iterations_list,
        'mse':mse_list,
        'r2':r2_list}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,scale type,regression type,learning rate,eta0,iterations,mse,r2
0,none,1,constant,0.00010,100000,1.028883e+30,1.028883e+30
1,none,1,constant,0.00010,200000,5.278205e+30,5.278205e+30
2,none,1,constant,0.00001,100000,1.752569e+28,1.752569e+28
3,none,1,constant,0.00001,200000,7.359596e+28,7.359596e+28
4,none,1,adaptive,0.00010,100000,6.010441e+26,6.010441e+26
...,...,...,...,...,...,...,...
67,robust,3,constant,0.00001,200000,2.187125e+24,2.187125e+24
68,robust,3,adaptive,0.00010,100000,3.550587e+22,3.550587e+22
69,robust,3,adaptive,0.00010,200000,2.464878e+22,2.464878e+22
70,robust,3,adaptive,0.00001,100000,8.185364e+21,8.185364e+21


In [16]:
df1.to_csv('results.csv')

In [22]:
best = df1.sort_values('mse')

In [23]:
best

Unnamed: 0,scale type,regression type,learning rate,eta0,iterations,mse,r2
42,std,3,constant,0.00001,100000,4.256180e+09,4.256180e+09
43,std,3,constant,0.00001,200000,4.259497e+09,4.259497e+09
47,std,3,adaptive,0.00001,200000,4.283379e+09,4.283379e+09
46,std,3,adaptive,0.00001,100000,4.284115e+09,4.284115e+09
38,std,2,adaptive,0.00001,100000,4.398732e+09,4.398732e+09
...,...,...,...,...,...,...,...
22,none,3,adaptive,0.00001,100000,6.404439e+61,6.404439e+61
19,none,3,constant,0.00001,200000,1.805351e+63,1.805351e+63
18,none,3,constant,0.00001,100000,9.149612e+63,9.149612e+63
16,none,3,constant,0.00010,100000,9.849710e+65,9.849710e+65


In [32]:
mse, r2 = polynomial_regression(X_train, X_test, y_train, y_test, degree = 3, eta0 = 0.00001, learning_rate = 'constant', iterations = 100000, standar_scale = True)

In [33]:
print('mse with X_train and X_test:', mse)
print('r2 with X_train and X_test:', r2)

mse with X_train and X_test: 4291802343.956756
r2 with X_train and X_test: 0.6708626704544797
