# Práctica 26-11-2022
**Nava del Río José Antonio**  
**Ojeda Contreras Braulio Melquisedec**  
**Suárez Pérez Juan Pablo**

In [1]:
# Import the libraries needed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Create a data.frame with the file given.
df = pd.read_csv('./cal_housing.csv', sep=',', engine='python')

In [3]:
# Corpus without tags.
X = df.drop('medianHouseValue', axis=1).values
# Tags.
y = df['medianHouseValue'].values

In [4]:
# Generate Data Test and Data Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [5]:
class validation_set:
    
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

In [6]:
validation_sets = []
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X_train):
    X_train_v, X_test_v = X_train[train_index], X_train[test_index]
    y_train_v, y_test_v = y_train[train_index], y_train[test_index]
    validation_sets.append(validation_set(X_train_v, y_train_v, X_test_v, y_test_v))

In [7]:
regression_type_list = list()
scale_type_list = list()
learning_rate_list = list()
eta0_list = list()
iterations_list = list()
mse_list = list()
r2_list = list()

In [8]:
def linear_regression(X_train, X_test, y_train, y_test, eta0 = 0.001, learning_rate = 'constant', iterations = 10000, standar_scale = False, robust_scale = False):
    if standar_scale:
        X_train = preprocessing.StandardScaler().fit_transform(X_train)
        X_test = preprocessing.StandardScaler().fit_transform(X_test)
    if robust_scale:
        X_train = preprocessing.RobustScaler().fit_transform(X_train)
        X_test = preprocessing.RobustScaler().fit_transform(X_test)
    regr = SGDRegressor(learning_rate = learning_rate, eta0 = eta0, max_iter = iterations)
    regr.fit(X_train, y_train)
    y_test_pred = regr.predict(X_test)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    return [mse, r2]

In [9]:
def polynomial_regression(X_train, X_test, y_train, y_test, degree = 2, eta0 = 0.001, learning_rate = 'constant', iterations = 10000, standar_scale = False, robust_scale = False):
    polynomial_features= PolynomialFeatures(degree = degree)
    X_train = polynomial_features.fit_transform(X_train)
    X_test = polynomial_features.fit_transform(X_test)
    if standar_scale:
        X_train = preprocessing.StandardScaler().fit_transform(X_train)
        X_test = preprocessing.StandardScaler().fit_transform(X_test)
    if robust_scale:
        X_train = preprocessing.RobustScaler().fit_transform(X_train)
        X_test = preprocessing.RobustScaler().fit_transform(X_test)
    regr = SGDRegressor(learning_rate = learning_rate, eta0 = eta0, max_iter = iterations)
    regr.fit(X_train, y_train)
    y_test_pred = regr.predict(X_test)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    return [mse, r2]

In [10]:
def get_best_params(validation_sets, eta0, learnin_rate, iterations, type_regression, scale_type):
    c = 1
    mses = list()
    r2s = list()
    n = len(validation_sets)
    for validation_set in validation_sets:
        if scale_type == 'none':
            if type_regression == 1:
                mse, r2 = linear_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, eta0, learning_rate, iterations)
            else:
                mse, r2 = polynomial_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, type_regression, eta0, learning_rate, iterations)
        elif scale_type == 'std':
            if type_regression == 1:
                mse, r2 = linear_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, eta0, learning_rate, iterations, standar_scale = True)
            else:
                mse, r2 = polynomial_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, type_regression, eta0, learning_rate, iterations, standar_scale = True)
        elif scale_type == 'robust':
            if type_regression == 1:
                mse, r2 = linear_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, eta0, learning_rate, iterations, robust_scale = True)
            else:
                mse, r2 = polynomial_regression(validation_set.X_train, validation_set.X_test, validation_set.y_train, validation_set.y_test, type_regression, eta0, learning_rate, iterations, robust_scale = True)
        print('kfold:', c)
        print('\tmse:', mse)
        print('\tr2:', r2)
        c = c + 1
        mses.append(mse)
        r2s.append(r2)
    mses_mean = sum(mses) / n
    r2s_mean = sum(r2s) / n
    return [mses_mean, r2s_mean]

In [11]:
scale_types = ['none', 'std', 'robust']
type_resgressions = [1, 2, 3]
# learning_rates = ['constant', 'optimal', 'invscaling', 'adaptive']
learning_rates = ['constant']
eta0s = [0.0001, 0.00001, 0.000001]
iterationss = [200000, 300000]

In [12]:
for scale_type in scale_types:
    for type_resgression in type_resgressions:
        for learning_rate in learning_rates:
            for eta0 in eta0s:
                for iterations in iterationss:
                    print(scale_type)
                    print('\t', type_resgression)
                    print('\t\t', learning_rate)
                    print('\t\t\t', eta0)
                    print('\t\t\t\t', iterations)
                    mses_mean, r2s_mean = get_best_params(validation_sets, eta0, learning_rate, iterations, type_resgression, scale_type)
                    scale_type_list.append(scale_type)
                    regression_type_list.append(type_resgression)
                    learning_rate_list.append(learning_rate)
                    eta0_list.append(eta0)
                    iterations_list.append(iterations)
                    mse_list.append(mses_mean)
                    r2_list.append(r2s_mean)
                    print("")
                    print("")

none
	 1
		 constant
			 0.0001
				 200000
kfold: 1
	mse: 1.26448511468954e+29
	r2: -9.10863677572615e+18
kfold: 2
	mse: 6.853580191300454e+31
	r2: -5.101895656879346e+21
kfold: 3
	mse: 2.87028523522579e+28
	r2: -2.1306155763851988e+18
kfold: 4
	mse: 3.57755621244611e+30
	r2: -2.6449477833506954e+20
kfold: 5
	mse: 1.3009611736625415e+30
	r2: -9.524769001795943e+19
kfold: 6
	mse: 4.4409790992183024e+29
	r2: -3.2387537262434914e+19
kfold: 7
	mse: 1.81298189888121e+29
	r2: -1.4212676390793576e+19
kfold: 8
	mse: 7.16354501556427e+29
	r2: -5.3422720104485945e+19
kfold: 9
	mse: 1.3192492383487133e+30
	r2: -1.0428306972719879e+20
kfold: 10
	mse: 4.177853318394394e+29
	r2: -3.1439768666361205e+19


none
	 1
		 constant
			 0.0001
				 300000
kfold: 1
	mse: 5.017728237295419e+28
	r2: -3.6144880965285335e+18
kfold: 2
	mse: 4.9115074438123434e+29
	r2: -3.6561910413077205e+19
kfold: 3
	mse: 3.835712132833516e+28
	r2: -2.847252919831119e+18
kfold: 4
	mse: 1.2607808309990276e+27
	r2: -9.32116580765

kfold: 9
	mse: 1.1337658564487115e+43
	r2: -8.962111208822707e+32
kfold: 10
	mse: 6.256665722457594e+40
	r2: -4.708353978602965e+30


none
	 3
		 constant
			 0.0001
				 200000
kfold: 1
	mse: 6.728412760133438e+64
	r2: -4.84676863311782e+54
kfold: 2
	mse: 2.9563793925381012e+63
	r2: -2.200767885086311e+53
kfold: 3
	mse: 1.0431650654723165e+64
	r2: -7.743424625397428e+53
kfold: 4
	mse: 5.127570290349755e+63
	r2: -3.790899392790395e+53
kfold: 5
	mse: 9.17781127826484e+62
	r2: -6.719380573168839e+52
kfold: 6
	mse: 1.9187865786394167e+63
	r2: -1.399348441547108e+53
kfold: 7
	mse: 4.9121284050890655e+64
	r2: -3.850810173815765e+54
kfold: 8
	mse: 1.8177527347109442e+65
	r2: -1.3556038993911088e+55
kfold: 9
	mse: 4.270783291911987e+66
	r2: -3.3759382145081056e+56
kfold: 10
	mse: 4.13251885863494e+62
	r2: -3.109861142152058e+52


none
	 3
		 constant
			 0.0001
				 300000
kfold: 1
	mse: 3.4142404454235865e+63
	r2: -2.459426032072544e+53
kfold: 2
	mse: 2.308785573898438e+62
	r2: -1.7186904892

kfold: 4
	mse: 4278867197.8159757
	r2: 0.6836561150110554
kfold: 5
	mse: 5152474952.928577
	r2: 0.6227701893975657
kfold: 6
	mse: 4342347833.408524
	r2: 0.6833176893678085
kfold: 7
	mse: 4171091029.1578307
	r2: 0.6730118098225608
kfold: 8
	mse: 4493117224.049153
	r2: 0.6649221253785681
kfold: 9
	mse: 4891755014.760931
	r2: 0.6133200501739853
kfold: 10
	mse: 3903875377.887666
	r2: 0.7062200861799017


std
	 2
		 constant
			 0.0001
				 300000
kfold: 1
	mse: 4562054090.455211
	r2: 0.6713753829251305
kfold: 2
	mse: 4223848417.6615086
	r2: 0.685571142441175
kfold: 3
	mse: 4687606652.016521
	r2: 0.6520384933810381
kfold: 4
	mse: 4319856021.245955
	r2: 0.6806257420068209
kfold: 5
	mse: 5225556652.790865
	r2: 0.6174196353338559
kfold: 6
	mse: 4340259645.625495
	r2: 0.6834699784421834
kfold: 7
	mse: 4156611236.0710397
	r2: 0.6741469376110689
kfold: 8
	mse: 4330169662.991474
	r2: 0.6770740723924829
kfold: 9
	mse: 4659130159.192905
	r2: 0.6317084132886341
kfold: 10
	mse: 3904421701.0743556
	r2:

kfold: 6
	mse: 4893216572.97392
	r2: 0.6431434813141716
kfold: 7
	mse: 4620248377.1308975
	r2: 0.6378006031402015
kfold: 8
	mse: 4635479244.0187
	r2: 0.6543053618490333
kfold: 9
	mse: 5435343039.105326
	r2: 0.5703508930217469
kfold: 10
	mse: 4438969890.416883
	r2: 0.6659524022607752


robust
	 1
		 constant
			 1e-05
				 200000
kfold: 1
	mse: 5146795123.577838
	r2: 0.6292539406344907
kfold: 2
	mse: 4626636986.284381
	r2: 0.6555870291523838
kfold: 3
	mse: 5039680168.219961
	r2: 0.6259040413603927
kfold: 4
	mse: 4621934595.0040655
	r2: 0.6582925624112176
kfold: 5
	mse: 5513668542.396482
	r2: 0.596326006632864
kfold: 6
	mse: 4894941426.9485235
	r2: 0.6430176897463065
kfold: 7
	mse: 4622403204.642378
	r2: 0.6376316777575624
kfold: 8
	mse: 4633917331.96367
	r2: 0.6544218427120172
kfold: 9
	mse: 5438153553.989827
	r2: 0.5701287294523847
kfold: 10
	mse: 4440502418.159876
	r2: 0.6658370743302748


robust
	 1
		 constant
			 1e-05
				 300000
kfold: 1
	mse: 5151337056.158055
	r2: 0.62892676545

kfold: 5
	mse: 2.7023726915615295e+21
	r2: -197849683486.64673
kfold: 6
	mse: 3.319821828150422e+23
	r2: -24211069397464.15
kfold: 7
	mse: 5.7001224355312675e+25
	r2: -4468549609574243.0
kfold: 8
	mse: 2.913670897548992e+25
	r2: -2172893790688143.8
kfold: 9
	mse: 1.1724781343656735e+25
	r2: -926812125301771.0
kfold: 10
	mse: 7.434830567419563e+23
	r2: -55949631377458.45


robust
	 3
		 constant
			 1e-05
				 300000
kfold: 1
	mse: 6.993297471918366e+22
	r2: -5037576622792.855
kfold: 2
	mse: 9.93690948084307e+22
	r2: -7397166722797.174
kfold: 3
	mse: 1.3235998088667305e+24
	r2: -98250945065048.84
kfold: 4
	mse: 1.878525806572309e+26
	r2: -1.388825883650696e+16
kfold: 5
	mse: 1.6492156092854248e+23
	r2: -12074455433881.496
kfold: 6
	mse: 2.0350394760459945e+23
	r2: -14841303097454.938
kfold: 7
	mse: 8.315582259814733e+23
	r2: -65189111779164.414
kfold: 8
	mse: 6.255107856972454e+23
	r2: -46647976042638.305
kfold: 9
	mse: 4.2320628564475317e+24
	r2: -334533076176882.8
kfold: 10
	mse: 6.40

In [13]:
data = {'scale type': scale_type_list,
        'regression type': regression_type_list,
        'learning rate': learning_rate_list,
        'eta0': eta0_list,
        'iterations': iterations_list,
        'mse':mse_list,
        'r2':r2_list}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,scale type,regression type,learning rate,eta0,iterations,mse,r2
0,none,1,constant,0.0001,200000,7.664826e+30,-5.708623e+20
1,none,1,constant,0.0001,300000,6.706099999999999e+29,-4.985968e+19
2,none,1,constant,1e-05,200000,4.059777e+27,-2.997106e+17
3,none,1,constant,1e-05,300000,2.573872e+27,-1.954111e+17
4,none,1,constant,1e-06,200000,1.420439e+25,-1062243000000000.0
5,none,1,constant,1e-06,300000,2.428559e+26,-1.88102e+16
6,none,2,constant,0.0001,200000,1.215478e+48,-9.098222e+37
7,none,2,constant,0.0001,300000,1.548328e+48,-1.194883e+38
8,none,2,constant,1e-05,200000,4.383722e+44,-3.270517e+34
9,none,2,constant,1e-05,300000,3.410441e+46,-2.626169e+36


In [17]:
df1.to_csv('new_results.csv')

In [18]:
best = df1.sort_values('mse')

In [19]:
best

Unnamed: 0,scale type,regression type,learning rate,eta0,iterations,mse,r2
32,std,3,constant,1e-05,200000,4252283000.0,0.6819538
34,std,3,constant,1e-06,200000,4253181000.0,0.6818854
35,std,3,constant,1e-06,300000,4257287000.0,0.6815798
33,std,3,constant,1e-05,300000,4263377000.0,0.6810497
29,std,2,constant,1e-06,300000,4412961000.0,0.6700497
28,std,2,constant,1e-06,200000,4412972000.0,0.6700498
26,std,2,constant,1e-05,200000,4433456000.0,0.6685156
27,std,2,constant,1e-05,300000,4438348000.0,0.6681354
25,std,2,constant,0.0001,300000,4440951000.0,0.6679609
24,std,2,constant,0.0001,200000,4472362000.0,0.6654948


In [33]:
mse, r2 = polynomial_regression(X_train, X_test, y_train, y_test, degree = 3, eta0 = 0.00001, learning_rate = 'constant', iterations = 200000, standar_scale = True)

In [34]:
print('mse with X_train and X_test:', mse)
print('r2 with X_train and X_test:', r2)

mse with X_train and X_test: 4286305188.649957
r2 with X_train and X_test: 0.6712842460240827
