# Regresión desde el aprendizaje de máquinas

#### miércoles 03 de junio 2020

----

## Problema 1: Prepare el ambiente de trabajo

In [3]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from seaborn import load_dataset
df=pd.read_csv('boston.csv', index_col=0)
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


## Problema 2: División de la muestra

In [4]:
x_mat=df.drop(['medv'],axis=1)
y_vec=df['medv']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_mat, y_vec, test_size=.33, random_state=1024)

## Problema 3: Generación de modelos

In [6]:
mod_1=linear_model.LinearRegression(fit_intercept=True,normalize=True).fit(x_train, y_train) 
mod_2=linear_model.LinearRegression(fit_intercept=False,normalize=False).fit(x_train, y_train)

In [7]:
y_hat_1=mod_1.predict(x_test)
y_hat_2=mod_2.predict(x_test)

## Problema 4: Obtención de métricas

In [15]:
def report_scores(y_hat,y_test):
    mse = mean_squared_error(y_test,y_hat)
    r2 = r2_score(y_test,y_hat)
    
    print(f'Mean Square Error:\t{mse}\nR-cuadrado:\t\t{r2}\n')

In [16]:
report_scores(y_hat_1, y_test)
report_scores(y_hat_2, y_test)

Mean Square Error:	20.293293008919676
R-cuadrado:		0.7296200386561218

Mean Square Error:	22.754332649064672
R-cuadrado:		0.696830101484482



Se selecciona el primer modelo, ya que los datos presentan un menor error (MSE de 20.29 vs 22.74) y además la bondad de ajuste, explica más la varianza (0,72 vs. 0.69).

## Problema 5: Refactorización del modelo

In [17]:
def fetch_features(x_mat,y_vec='medv'):
    columns = df.columns
    Atributo, Pearson, Pearons_abs = [],[],[]

    for columna in columns:
        if columna != str(y_vec):
            Atributo.append(columna)
            Pearson.append(df[columna].corr(df[y_vec]))
            Pearons_abs.append(abs(df[columna].corr(df[y_vec])))

    features = pd.DataFrame({'attribute': Atributo,'corr':Pearson,'abs_corr':Pearons_abs})
    features = features.set_index('attribute')
    return(features.sort_values(by=['abs_corr'], ascending=False))

In [18]:
fetch_features(df)

Unnamed: 0_level_0,corr,abs_corr
attribute,Unnamed: 1_level_1,Unnamed: 2_level_1
lstat,-0.737663,0.737663
rm,0.69536,0.69536
ptratio,-0.507787,0.507787
indus,-0.483725,0.483725
tax,-0.468536,0.468536
nox,-0.427321,0.427321
crim,-0.388305,0.388305
rad,-0.381626,0.381626
age,-0.376955,0.376955
zn,0.360445,0.360445


In [19]:
df_corr = fetch_features(df)

In [20]:
df_absolut = np.absolute(df_corr['corr'])

In [21]:
df_absolut.head(6)

attribute
lstat      0.737663
rm         0.695360
ptratio    0.507787
indus      0.483725
tax        0.468536
nox        0.427321
Name: corr, dtype: float64

Los 6 atributos con mayor correlación al vetor medv son:

lstat      0.737663
rm         0.695360
ptratio    0.507787
indus      0.483725
tax        0.468536
nox        0.427321



## Problema 6: Refactorización del modelo predictivo

In [22]:
x_mat_2=df.loc[:,['lstat','rm','ptratio','indus','tax','nox']]

In [23]:
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_mat_2, y_vec, test_size=.33, random_state=1024)

In [24]:
mod_3=linear_model.LinearRegression(fit_intercept=True,normalize=True).fit(x_train_2, y_train_2)

In [25]:
y_hat_3=mod_3.predict(x_test_2)

In [26]:
report_scores(y_hat_3, y_test)

Mean Square Error:	25.32444625702328
R-cuadrado:		0.6625869050912803



## Problema 7: Predicción de casos

In [39]:
worst_neighbor = np.array([37.9, 12.6, 3.5, 27.7, 187, 0.87]).reshape(1,-1)
best_neighbor = np.array([1.73, 22, 8.7, 0.46, 711, 0.38]).reshape(1,-1)

In [40]:
df_worst=pd.DataFrame(data=worst_neighbor, columns=['lstat','rm','ptratio','indus','tax','nox'])
df_best=pd.DataFrame(data=best_neighbor, columns=['lstat','rm','ptratio','indus','tax','nox'])

In [41]:
y_hat_4=mod_3.predict(df_worst)
y_hat_5=mod_3.predict(df_best)

print(f'Valor esperado de worst_neighbor: {y_hat_4[0].round(3)}')
print(f'Valor esperado de best_neighbor: {y_hat_5[0].round(3)}')

Valor esperado de worst_neighbor: 52.374
Valor esperado de best_neighbor: 107.578
