### <font color="#004D7F"> Housing: Obtener datos</font>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the housing CSV (path is relative to the notebook's working directory) into a DataFrame.
housing = pd.read_csv(
    '../1-ScikitLearn/USA_Housing.csv',
)

In [None]:
# Quick structural overview of the raw housing data.
# A notebook cell only renders its LAST expression, so the preview rows are
# printed explicitly (the same way other cells of this notebook print heads);
# previously the head(5) result was computed and silently discarded.
print(housing.shape)
print(housing.head(5))
housing.info()
# Summary statistics are the final expression, so they are rendered as the cell output.
housing.describe()

In [None]:
# Distribution of the target column 'Price'.
# The pairwise analysis of every feature is kept disabled:
# sns.pairplot(housing)
# NOTE(review): distplot is believed to be deprecated in recent seaborn releases — confirm against the installed version.
sns.distplot(housing['Price'])

In [None]:
# Joint KDE plot of 'Price' (x axis) against 'Avg. Area Income' (y axis).
# The trailing semicolon suppresses the repr of the returned object.
sns.jointplot(data=housing, x='Price', y='Avg. Area Income', kind='kde');

In [None]:
# Annotated correlation heatmap of the housing columns.
# The numeric columns are selected explicitly so the result does not depend on
# how the installed pandas version treats non-numeric columns inside corr().
# NOTE(review): presumably the CSV also carries a text column — confirm.
housing_numeric = housing.select_dtypes(include='number')
sns.heatmap(housing_numeric.corr(), annot=True)

### <font color="#004D7F"> Housing: Entrenar el modelo</font>

Para ello dividiremos el conjunto en datos de prueba (`test`) y datos de entrenamiento (`train`); el modelo se ajusta después con el método `fit`.

El eje X contendrá todas las columnas características y el eje Y contendrá el target de salida.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Feature matrix: the five explanatory columns, in this order.
feature_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = housing[feature_columns]

# Target: the column the model has to predict.
Y = housing['Price']

In [None]:
# Create the training and test partitions (30% of the rows are held out for testing).
# Y_test is what the model is evaluated against later on.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Preview the first three rows of each partition, each under its own heading.
for heading, partition in (
    ("datos de entrenamiento caracteristicas, eje X\n", X_train),
    ("datos de test caracteristicas, eje X\n", X_test),
    ("datos de entrenamiento target, eje Y\n", Y_train),
    ("\ndatos de test, eje Y\n", Y_test),
):
    print(heading, partition.head(3))

In [None]:
# Train the model: create a LinearRegression instance and call its fit method
# with the training features and the training target.
lrm = LinearRegression()
lrm.fit(X_train, Y_train)

### <font color="#004D7F"> Housing: Evaluar el modelo</font>

Usamos el metodo `predict`

In [None]:
from sklearn import metrics
# Pass the test features to predict(), which generates the predictions.
housing_pred = lrm.predict(X_test)

Los datos predichos deben compararse con el precio real para saber si el modelo es una buena aproximación o si, por el contrario, hemos de reajustarlo.

* Calcular la diferencia entre los precios `Y_test` menos `housing_pred`

Hay que tener en cuenta que `housing_pred` es un `ndarray` de NumPy y `Y_test` es una `Series` de pandas.

In [None]:
# Show the first three predictions made for the test features, then the dtype of the predictions.
first_predictions = housing_pred[:3]
print("Predicción sobre los datos de test , eje X\n", first_predictions)
housing_pred.dtype

In [None]:
# The error is the difference between the prices: Y_test - housing_pred.
# Here only the first three actual test prices and the dtype of Y_test are shown.
print("\n datos de test, eje Y \n", Y_test.head(3))
Y_test.dtype

In [None]:
# Gráfico para ver las diferencias de tipo scatter
# Scatter plot of actual test prices (x) against predicted prices (y).
# Axis labels and a title are set so the figure can be read on its own.
plt.scatter(Y_test, housing_pred)
plt.xlabel('Precio real (Y_test)')
plt.ylabel('Precio predicho (housing_pred)')
plt.title('Precio real vs. precio predicho')
plt.show()

In [None]:
# Joint KDE plot of the actual test prices (x) against the predicted prices (y).
# The trailing semicolon suppresses the repr of the returned object.
sns.jointplot(x=Y_test, y=housing_pred, kind='kde');

In [None]:
# Distribution of the residuals (actual price minus predicted price).
# The error comes very close to a normal distribution.
housing_residuals = Y_test - housing_pred
sns.distplot(housing_residuals)

In [None]:
# Metrics used to evaluate the model:
# 1. MAE  -> mean absolute error: the mean of the absolute values of the errors
# 2. MSE  -> mean squared error: the mean of the squared errors
# 3. RMSE -> root mean squared error: the square root of the MSE
# The lower the value, the better the model.
# The MAE is computed once and reused; previously it was computed, discarded
# (it was not the last expression of the cell) and computed again for the print.
housing_mae = metrics.mean_absolute_error(Y_test, housing_pred)
print("Mean absolute error: %.1f" % housing_mae)

In [None]:
# MSE of the predictions against the actual test prices (rendered as the cell output).
metrics.mean_squared_error(Y_test, housing_pred) 

In [None]:
# RMSE: square root of the MSE computed on the test partition.
np.sqrt(metrics.mean_squared_error(Y_test, housing_pred))

In [None]:
# Compute the three evaluation scores first, then report them in a fixed order.
report_mae = metrics.mean_absolute_error(Y_test, housing_pred)
report_r2 = metrics.r2_score(Y_test, housing_pred)
report_mse = metrics.mean_squared_error(Y_test, housing_pred)

print("Mean absolute error: %.1f" % report_mae)
print('R2 Score: %.2f' % report_r2)
print('Mean squared error: %.2f' % report_mse)

In [None]:
# First five actual prices of the test partition (positional slice).
print(Y_test.iloc[:5])

In [None]:
# Compare actual and predicted prices side by side.
# Y_test is a Series (it comes from housing['Price']), so Y_test['Pred'] = ...
# does not add a column: it writes a single element labelled 'Pred' into the
# test target and corrupts it. A separate DataFrame is built instead, leaving
# Y_test untouched for any later metric computation.
housing_comparison = pd.DataFrame(
    {'Price': Y_test, 'Pred': housing_pred},
    index=Y_test.index,  # predictions are positional, so reuse the test rows' index
)
housing_comparison['Difference'] = (
    housing_comparison['Pred'] - housing_comparison['Price']
).abs()
housing_comparison.head()

## <font color="#004D7F">Ejercicio Boston</font>

La base de datos Boston es un diccionario que contiene gran cantidad de datos relacionados con la ciudad.

In [None]:
# Imports and plotting configuration for the Boston exercise.
# NOTE(review): numpy, pandas, seaborn, pyplot, LinearRegression and train_test_split
# were already imported earlier in this notebook (some under other aliases: np, sns).
import numpy
import pandas as pd
import sklearn
import seaborn
import matplotlib.pyplot as plt
# Notebook magic: show matplotlib figures inline in the cell output.
%matplotlib inline
from pylab import rcParams
# Default figure size used by the plots that follow.
rcParams['figure.figsize'] = 10, 8

from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
# NOTE(review): the next line repeats the sklearn.linear_model import made two lines above.
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.cross_validation import KFold
# NOTE(review): load_boston and scipy.interpolate.spline are believed to be removed from
# recent scikit-learn / SciPy releases — confirm against the installed versions.
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from scipy.interpolate import spline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge

In [None]:
# Load the Boston dataset, print the shape of its data array and list the keys it exposes.
boston = load_boston()
print(boston.data.shape)
boston.keys()

In [None]:
# Names of the attributes (features) of the data, rendered as the cell output.

boston.feature_names

In [None]:
# Alternative single-frame construction, kept disabled:
#bostondf = pd.DataFrame(boston.data, columns=boston.feature_names)

#bostondf.head(4)
#bostondf.shape

# Wrap the Boston arrays in two DataFrames: one for the features, one for the target.
boston_data = pd.DataFrame(boston['data'], columns=boston.feature_names)
# Column labels are passed as a list: a set literal is unordered and is not a
# valid way to label columns (newer pandas versions reject it outright).
boston_target = pd.DataFrame(boston['target'], columns=['Target'])
print(boston_data.head(2))
print(boston_target.head(2))

In [None]:
# Create an estimator with the default parameters.
lr = LinearRegression()

# Train it with the input features and the output target.
lr.fit(boston_data,boston_target)

# Predict for the Boston features; note these are the same rows used for
# fitting, so the scores below are in-sample, not a held-out evaluation.
boston_prediction = lr.predict(boston_data)

# Compare the predictions with the target through the mean squared error.
from sklearn.metrics import mean_squared_error
print('Error cuadrático medio:')
print(mean_squared_error(boston_target, boston_prediction))

# The score is also available directly from the estimator's score method.
print('Coeficiente R2 de la función score:')
print(lr.score(boston_data,boston_target))

In [None]:
# ¡Atención! Solo necesario si se ejecuta el renglon que construye bostondf
# First, check for missing information.
# bostondf.isnull().sum()

# Separar las variables dependent and independent variables
# boston_data_X = bostondf[bostondf.columns[0:13]]
# boston_data_Y = bostondf[bostondf.columns[13:14]]

In [None]:
# Check the correlation between attributes and visualise it as a heatmap.
# The matrix is computed once and reused; previously it was computed twice and
# the first result was discarded (it was not the last expression of the cell).
boston_corr = boston_data.corr()
seaborn.heatmap(boston_corr)

In [None]:
# Some features are highly correlated; they have to be removed before applying regression techniques.
# Correlation matrix of the features, in absolute value.
abs_corr_matrix = boston_data.corr().abs()

In [None]:
# Keep only the strict upper triangle of the matrix (k=1 leaves the diagonal out),
# so every pair of features is inspected exactly once.
# The builtin bool is used as dtype: the numpy.bool alias is deprecated and
# missing from newer NumPy releases, while bool works on every version.
upper_mask = numpy.triu(numpy.ones(abs_corr_matrix.shape), k=1).astype(bool)
up_tri = abs_corr_matrix.where(upper_mask)

# Features whose absolute correlation with some other feature is above 0.75.
correlated_features = [
    column for column in up_tri.columns if (up_tri[column] > 0.75).any()
]

# Show which features were flagged.
print(correlated_features)

In [None]:
# Remove the flagged (highly correlated) columns from the feature frame.

boston_data = boston_data.drop(columns=correlated_features)

In [None]:
# Divide the data into a training set (80%) and a test set (20%).
# random_state is fixed, as in the housing split earlier in this notebook, so
# the partition (and every score derived from it) is the same on each run.
X_train, X_test, Y_train, Y_test = train_test_split(
    boston_data, boston_target, test_size=0.20, random_state=42
)

# Multiple linear regression estimator.
linear_regression = LinearRegression()

# Fit the model on the training partition.
linear_regression.fit(X_train, Y_train)

# Predict the target for the test partition.
Y_pred = linear_regression.predict(X_test)


In [None]:
# Resulting errors: compute the three scores on the test partition, then report them.
boston_mse = mean_squared_error(Y_test,Y_pred)
boston_r2 = r2_score(Y_test,Y_pred)
boston_mae = mean_absolute_error(Y_test,Y_pred)

print("Mean squared error: %.1f" % boston_mse)
print('R2 Score: %.2f' % boston_r2)
print('Mean absolute error: %.2f' % boston_mae)


In [None]:
# Compare actual value and predicted value.
# The comparison is built on a copy: adding columns to Y_test itself would
# change the test target in place, so re-running the metrics cell afterwards
# would score a three-column frame against the predictions.
boston_comparison = Y_test.copy()
# Y_pred is flattened so it is assigned as a plain one-dimensional column.
boston_comparison['Predecido'] = numpy.ravel(Y_pred)
boston_comparison['Diferencia'] = (
    boston_comparison['Predecido'] - boston_comparison['Target']
).abs()
boston_comparison.head()

In [None]:
# df = pd.DataFrame({'Actual': Y_test, 'Predecido': Y_pred})
# df.head(5)

# df2 = pd.DataFrame({'Actual': Y_test, 'Predecido': Y_pred, 'Diferencia':abs(Y_test - Y_pred)})
# df2.head(5)

# ValueError: If using all scalar values, you must pass an index