![SolidQ](https://antoniosql.github.io/images/SolidQ_Verne.png) 

# Carga de datos y exploración

In [33]:
import pandas as pd
import numpy as np
import sqlite3

In [34]:
conn = sqlite3.connect('Tiendas24H.sqlite')
consulta="select c.CodCliente,  EstadoCivil, Sexo, IngresosAnuales, N.NivelAcademico, O.Ocupacion, count(Ticket) as NumeroVentas from clientes c inner join NivelAcademico N    on c.NivelAcademico = N.Codigo    inner join Ocupacion O on c.Ocupacion = o.Codigo     left outer join VentasCAB v          on v.codcliente = C.codcliente  where c.Codcliente <>'CONTADO' group by c.CodCliente,  EstadoCivil, Sexo, IngresosAnuales, N.NivelAcademico, O.Ocupacion"

df = pd.read_sql_query(consulta,conn)

df.head()

Unnamed: 0,CodCliente,EstadoCivil,Sexo,IngresosAnuales,NivelAcademico,Ocupacion,NumeroVentas
0,20,M,M,90000.0,Licenciatura,Profesional especializado,1
1,26,S,M,60000.0,Licenciatura,Profesional especializado,22
2,29,M,M,60000.0,Licenciatura,Profesional especializado,1
3,46,S,F,70000.0,Licenciatura,Profesional especializado,0
4,47,S,F,80000.0,Licenciatura,Profesional especializado,18


In [35]:
caracteristicas = ["EstadoCivil", "Sexo", "IngresosAnuales","NivelAcademico", "Ocupacion"]
etiqueta="NumeroVentas"

In [36]:
from sklearn.preprocessing import  OneHotEncoder,LabelEncoder

le=LabelEncoder()
df['EstadoCivil'] = le.fit_transform(df['EstadoCivil'])
df['Sexo'] = le.fit_transform(df['Sexo'])

In [37]:
dum_df = pd.get_dummies(df[caracteristicas], columns=["NivelAcademico","Ocupacion"] )

# Generar datos de entrenamiento y test

In [46]:
from sklearn.model_selection import train_test_split
X, y = dum_df.values, df[etiqueta].values

In [47]:
X_train, X_test, y_train, y_test =\
  train_test_split(X, y,
  test_size=0.3)

## Escalado

In [None]:

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Regresión Lineal

In [48]:
from sklearn import linear_model
regressor = linear_model.LinearRegression()
lr = regressor.fit(X_train, y_train)

In [49]:
#Realizamos la predicción con test
print(lr.predict(X_test))

[ 7.42369237e+00  7.93280818e+00  9.55986239e+00  4.30723780e+00
  2.66399761e+00  1.20164964e+00  5.56583508e+00  6.87767299e+00
  4.87974340e-01  1.91084868e+01 -7.38034438e+00  1.51511855e+01
  1.64131986e+01  1.70870207e+01  1.91941177e+01  5.10663014e+00
  6.41442097e+00  2.26324935e+00  3.76457359e+00  1.45729301e+01
 -1.91480493e+00  1.45729301e+01  4.01164170e+00 -6.15755327e+00
  5.20425342e-01  4.47824889e+00  1.92792299e+01 -6.87122857e+00
 -1.10524495e+01  8.60663022e+00  4.47824889e+00  1.20164964e+00
  8.72133411e+00  8.34337614e+00  1.50655546e+01  4.01164170e+00
  4.30723780e+00  1.43917325e+01  3.13210489e-01  9.99412756e+00
  1.10754619e+01  1.43917325e+01 -1.85847704e-01 -2.58862698e+00
  3.38645664e+00  1.18343961e+01  9.32030552e+00  1.43917325e+01
  6.58516409e+00  3.17999215e-01  3.33781966e+00  4.01164170e+00
  7.67384459e-01  1.45729301e+01  1.58942730e+00  6.23965712e+00
 -5.48373123e+00  1.07428002e+01  9.99412756e+00  6.32191001e+00
  1.22449641e+01  3.50856

In [50]:
print(" lr.coef_:", lr.coef_) 
print(" lr.intercept_:", lr.intercept_)
#los guiones bajos representan siempre en sklearn valores derivados de datos de entrenamiento

lr.coef_: [ 8.11865588e+00  5.25262459e+00 -6.73822045e-05 -1.46963907e+00
 -1.93422258e-01 -3.31296331e+00  9.29785032e-01  4.04623960e+00
 -6.41893082e+00 -4.15050557e+00  1.24217863e+01  4.41395758e-01
 -2.29374567e+00]
 lr.intercept_: 1.430768588164061


In [51]:
#Mean Squared Error (MSE)
from sklearn.metrics import mean_squared_error
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_lr),mean_squared_error(y_test, y_test_pred_lr)))

# Si MSE es mayor en test que en train, clara muestra de overfitting

MSE train: 9432.307, test: 67.445


In [52]:
print('R-squared train score: {:.3f}'
     .format(lr.score(X_train, y_train)))
print('R-squared test score: {:.3f}'
     .format(lr.score(X_test, y_test)))
#R-squared conocido también como Coeficiente de determinación. El coeficiente determina la calidad del modelo para replicar los resultados, 
#y la proporción de variación de los resultados que puede explicarse por el modelo
#Es el porcentaje de variacion de la variable dependiente que explica el modelo lineal



R-squared train score: 0.005
R-squared test score: -4.330


In [61]:
import matplotlib.pyplot as plt

# Creamos el fit lineal utilizando el conjunto de entrenamiento
plt.scatter(X_train, y_train, color = 'red')
plt.plot(X_train, lr.predict(X_train), color = 'blue')

plt.show()

ValueError: x and y must be the same size

In [54]:
# Aplicamos el fit lineal al conjunto de prueba
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_test, regressor.predict(X_test), color = 'blue')
plt.title('Total Rooms vs Median House Value (Test set)')
plt.xlabel('total_rooms')
plt.ylabel('median_house_value')
plt.show()

ValueError: x and y must be the same size

# k-NN Regresor

In [55]:
from sklearn.neighbors import KNeighborsRegressor

#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state = 0)

knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)

In [56]:
#realizamos la predicción

print(knnreg.predict(X_test))

[6.400e+00 1.000e+00 1.400e+00 1.000e+00 1.800e+00 8.000e-01 2.200e+00
 5.600e+00 1.200e+00 1.800e+00 2.400e+00 1.000e+00 2.200e+00 8.800e+00
 1.800e+00 1.000e+00 1.000e+00 1.400e+00 1.000e+00 7.000e+00 1.800e+00
 7.000e+00 3.400e+00 1.000e+00 4.400e+00 1.200e+00 1.000e+00 1.000e+00
 3.200e+00 1.200e+00 1.200e+00 8.000e-01 1.800e+00 8.000e-01 2.400e+00
 3.400e+00 1.000e+00 1.800e+00 1.200e+00 4.800e+00 1.200e+00 1.800e+00
 1.000e+00 9.000e+00 4.400e+00 2.400e+00 1.400e+00 1.800e+00 1.000e+00
 1.000e+00 2.400e+00 3.400e+00 1.200e+00 7.000e+00 1.600e+00 6.000e+00
 1.000e+00 8.000e-01 4.800e+00 1.000e+00 2.800e+00 2.600e+00 1.000e+00
 1.800e+00 8.000e-01 1.000e+00 1.400e+00 2.600e+00 1.400e+00 2.000e+00
 7.800e+00 1.200e+00 4.400e+00 1.200e+00 1.000e+00 1.000e+00 3.800e+00
 1.200e+00 1.000e+00 1.200e+00 4.400e+00 6.400e+00 1.000e+00 2.200e+00
 2.600e+00 4.800e+00 1.000e+00 1.600e+00 1.000e+00 1.200e+00 1.800e+00
 3.200e+00 3.400e+00 1.000e+00 2.800e+00 5.400e+00 4.400e+00 1.600e+00
 1.600

In [57]:
#Mean Squared Error (MSE)

y_train_pred_knn = knnreg.predict(X_train)
y_test_pred_knn = knnreg.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_knn),mean_squared_error(y_test, y_test_pred_knn)))

MSE train: 6461.823, test: 2673.457


In [58]:
print('R-squared test score: {:.3f}'
     .format(knnreg.score(X_test, y_test)))

R-squared test score: -210.286


In [59]:
fig, subaxes = plt.subplots(1, 2, figsize=(8,4))
X_predict_input = np.linspace(X.min(), X.max(), 50).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X[0::5], y[0::5], random_state = 0)

for thisaxis, K in zip(subaxes, [1, 3]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    #thisaxis.set_xlim([-2.5, 0.75])
    thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10,
                 label='Predicted', alpha=0.8)
    thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.4)
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN regression (K={})'.format(K))
    thisaxis.legend()
plt.tight_layout()

ValueError: query data dimension must match training data dimension

In [0]:
# plot k-NN regression para diferentes valores de k

fig, subaxes = plt.subplots(5, 1, figsize=(5,20))
X_predict_input = np.linspace(X.min(), X.max(), 500).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state = 0)

for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)
    
    thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    
    thisaxis.plot(X_predict_input, y_predict_output)
    
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN Regression (K={})\n\
Train $R^2 = {:.3f}$,  Test $R^2 = {:.3f}$'
                      .format(K, train_score, test_score))
    thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)