![Verne](https://www.vernegroup.com/wp-content/uploads/2020/07/LOGO-VERNE-TECHNOLOGY-GROUP-3.png)

# Carga de datos y exploración

In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
conn = sqlite3.connect('Tiendas24H.sqlite')
consulta="select c.CodCliente,  EstadoCivil, Sexo, IngresosAnuales, N.NivelAcademico, O.Ocupacion, count(Ticket) as NumeroVentas from clientes c inner join NivelAcademico N    on c.NivelAcademico = N.Codigo    inner join Ocupacion O on c.Ocupacion = o.Codigo     left outer join VentasCAB v          on v.codcliente = C.codcliente  where c.Codcliente <>'CONTADO' group by c.CodCliente,  EstadoCivil, Sexo, IngresosAnuales, N.NivelAcademico, O.Ocupacion"

df = pd.read_sql_query(consulta,conn)

df.head()

Unnamed: 0,CodCliente,EstadoCivil,Sexo,IngresosAnuales,NivelAcademico,Ocupacion,NumeroVentas
0,20,M,M,90000.0,Licenciatura,Profesional especializado,1
1,26,S,M,60000.0,Licenciatura,Profesional especializado,22
2,29,M,M,60000.0,Licenciatura,Profesional especializado,1
3,46,S,F,70000.0,Licenciatura,Profesional especializado,0
4,47,S,F,80000.0,Licenciatura,Profesional especializado,18


In [3]:
caracteristicas = ["EstadoCivil", "Sexo", "IngresosAnuales","NivelAcademico", "Ocupacion"]
etiqueta="NumeroVentas"

In [4]:
from sklearn.preprocessing import  OneHotEncoder,LabelEncoder

le=LabelEncoder()
df['EstadoCivil'] = le.fit_transform(df['EstadoCivil'])
df['Sexo'] = le.fit_transform(df['Sexo'])

In [5]:
dum_df = pd.get_dummies(df[caracteristicas], columns=["NivelAcademico","Ocupacion"] )

# Generar datos de entrenamiento y test

In [6]:
from sklearn.model_selection import train_test_split
X, y = dum_df.values, df[etiqueta].values

In [7]:
X_train, X_test, y_train, y_test =\
  train_test_split(X, y,
  test_size=0.3)

## Escalado

In [8]:

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Regresión Lineal

In [9]:
from sklearn import linear_model
regressor = linear_model.LinearRegression()
lr = regressor.fit(X_train, y_train)

In [10]:
#Realizamos la predicción con test
print(lr.predict(X_test))

[2.15625 3.40625 1.90625 3.46875 1.78125 2.28125 3.375   1.5     1.8125
 1.59375 2.96875 2.15625 1.78125 2.78125 2.28125 3.59375 3.25    2.78125
 2.53125 3.28125 2.03125 2.03125 4.34375 1.78125 1.65625 2.78125 2.78125
 2.25    2.9375  3.96875 1.90625 3.90625 3.59375 3.1875  1.375   1.90625
 2.625   2.34375 2.03125 3.46875 1.28125 1.96875 1.875   3.5     2.90625
 2.15625 2.875   2.65625 3.78125 2.6875  4.      2.53125 1.84375 2.40625
 3.90625 1.3125  3.6875  3.03125 2.34375 3.71875 2.75    4.      2.15625
 2.25    3.34375 2.15625 2.      2.25    2.      1.78125 1.40625 2.9375
 4.125   3.84375 2.15625 4.125   4.21875 2.53125 3.28125 1.625   2.53125
 4.21875 1.40625 2.78125 2.59375 1.96875 4.21875 2.90625 2.28125 3.34375
 1.8125  1.90625 2.96875 2.875   2.59375 2.28125 2.1875  1.34375 3.15625
 3.59375 2.75    2.53125 2.6875  2.1875  3.25    2.6875  4.03125 1.28125
 3.53125 2.28125 2.125   4.65625 3.5     3.6875  2.40625 2.875   3.46875
 2.46875 1.09375 2.875   3.8125  4.21875 2.03125 2.75

In [11]:
print(" lr.coef_:", lr.coef_) 
print(" lr.intercept_:", lr.intercept_)
#los guiones bajos representan siempre en sklearn valores derivados de datos de entrenamiento

lr.coef_: [ 5.15200566e-01  9.56714465e-02  1.75218802e+00 -1.23879653e+14
 -1.23879653e+14 -1.23879653e+14 -1.23879653e+14 -1.23879653e+14
 -1.31875283e+14 -1.31875283e+14 -1.31875283e+14 -1.31875283e+14
 -1.31875283e+14]
 lr.intercept_: 255754935879051.47


In [12]:
#Mean Squared Error (MSE)
from sklearn.metrics import mean_squared_error
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_lr),mean_squared_error(y_test, y_test_pred_lr)))

# Si MSE es mayor en test que en train, clara muestra de overfitting

MSE train: 55.771, test: 22017.284


In [13]:
print('R-squared train score: {:.3f}'
     .format(lr.score(X_train, y_train)))
print('R-squared test score: {:.3f}'
     .format(lr.score(X_test, y_test)))
#R-squared conocido también como Coeficiente de determinación. El coeficiente determina la calidad del modelo para replicar los resultados, 
#y la proporción de variación de los resultados que puede explicarse por el modelo
#Es el porcentaje de variacion de la variable dependiente que explica el modelo lineal



R-squared train score: 0.008
R-squared test score: -0.001


# k-NN Regresor

In [14]:
from sklearn.neighbors import KNeighborsRegressor

#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state = 0)

knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)

In [15]:
#realizamos la predicción

print(knnreg.predict(X_test))

[ 1.   2.6  1.6  4.   1.   2.8  1.8  3.   2.   1.   2.4  1.   1.8  1.2
  1.   1.   2.6  1.2  2.   1.6  2.4  1.4  1.6  6.6  1.8  1.8  1.4  1.4
  1.2  3.4  1.2  1.4  1.2  1.6  1.6  1.4  2.4  1.6  1.   1.2  4.   1.2
  2.   1.   4.8  1.   3.6  1.4  1.2  1.6  2.2  2.4  1.2  1.   1.8  1.2
  3.6  3.6  4.8  1.2  5.   2.2  1.4  1.2  2.2  1.  13.   1.2 13.   1.
  2.   1.2  1.8  1.6  1.4  1.8  1.2  2.   1.6  1.   2.4  1.2  4.   1.2
  1.4  1.2  1.2  4.6  1.6  8.2  2.   1.8 10.   1.2  1.4  1.   1.2  7.
  1.8  1.   5.   2.4  1.6  1.6  2.6  1.   1.   1.4  2.   1.   1.   3.4
  1.   2.8  1.   1.2  1.2  4.2  2.4  3.6  0.8  1.2  4.4  1.6  1.   3.4
  3.4  4.8  2.2  3.6  1.8  1.   1.6  2.   1.6  1.   3.8  1.   8.2  2.
  6.6  1.4  1.   3.4  1.   2.6  1.2  1.6  3.4  2.2  1.6  1.6  1.8  1.6
  1.   1.4  1.8  1.   1.   1.6  1.4  1.6  1.2  5.6  1.6  3.2  1.2  2.6
  1.2  2.6  2.6  1.6  3.4  1.6  1.4  3.6  9.4  1.4  1.8  0.8  1.6  1.6
  2.4  2.6  1.6  0.8  3.2  1.2  1.6  1.   9.4  5.4  2.4  2.6  1.4  3.
  1.2  1.2

In [16]:
#Mean Squared Error (MSE)

y_train_pred_knn = knnreg.predict(X_train)
y_test_pred_knn = knnreg.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred_knn),mean_squared_error(y_test, y_test_pred_knn)))

MSE train: 53.216, test: 22040.947


In [17]:
print('R-squared test score: {:.3f}'
     .format(knnreg.score(X_test, y_test)))

R-squared test score: -0.002
