![SolidQ](https://antoniosql.github.io/images/SolidQ_Verne.png) 

# Selección de Características

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt

In [2]:
conn = sqlite3.connect('Tiendas24H.sqlite')
#Consideramos que un cliente nos ha abandonado si no nos compra desde hace más de 200 días. 
consulta = "select c.CodCliente, max(v.fecha) as FechaUltimaCompra, EstadoCivil, Sexo,IngresosAnuales, N.NivelAcademico, O.Ocupacion,  round((julianday('now') - julianday(FechaNacimiento))/365) as Edad,round((julianday('now') - julianday(max(v.fecha)))) as DiasUltimaCompra, case when round((julianday('now') - julianday(max(v.fecha))))>200 then 1 else 0 end as 'Abandono' from clientes c inner join NivelAcademico N  on c.NivelAcademico = N.Codigo  inner join Ocupacion O on c.Ocupacion = o.Codigo inner join VentasCAB v on v.codcliente = C.codcliente where c.Codcliente <>'CONTADO' group by c.CodCliente,  EstadoCivil, Sexo, IngresosAnuales, N.NivelAcademico, O.Ocupacion"

df = pd.read_sql_query(consulta,conn)

In [3]:
df.dropna(axis=0,inplace=True)

In [4]:
#Personalizado
etiquetas = ["Adolescentes", "Millenials" ,"EGB","Maduros"]
custombins = [16,29,39,54,88]
df["RangoEdad"] = pd.cut(df["Edad"],bins=custombins,labels=etiquetas)

In [5]:
caracteristicas=["EstadoCivil" , "Sexo","IngresosAnuales","NivelAcademico","Ocupacion","DiasUltimaCompra","RangoEdad"]
etiqueta = ["Abandono"]

In [6]:
from sklearn.preprocessing import  LabelEncoder

le=LabelEncoder()
df['EstadoCivil'] = le.fit_transform(df['EstadoCivil'])
df['Sexo'] = le.fit_transform(df['Sexo'])

In [7]:
dum_df = pd.get_dummies(df[caracteristicas], columns=["NivelAcademico","Ocupacion","RangoEdad"] )


In [8]:
X, y = dum_df, df[etiqueta]

In [9]:
from imblearn.over_sampling import SMOTE 
from collections import Counter

print('Distribución Original %s' % Counter(y))
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Distribución balanceada %s' % Counter(y_res))

Distribución Original Counter({'Abandono': 1})
Distribución balanceada Counter({'Abandono': 1})


In [10]:
#Selección Univariate
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy

# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X_res, y_res)
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X_res)
# summarize selected features
print(features[0:5,:])


[2.574e+01 8.710e+01 4.441e+03 2.289e+05 6.234e+01 4.720e+01 1.502e+02
 4.211e+01 9.408e+01 4.676e+00 1.744e+00 6.834e+01 4.771e+01 4.934e+01
       nan 1.423e+01 1.912e+02 1.034e+02]
[[9.00e+04 2.31e+02 0.00e+00 1.00e+00]
 [6.00e+04 1.55e+02 0.00e+00 1.00e+00]
 [6.00e+04 2.96e+02 0.00e+00 1.00e+00]
 [8.00e+04 1.60e+02 0.00e+00 1.00e+00]
 [7.00e+04 3.40e+02 0.00e+00 1.00e+00]]


In [15]:
features

array([[9.000e+04, 2.310e+02, 0.000e+00, 1.000e+00],
       [6.000e+04, 1.550e+02, 0.000e+00, 1.000e+00],
       [6.000e+04, 2.960e+02, 0.000e+00, 1.000e+00],
       ...,
       [6.000e+04, 1.663e+02, 0.000e+00, 0.000e+00],
       [4.000e+04, 1.476e+02, 0.000e+00, 0.000e+00],
       [7.000e+04, 1.780e+02, 0.000e+00, 0.000e+00]])

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, y_res, random_state=1 , test_size=0.3)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 1.00


In [18]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE

rfe = RFE(clf, 3)
rfe = rfe.fit(X_train, y_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False  True  True  True]
[2 1 1 1]
