![SolidQ](https://antoniosql.github.io/images/SolidQ_Verne.png) 
# RFM y Clustering

In [None]:
# Mount Google Drive inside the Colab runtime so that the CSV files under
# '/content/gdrive/My Drive/...' can be read by later cells.
# The call prompts for an authorization code (see the cell output).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:

import numpy as np 
import pandas as pd 
import time, warnings
import datetime as dt

#visualizaciones
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the orders dataset from the mounted Drive folder and preview the first rows
pedidos_df = pd.read_csv('/content/gdrive/My Drive/Datos/orders.csv')
pedidos_df.head()

Unnamed: 0,TransactionID,ProductID,CustomerID,Date,OrderTotal
0,10001,9255,1054,2014-12-31,49
1,10002,9253,1047,2015-03-29,171
2,10003,9260,1081,2014-07-06,48
3,10004,9253,1088,2015-01-22,59
4,10005,9251,1007,2015-01-18,94


In [None]:
# Derive a plain calendar-date column ('date') from the raw 'Date' column
pedidos_df['date'] = pd.to_datetime(pedidos_df['Date']).dt.date

## Recency

In [None]:
# Group by customer and keep the date of each customer's latest purchase
recency_df = (
    pedidos_df
    .groupby('CustomerID', as_index=False)['date']
    .max()
    .rename(columns={'date': 'FechaUltimaCompra'})
)
recency_df.head()

Unnamed: 0,CustomerID,FechaUltimaCompra
0,1002,2015-05-21
1,1003,2015-04-21
2,1004,2015-04-07
3,1005,2015-05-05
4,1006,2015-04-20


In [None]:
 # Reference date for the recency calculation: the most recent
 # 'FechaUltimaCompra' found across all customers.
 fecha = recency_df['FechaUltimaCompra'].max()

In [None]:
# Recency = whole days between each customer's last purchase and the reference date
recency_df['Recency'] = recency_df['FechaUltimaCompra'].map(lambda ultima: (fecha - ultima).days)

In [None]:
# Preview the recency table now that the 'Recency' column has been added
recency_df.head()

Unnamed: 0,CustomerID,FechaUltimaCompra,Recency
0,1002,2015-05-21,7
1,1003,2015-04-21,37
2,1004,2015-04-07,51
3,1005,2015-05-05,23
4,1006,2015-04-20,38


## Frequency

In [None]:
# Keep a single row per (TransactionID, CustomerID) so that a transaction that
# spans several rows is counted only once.
# drop_duplicates() returns a NEW frame here: the previous version aliased
# pedidos_df and used inplace=True, which silently removed rows from
# pedidos_df itself and therefore changed what later cells computed from it.
copia_pedidos = pedidos_df.drop_duplicates(subset=['TransactionID', 'CustomerID'], keep="first")
# Frequency = number of transactions per customer
frequency_df = copia_pedidos.groupby(by=['CustomerID'], as_index=False)['TransactionID'].count()
frequency_df.columns = ['CustomerID','Frequency']
frequency_df.head()

Unnamed: 0,CustomerID,Frequency
0,1002,9
1,1003,11
2,1004,9
3,1005,9
4,1006,10


## Monetary

In [None]:
# Monetary = sum of OrderTotal per customer
monetary_df = (
    pedidos_df
    .groupby('CustomerID', as_index=False)['OrderTotal']
    .sum()
    .rename(columns={'OrderTotal': 'Monetary'})
)
monetary_df.head()

Unnamed: 0,CustomerID,Monetary
0,1002,1165
1,1003,1313
2,1004,1174
3,1005,1184
4,1006,1067


## Creamos tabla RFM

In [None]:
# Join the recency and frequency tables on CustomerID
temp_df = pd.merge(recency_df, frequency_df, on='CustomerID')
temp_df.head()

Unnamed: 0,CustomerID,FechaUltimaCompra,Recency,Frequency
0,1002,2015-05-21,7,9
1,1003,2015-04-21,37,11
2,1004,2015-04-07,51,9
3,1005,2015-05-05,23,9
4,1006,2015-04-20,38,10


In [None]:
# Add the monetary table to obtain all three RFM measures in one frame,
# then use CustomerID as the index.
rfm_df = (
    temp_df
    .merge(monetary_df, on='CustomerID')
    .set_index('CustomerID')
)

rfm_df.head()

Unnamed: 0_level_0,FechaUltimaCompra,Recency,Frequency,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1002,2015-05-21,7,9,1165
1003,2015-04-21,37,11,1313
1004,2015-04-07,51,9,1174
1005,2015-05-05,23,9,1184
1006,2015-04-20,38,10,1067


In [None]:
# Quartile cut-offs (25 %, 50 %, 75 %) for the three RFM measures.
# The numeric columns are selected explicitly: rfm_df also contains
# 'FechaUltimaCompra' (date objects), and DataFrame.quantile no longer drops
# non-numeric columns by default in pandas >= 2.0, so calling it on the whole
# frame raises a TypeError there.
quantiles = rfm_df[['Recency', 'Frequency', 'Monetary']].quantile(q=[0.25,0.5,0.75])
quantiles

Unnamed: 0,Recency,Frequency,Monetary
0.25,6.0,9.0,1085.0
0.5,21.0,11.0,1337.0
0.75,41.0,14.0,1680.0


## Creación segmentos RFM

In [None]:
def RScore(x,p,d):
    """Score a recency value against quartile cut-offs (lower value -> higher score).

    Args:
        x: the value to score.
        p: name of the measure; used as the first key into ``d``.
        d: mapping such that ``d[p][q]`` is the cut-off for q in 0.25, 0.50, 0.75.

    Returns:
        4 if ``x`` is at or below the 0.25 cut-off, 3 if at or below the 0.50
        cut-off, 2 if at or below the 0.75 cut-off, otherwise 1.
    """
    cortes = d[p]
    for puntuacion, cuantil in ((4, 0.25), (3, 0.50), (2, 0.75)):
        if x <= cortes[cuantil]:
            return puntuacion
    # Above every cut-off (or not comparable, e.g. NaN): lowest score
    return 1
def FMScore(x,p,d):
    """Score a frequency/monetary value against quartile cut-offs (higher value -> higher score).

    Args:
        x: the value to score.
        p: name of the measure; used as the first key into ``d``.
        d: mapping such that ``d[p][q]`` is the cut-off for q in 0.25, 0.50, 0.75.

    Returns:
        1 if ``x`` is at or below the 0.25 cut-off, 2 if at or below the 0.50
        cut-off, 3 if at or below the 0.75 cut-off, otherwise 4.
    """
    cortes = d[p]
    for puntuacion, cuantil in ((1, 0.25), (2, 0.50), (3, 0.75)):
        if x <= cortes[cuantil]:
            return puntuacion
    # Above every cut-off (or not comparable, e.g. NaN): highest score
    return 4

In [None]:
#creamos una tabla de segmentación por rfm
rfm_segmentation = rfm_df
rfm_segmentation['R'] = rfm_segmentation['Recency'].apply(RScore, args=('Recency',quantiles,))
rfm_segmentation['F'] = rfm_segmentation['Frequency'].apply(FMScore, args=('Frequency',quantiles,))
rfm_segmentation['M'] = rfm_segmentation['Monetary'].apply(FMScore, args=('Monetary',quantiles,))

In [None]:
# Preview the segmentation table including the new R, F and M score columns
rfm_segmentation.head()

Unnamed: 0_level_0,FechaUltimaCompra,Recency,Frequency,Monetary,R,F,M
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1002,2015-05-21,7,9,1165,3,1,2
1003,2015-04-21,37,11,1313,2,2,2
1004,2015-04-07,51,9,1174,1,1,2
1005,2015-05-05,23,9,1184,2,1,2
1006,2015-04-20,38,10,1067,2,2,1


## Agrupando los clientes

In [None]:
# Columns to drop before clustering, leaving only the R, F and M scores
columnas_borrar = ['FechaUltimaCompra','Recency','Frequency','Monetary']

In [None]:
# Keep only the score columns as the clustering input
grupos = rfm_segmentation.drop(columns=columnas_borrar)
grupos.head()

Unnamed: 0_level_0,R,F,M
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1002,3,1,2
1003,2,2,2
1004,1,1,2
1005,2,1,2
1006,2,2,1


In [None]:
from sklearn.cluster import KMeans
import sklearn.metrics as sm

# Two-cluster K-Means on the RFM scores.
# - random_state is fixed so that cluster assignments (and everything trained
#   on them afterwards) are reproducible across runs.
# - n_init is pinned to 10, the value shown in this notebook's recorded output,
#   so the result does not depend on the installed scikit-learn default.
# - Only the R/F/M columns are passed to fit(): a later cell adds a 'Cluster'
#   column to grupos, and re-running this cell must not feed that label back
#   in as a feature.
model = KMeans(n_clusters=2, n_init=10, random_state=0)
model.fit(grupos[['R', 'F', 'M']])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [None]:
# Attach the cluster label that the fitted model assigned to each row of grupos
grupos['Cluster'] = model.labels_

## Predicción de comportamiento

In [None]:
#cargar el dataset de clientes
clientes_df = pd.read_csv('/content/gdrive/My Drive/Datos/customers.csv')
clientes_df.head()

Unnamed: 0,CustomerID,Age,Sex,State,Education,MaritalStatus
0,1001,34,Male,Indiana,Bachelors Degree,Married
1,1002,44,Female,Pennsylvania,Masters Degree,Single
2,1003,45,Female,Florida,Bachelors Degree,Married
3,1004,44,Female,Georgia,Masters Degree,Divorced
4,1005,46,Female,Georgia,High School,Single


In [None]:
# Join the customer attributes with the RFM scores and cluster label on CustomerID

X = pd.merge(clientes_df, grupos, on='CustomerID')
X.head()

Unnamed: 0,CustomerID,Age,Sex,State,Education,MaritalStatus,R,F,M,Cluster
0,1002,44,Female,Pennsylvania,Masters Degree,Single,3,1,2,1
1,1003,45,Female,Florida,Bachelors Degree,Married,2,2,2,1
2,1004,44,Female,Georgia,Masters Degree,Divorced,1,1,2,1
3,1005,46,Female,Georgia,High School,Single,2,1,2,1
4,1006,46,Male,Virginia,Bachelors Degree,Married,2,2,1,1


In [None]:
# Target column (the cluster label) and the customer attributes used as predictors
etiqueta = 'Cluster'
caracteristicas = ['Age','Sex','State','Education','MaritalStatus']

# Order matters: the target is read from X before X is narrowed to the
# predictor columns. After this cell X no longer contains 'Cluster', so
# re-running the cell without re-running the merge cell raises a KeyError.
y= X[etiqueta]
X=X[caracteristicas]
from sklearn.model_selection import train_test_split
# Hold-out split with a fixed seed (train_test_split's default proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


Hay que preparar los datos: codificar las variables categóricas y escalar las numéricas.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
# Define a Pipeline that applies the cleaning and transformation steps to several columns

# Numeric columns
numeric_features = ['Age']
# Transformations applied to them --> impute missing values with the median, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns and their transformations --> impute with the constant 'missing', then one-hot encode

categorical_features = ['Sex', 'State', 'Education','MaritalStatus']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# ColumnTransformer associates each sub-pipeline with its list of columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append a Logistic Regression classifier
# This gives the complete Pipeline: preprocessing followed by the classifier

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])




# Fit preprocessing and classifier together on the training split
clf.fit(X_train, y_train)

# NOTE: the original dataset is not modified here; the pipeline applies the transformations on the fly

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [None]:

# Report mean accuracy of the fitted pipeline on the training and the test split
print(f'Accuracy of Logistic regression classifier on training set: {clf.score(X_train, y_train):.2f}')
print(f'Accuracy of Logistic regression classifier on test set: {clf.score(X_test, y_test):.2f}')

Accuracy of Logistic regression classifier on training set: 0.80
Accuracy of Logistic regression classifier on test set: 0.35


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Predict the cluster label for the held-out customers
prediccion = clf.predict(X_test)
# Confusion matrix: true labels (y_test) against predicted labels
print(confusion_matrix(y_test, prediccion))

# Per-class precision, recall, f1-score and support
print(classification_report(y_test, prediccion))

[[3 8]
 [7 5]]
              precision    recall  f1-score   support

           0       0.30      0.27      0.29        11
           1       0.38      0.42      0.40        12

    accuracy                           0.35        23
   macro avg       0.34      0.34      0.34        23
weighted avg       0.34      0.35      0.35        23

