In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import mean

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn import cluster
from sklearn import feature_selection as fs
from sklearn import preprocessing as prepro
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine

In [None]:
# Connection URL for the e-commerce database. It can be overridden through the
# ECOMMERCE_DATABASE_URL environment variable so real credentials never have to
# be written into the notebook; the fallback targets the local development
# instance that the notebook originally used.
# The dialect has to be spelled "postgresql": SQLAlchemy 1.4+ no longer accepts
# the legacy "postgres://" scheme.
DATABASE_URL = os.environ.get(
    "ECOMMERCE_DATABASE_URL",
    "postgresql://postgres:postgres@localhost/ecommerce",
)
engine = create_engine(DATABASE_URL)

In [None]:

class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-split test scores.

    Parameters
    ----------
    models : dict
        Maps a label to an (unfitted) estimator.
    params : dict
        Maps the same labels to a parameter grid accepted by ``GridSearchCV``.
        Every key of ``models`` must be present.
    scoring : str, sequence of str or dict, optional
        Scorer name(s) forwarded to ``GridSearchCV``. Defaults to
        ``['accuracy']``. A single string is wrapped in a list so that the
        ``cv_results_`` keys always carry the metric name.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``GridSearchCV``.

    Raises
    ------
    ValueError
        If some estimator has no entry in ``params``.
    """

    def __init__(self, models, params, scoring=None, cv=5):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
        # Normalise and copy the scoring spec so instances never share (or
        # mutate) a default list, and a bare string is not iterated char by char.
        if scoring is None:
            scoring = ['accuracy']
        elif isinstance(scoring, str):
            scoring = [scoring]
        elif isinstance(scoring, dict):
            scoring = dict(scoring)
        else:
            scoring = list(scoring)
        self.scoring = scoring
        self.cv = cv

    def fit(self, X, y, n_jobs=3, verbose=1, refit=False, cv=None):
        """Fit a ``GridSearchCV`` for every registered estimator.

        The fitted searches are stored in ``self.grid_searches`` under the
        estimator label.

        Parameters
        ----------
        X, y :
            Training data, passed to ``GridSearchCV.fit`` unchanged.
        n_jobs, verbose, refit :
            Forwarded to ``GridSearchCV``.
        cv : int or cross-validation splitter, optional
            When given, replaces the ``cv`` chosen at construction time.
            (Previously this argument was accepted but silently ignored.)
        """
        if cv is not None:
            self.cv = cv
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            search = GridSearchCV(self.models[key], self.params[key], cv=self.cv,
                                  n_jobs=n_jobs, verbose=verbose,
                                  scoring=self.scoring, refit=refit,
                                  return_train_score=True)
            search.fit(X, y)
            self.grid_searches[key] = search

    def score_summary(self):
        """Return one row per (estimator, parameter candidate) with score statistics.

        For every scorer the min / mean / max / std over the cross-validation
        test splits are reported in ``<stat>_score_<scorer>`` columns. The
        remaining columns hold the candidate's parameters.

        Returns
        -------
        pandas.DataFrame
            Columns are ``estimator``, the score statistics, then the
            parameter columns. Empty (with the fixed columns only) when no
            search has been fitted yet.
        """
        score_names = list(self.scoring)

        score_cols = []
        for name in score_names:
            score_cols += ['min_score_{}'.format(name), 'mean_score_{}'.format(name),
                           'max_score_{}'.format(name), 'std_score_{}'.format(name)]
        columns = ['estimator'] + score_cols

        rows = []
        for label, search in self.grid_searches.items():
            results = search.cv_results_
            # Ask the fitted search how many splits it ran instead of assuming
            # ``self.cv`` is an integer (it may be a splitter object).
            n_splits = search.n_splits_
            # Candidates are addressed by position, so two candidates with an
            # identical parameter dict can no longer overwrite each other.
            for position, candidate in enumerate(results['params']):
                record = dict(candidate)
                record['estimator'] = label
                for name in score_names:
                    split_scores = [
                        results["split{}_test_{}".format(i, name)][position]
                        for i in range(n_splits)
                    ]
                    record['min_score_{}'.format(name)] = min(split_scores)
                    record['max_score_{}'.format(name)] = max(split_scores)
                    record['mean_score_{}'.format(name)] = np.mean(split_scores)
                    record['std_score_{}'.format(name)] = np.std(split_scores)
                rows.append(pd.Series(record))

        if not rows:
            return pd.DataFrame(columns=columns)

        df = pd.DataFrame(rows)
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns]

    @staticmethod
    def search_plot(df, score, x, y, ax=None):
        """Draw a heatmap of ``score`` with parameter ``x`` on columns and ``y`` on rows.

        ``df`` is expected to be (a slice of) the ``score_summary`` output in
        which each (``y``, ``x``) pair occurs once, as required by
        ``DataFrame.pivot``.
        """
        # TODO:  An automatic search given the whole score_summary should be implemented.
        grid = df.pivot(index=y, columns=x, values=score)
        return sns.heatmap(grid, ax=ax)

In [None]:
class PCAw(PCA):
    """``PCA`` extended with two diagnostic plots.

    The constructor is deliberately inherited from ``PCA`` unchanged:
    scikit-learn introspects the ``__init__`` signature for ``get_params`` /
    ``clone`` and rejects estimators whose ``__init__`` takes ``*args``.
    All ``PCA`` constructor arguments are therefore still accepted as before.
    """

    def variance_plot(self, ax=None):
        """Bar-plot the relative and cumulative explained variance ratio.

        Must be called after fitting. A dashed red line marks 0.95.

        Parameters
        ----------
        ax : matplotlib Axes, optional
            Axes to draw on; forwarded to ``DataFrame.plot``.

        Returns
        -------
        The Axes returned by ``DataFrame.plot``.
        """
        ratios = self.explained_variance_ratio_
        pc_variance_df = pd.DataFrame({
            'accumulative_variance': np.cumsum(ratios),
            'relative_variance': ratios
            })

        plot = pc_variance_df.plot(kind='bar', ax=ax)
        plot.axhline(y=0.95, color='r', linestyle='--')
        plot.set_title("PCA Explained variance")
        plot.set_xlabel("Principal Components")
        plot.set_ylabel("Variance")

        return plot

    def plot_contribution(self, index, columns, ax=None):
        """Bar-plot the absolute loading of every variable on one component.

        Parameters
        ----------
        index : int
            Row of ``components_`` (i.e. which principal component) to show.
        columns : sequence
            Variable names, one per input feature, used as bar labels.
        ax : matplotlib Axes, optional
            Axes to draw on.

        Returns
        -------
        The Axes holding the bars, sorted by decreasing contribution, with a
        dashed red line at 0.1.
        """
        # ``components_`` holds the principal axes; one row per component.
        loadings = np.abs(self.components_[index, :])
        contributions = pd.DataFrame({'contribution': loadings}, index=columns)
        ranked = contributions.sort_values(by='contribution', ascending=False)
        ax = ranked.plot(kind='bar', title="Contribution of variables to DIM {}".format(index), ax=ax)
        ax.axhline(y=0.1, color='r', linestyle='--')
        return ax

def plot_silhouette_method(df, k_min=2, k_max=10, ax=None, random_state=None):
    """Plot the silhouette score of KMeans for every k in ``range(k_min, k_max)``.

    Parameters
    ----------
    df : array-like
        Data to cluster; passed to ``KMeans.fit_predict`` and ``silhouette_score``.
    k_min, k_max : int
        Half-open range of cluster counts to try (``k_max`` itself is excluded).
    ax : matplotlib Axes, optional
        Axes to draw on; when omitted the pyplot state machine is used.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so the curve can be made reproducible.

    Returns
    -------
    The list of lines returned by the ``plot`` call.
    """
    ks = range(k_min, k_max)
    silhouette = []
    for k in ks:
        kmeans = cluster.KMeans(n_clusters=k, random_state=random_state)
        labels = kmeans.fit_predict(df)
        silhouette.append(silhouette_score(df, labels))

    # Axes objects and the pyplot module expose the same ``plot`` call.
    canvas = ax if ax is not None else plt
    return canvas.plot(ks, silhouette, marker='o')

def plot_elbow_method(df, k_min=1, k_max=10, ax=None):
    """Plot the KMeans inertia (SSE) for every k in ``range(k_min, k_max)``.

    Draws on ``ax`` when one is given, otherwise through ``plt``; returns
    whatever the ``plot`` call returns.
    """
    candidate_ks = range(k_min, k_max)

    # Fit KMeans for each k in the requested range and keep its SSE.
    sse = [
        cluster.KMeans(n_clusters=n_clusters).fit(df).inertia_
        for n_clusters in candidate_ks
    ]

    if ax is None:
        return plt.plot(candidate_ks, sse, marker='o')
    return ax.plot(candidate_ks, sse, marker='o')

In [None]:

class EcommerceWraper:
    """Feature-engineering helper around the users DataFrame.

    Wraps ``df_usuarios`` and builds the model matrix from it: one-hot encoded
    categorical columns, a 0/1 ``workday_registro`` flag and the numeric
    columns (optionally clipped and min-max scaled). ``ind_cliente`` is used
    as the target.
    """

    # Numeric columns emitted by ``dummy_encode``.
    numeric_cols = ["ipcasos", "total_consumos", "visit_days", "visits_per_day", "fichas_basicas", "perfil_promocional", "same_section", "same_division", "same_group", "same_class"]

    # Numeric columns whose upper tail is clipped at the 95th percentile
    # before scaling.
    # NOTE(review): "visits_per_day" is scaled but was never clipped - confirm
    # that this is intentional.
    _clipped_cols = ("ipcasos", "total_consumos", "visit_days", "fichas_basicas", "perfil_promocional", "same_section", "same_division", "same_group", "same_class")

    # Output column order of ``dummy_encode``: (kind, source column).
    #   "dummy"   -> one-hot encoded via ``_get_dummies``
    #   "flag"    -> boolean turned into 0/1
    #   "numeric" -> raw or clipped + min-max scaled value
    _feature_layout = (
        ("dummy", "tipousuario"),
        ("dummy", "canal_registro"),
        ("dummy", "tipoemail"),
        # Interpreted as an ordinal categorical. TODO: Maybe normalize?
        ("dummy", "bonad_email"),
        ("numeric", "ipcasos"),
        ("dummy", "ip_country"),
        ("dummy", "ip_region"),
        ("dummy", "usu_tipo"),
        ("dummy", "usu_tamanio"),
        ("dummy", "usu_estado"),
        ("dummy", "usu_departamento"),
        ("dummy", "weekday_registro"),
        ("flag", "workday_registro"),
        ("dummy", "phone_zone"),
        ("dummy", "phone_type"),
        ("dummy", "region"),
        ("dummy", "sub-region"),
        ("dummy", "intermediate-region"),
        ("dummy", "section"),
        ("dummy", "division"),
        ("dummy", "group"),
        ("dummy", "class"),
        ("numeric", "total_consumos"),
        ("numeric", "visit_days"),
        ("numeric", "visits_per_day"),
        ("numeric", "fichas_basicas"),
        ("numeric", "perfil_promocional"),
        ("numeric", "same_section"),
        ("numeric", "same_division"),
        ("numeric", "same_group"),
        ("numeric", "same_class"),
    )

    def __init__(self, df_usuarios):
        self.df_usuarios = df_usuarios

    def get_target_variable(self):
        """Return the ``ind_cliente`` column as an array."""
        return self.df_usuarios.ind_cliente.values

    def train_test_split(self, df, test_size=0.3, random_state=None):
        """Split ``df`` and the target into train/test parts, stratified on the target.

        ``df`` must have one row per row of ``df_usuarios``. ``random_state``
        is forwarded to scikit-learn's ``train_test_split`` for reproducible
        splits.

        Returns ``X_train, X_test, y_train, y_test``.
        """
        target = self.get_target_variable()
        return train_test_split(df, target, test_size=test_size, stratify=target,
                                random_state=random_state)

    def _get_dummies(self, col_name):
        """One-hot encode ``col_name`` (values cast to ``str``), prefixed with the column name."""
        return pd.get_dummies(self.df_usuarios[[col_name]].astype(str), prefix=col_name)

    def _get_scaled(self, col_name, df=None):
        """Min-max scale ``col_name`` and return it as a one-column DataFrame.

        ``df`` defaults to ``self.df_usuarios``. The source index is kept so
        the result stays row-aligned when concatenated with the other
        features; a fresh ``RangeIndex`` here produced NaN rows whenever the
        users frame did not have a default index.
        """
        frame = self.df_usuarios if df is None else df
        scaled = prepro.MinMaxScaler().fit_transform(frame[col_name].values.reshape(-1, 1))
        return pd.DataFrame(scaled, columns=[col_name], index=frame.index)

    def _get_flag(self, col_name):
        """Return ``col_name`` as a 0/1 integer Series (1 where the value is True).

        The Series keeps the column's name, so the feature is labelled
        ``col_name`` instead of the anonymous integer ``0``.
        """
        return self.df_usuarios[col_name].eq(True).fillna(False).astype(int)

    def dummy_encode(self, scale_numeric_cols=True):
        """Build the feature matrix.

        Parameters
        ----------
        scale_numeric_cols : bool, default True
            When True, the columns in ``_clipped_cols`` are clipped at their
            95th percentile and every numeric column is min-max scaled.
            When False, numeric columns are passed through untouched.

        Returns
        -------
        pandas.DataFrame
            Features in the order given by ``_feature_layout``, indexed like
            ``df_usuarios``. ``df_usuarios`` itself is not modified.
        """
        source = self.df_usuarios

        numeric_source = None
        if scale_numeric_cols:
            clip_cols = list(self._clipped_cols)
            # Clip on a copy so repeated calls (and other cells reading
            # ``df_usuarios``) always see the raw values.
            numeric_source = source[list(self.numeric_cols)].copy()
            caps = numeric_source[clip_cols].quantile(0.95)
            numeric_source[clip_cols] = numeric_source[clip_cols].clip(upper=caps, axis=1)

        pieces = []
        for kind, col_name in self._feature_layout:
            if kind == "dummy":
                pieces.append(self._get_dummies(col_name).astype(int))
            elif kind == "flag":
                pieces.append(self._get_flag(col_name))
            elif scale_numeric_cols:
                pieces.append(self._get_scaled(col_name, numeric_source))
            else:
                pieces.append(source[col_name])

        return pd.concat(pieces, axis=1)

    def test_importance(self, df, target):
        """Return the chi-squared p-value of every column of ``df`` against ``target``.

        The result has one row per column with fields ``cols`` and ``p_value``.
        """
        p_value = fs.chi2(df, target)[1]
        return pd.DataFrame({"cols": df.columns, "p_value": p_value})


# Get data

In [None]:
# Load every row of usuarios_extra_features that has a total_consumos value.
USUARIOS_QUERY = """
SELECT *
FROM usuarios_extra_features
WHERE total_consumos IS NOT NULL
"""
df_usuarios = pd.read_sql(USUARIOS_QUERY, con=engine)

In [None]:
# Number of user rows that were loaded.
df_usuarios.shape[0]

# Prepare data for model

In [None]:
# Wrap the loaded users frame in the feature-engineering helper.
ew = EcommerceWraper(df_usuarios=df_usuarios)

In [None]:
# Build the encoded feature matrix with numeric columns clipped and scaled.
dummy = ew.dummy_encode(scale_numeric_cols=True)

In [None]:
# Chi-squared test of every encoded feature against the target; keep the
# features whose p-value is below 0.05, most significant first.
# NOTE(review): the selection is computed on the full dataset, before the
# train/test split further down - confirm this is acceptable for the
# evaluation, since the held-out rows take part in choosing the features.
p_value = fs.chi2(dummy, ew.get_target_variable())[1]
feature_p_values = pd.DataFrame({"cols": dummy.columns, "p_value": p_value})
desired = feature_p_values.sort_values(by='p_value').query("p_value < 0.05")
desired

In [None]:
# Restrict the feature matrix to the columns that passed the chi-squared filter.
selected_columns = desired["cols"].values
dummy_selected = dummy[selected_columns]

In [None]:
# Stratified 70/30 split of the selected features and the target.
split_parts = ew.train_test_split(df=dummy_selected, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Regularisation strengths and kernel coefficients shared by the SVC grids.
C_GRID = [0.01, 0.1, 1, 10, 50, 100]
GAMMA_GRID = [0.001, 0.0001, 0.01, 0.1]

models_svm = {'SVC': SVC()}

# One sub-grid per kernel; only the rbf and sigmoid kernels also search gamma.
params_svm = {
    'SVC': [{'kernel': ['linear'], 'C': list(C_GRID)}] + [
        {'kernel': [kernel_name], 'C': list(C_GRID), 'gamma': list(GAMMA_GRID)}
        for kernel_name in ('rbf', 'sigmoid')
    ]
}



In [None]:
# Grid-search the SVC candidates on the training split, tracking three metrics.
svm_scoring = ["accuracy", "f1", "recall"]
helper1 = EstimatorSelectionHelper(models=models_svm, params=params_svm, scoring=svm_scoring)
helper1.fit(X=X_train, y=y_train, n_jobs=10)

In [None]:
# Ten candidates with the highest mean cross-validated accuracy.
scores = helper1.score_summary()
scores_by_accuracy = scores.sort_values(by="mean_score_accuracy", ascending=False)
scores_by_accuracy.head(10)

In [None]:
# Project the chi-squared-selected features onto their principal components.
pcaw = PCAw()
pca_input = dummy[desired["cols"].values]
dummy_pca = pcaw.fit_transform(pca_input)

In [None]:
# Wide figure so every principal component gets a readable bar.
fig = plt.figure(figsize=(40, 5))
ax = fig.add_subplot(1, 1, 1)
pcaw.variance_plot(ax=ax)

In [None]:
# Scatter of the PCA output columns at positions 2 and 3, coloured by the target.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
component_x = dummy_pca[:, 2]
component_y = dummy_pca[:, 3]
sns.scatterplot(x=component_x, y=component_y, hue=ew.get_target_variable(), ax=ax)

In [None]:
# Sanity check: count NaNs in the one-hot encoding of tipousuario.
tipousuario_as_text = df_usuarios[["tipousuario"]].astype(str)
tipousuario_dummies = pd.get_dummies(tipousuario_as_text, prefix="tipousuario")
tipousuario_dummies.isna().sum()

In [None]:
# Sanity check: count NaNs after standard-scaling ipcasos.
ipcasos_column = df_usuarios["ipcasos"].values.reshape(-1, 1)
scaled = prepro.StandardScaler().fit_transform(ipcasos_column)
scaled_ipcasos = pd.DataFrame(scaled, columns=["ipcasos"])
scaled_ipcasos.isna().sum()

Reference (SMOTE oversampling for imbalanced classification): https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/