In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import seaborn as sns
import missingno as msno
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.impute import KNNImputer
import statsmodels.formula.api
import statsmodels.api
import six
import dataframe_image as dfi
import scipy.stats as stats

plt.style.use('seaborn-whitegrid')
#options d'affichage
pd.set_option('display.max_row', 200)
pd.set_option('display.max_column', 70)

In [10]:
#quelques fonctions du cours pour présenter proprement les graphs et projections.


def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    for d1, d2 in axis_ranks: # On affiche les 3 premiers plans factoriels, donc les 6 premières composantes
        if d2 < n_comp:

            # initialisation de la figure
            fig, ax = plt.subplots(figsize=(7,6))

            # détermination des limites du graphique
            if lims is not None :
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30 :
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else :
                xmin, xmax, ymin, ymax = min(pcs[d1,:]), max(pcs[d1,:]), min(pcs[d2,:]), max(pcs[d2,:])

            # affichage des flèches
            # s'il y a plus de 30 flèches, on n'affiche pas le triangle à leur extrémité
            if pcs.shape[1] < 30 :
                plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                   pcs[d1,:], pcs[d2,:], 
                   angles='xy', scale_units='xy', scale=1, color="grey")
                # (voir la doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
            else:
                lines = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
                ax.add_collection(LineCollection(lines, axes=ax, alpha=.1, color='black'))
            
            # affichage des noms des variables  
            if labels is not None:  
                for i,(x, y) in enumerate(pcs[[d1,d2]].T):
                    if x >= xmin and x <= xmax and y >= ymin and y <= ymax :
                        plt.text(x, y, labels[i], fontsize='14', ha='center', va='center', rotation=label_rotation, color="blue", alpha=0.5)
            
            # affichage du cercle
            circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
            plt.gca().add_artist(circle)

            # définition des limites du graphique
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)
        
            # affichage des lignes horizontales et verticales
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # nom des axes, avec le pourcentage d'inertie expliqué
            plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

            plt.title("Cercle des corrélations (F{} et F{})".format(d1+1, d2+1))
            plt.show(block=False)
        
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    for d1,d2 in axis_ranks:
        if d2 < n_comp:
 
            # initialisation de la figure       
            fig = plt.figure(figsize=(7,6))
        
            # affichage des points
            if illustrative_var is None:
                plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
            else:
                illustrative_var = np.array(illustrative_var)
                for value in np.unique(illustrative_var):
                    selected = np.where(illustrative_var == value)
                    plt.scatter(X_projected[selected, d1], X_projected[selected, d2], alpha=alpha, label=value)
                plt.legend()

            # affichage des labels des points
            if labels is not None:
                for i,(x,y) in enumerate(X_projected[:,[d1,d2]]):
                    plt.text(x, y, labels[i],
                              fontsize='14', ha='center',va='center') 
                
            # détermination des limites du graphique
            boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 1.1
            plt.xlim([-boundary,boundary])
            plt.ylim([-boundary,boundary])
        
            # affichage des lignes horizontales et verticales
            plt.plot([-100, 100], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-100, 100], color='grey', ls='--')

            # nom des axes, avec le pourcentage d'inertie expliqué
            plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

            plt.title("Projection des individus (sur F{} et F{})".format(d1+1, d2+1))
            plt.show(block=False)

def display_scree_plot(pca):
    scree = pca.explained_variance_ratio_*100
    plt.bar(np.arange(len(scree))+1, scree)
    plt.plot(np.arange(len(scree))+1, scree.cumsum(),c="red",marker='o')
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show(block=False)

In [11]:
data = pd.read_pickle("cleaned_data")

In [12]:
data.head()

Unnamed: 0,order_status,price,freight_value,payment_sequential,payment_type,payment_installments,payment_value,review_score,customer_unique_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_city,seller_state,geolocation_lat,geolocation_lng,time_delivery_customer,company_processing_time,estimation_error_time,purchase_approval_time,order_purchase_year,order_purchase_month,order_purchase_dayofweek,order_purchase_hour,order_purchase_day,order_purchase_mon,month_year
0,delivered,29.99,8.72,1.0,credit_card,1.0,18.12,4.0,7c396fd4830fd04220f754e42b4e5bff,housewares,40.0,268.0,4.0,500.0,19.0,8.0,13.0,maua,SP,-23.577482,-46.587077,202.477778,56.795833,170.579722,0.178333,2017,10,0,10,Mon,Oct,201710
1,delivered,29.99,8.72,3.0,voucher,1.0,2.0,4.0,7c396fd4830fd04220f754e42b4e5bff,housewares,40.0,268.0,4.0,500.0,19.0,8.0,13.0,maua,SP,-23.577482,-46.587077,202.477778,56.795833,170.579722,0.178333,2017,10,0,10,Mon,Oct,201710
2,delivered,29.99,8.72,2.0,voucher,1.0,18.59,4.0,7c396fd4830fd04220f754e42b4e5bff,housewares,40.0,268.0,4.0,500.0,19.0,8.0,13.0,maua,SP,-23.577482,-46.587077,202.477778,56.795833,170.579722,0.178333,2017,10,0,10,Mon,Oct,201710
3,delivered,118.7,22.76,1.0,boleto,1.0,141.46,4.0,af07308b275d755c9edb36a90c618231,perfumery,29.0,178.0,1.0,400.0,19.0,13.0,19.0,belo horizonte,SP,-12.186877,-44.540232,330.768889,11.109167,128.5375,30.713889,2018,7,1,20,Tue,Jul,201807
4,delivered,159.9,19.22,1.0,credit_card,3.0,179.12,5.0,3a653a41f6f9fc3d2a113cf8398680e8,auto,46.0,232.0,1.0,420.0,24.0,19.0,21.0,guariba,SP,-16.74515,-48.514783,225.461111,4.910278,413.891944,0.276111,2018,8,2,8,Wed,Aug,201808


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114540 entries, 0 to 119142
Data columns (total 32 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   order_status                114540 non-null  object 
 1   price                       114540 non-null  float64
 2   freight_value               114540 non-null  float64
 3   payment_sequential          114540 non-null  float64
 4   payment_type                114540 non-null  object 
 5   payment_installments        114540 non-null  float64
 6   payment_value               114540 non-null  float64
 7   review_score                114540 non-null  float64
 8   customer_unique_id          114540 non-null  object 
 9   product_category_name       114540 non-null  object 
 10  product_name_lenght         114540 non-null  float64
 11  product_description_lenght  114540 non-null  float64
 12  product_photos_qty          114540 non-null  float64
 13  product_weight

In [14]:
num_data = data.select_dtypes(include=['int64','float64'])
cat_data = data.select_dtypes(exclude=['int64','float64'])

In [15]:
num_data.columns

Index(['price', 'freight_value', 'payment_sequential', 'payment_installments',
       'payment_value', 'review_score', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'geolocation_lat', 'geolocation_lng', 'time_delivery_customer',
       'company_processing_time', 'estimation_error_time',
       'purchase_approval_time', 'order_purchase_year', 'order_purchase_month',
       'order_purchase_dayofweek', 'order_purchase_hour', 'month_year'],
      dtype='object')

In [24]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer

target_features = ['order_status', 'payment_type', 'customer_unique_id',
       'product_category_name', 'seller_city', 'seller_state']
target_transformer = TargetEncoder()

numeric_features = ['price', 'freight_value', 'payment_sequential', 'payment_installments',
       'payment_value', 'review_score', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'geolocation_lat', 'geolocation_lng', 'time_delivery_customer',
       'company_processing_time', 'estimation_error_time',
       'purchase_approval_time', 'order_purchase_year', 'order_purchase_month',
       'order_purchase_dayofweek', 'order_purchase_hour', 'month_year']
numeric_transformer = RobustScaler(unit_variance=True)

preprocessor = ColumnTransformer(transformers=[
    ('target', target_transformer, target_features),
    ('numeric', numeric_transformer, numeric_features)
])

In [27]:
import yellowbrick
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn import cluster

In [28]:
from sklearn.pipeline import Pipeline
n_comp = 20
n_clust = 6
pca_pip = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', decomposition.PCA(n_components=n_comp)),
    ('clust', cluster.KMeans(n_clusters=n_clust))])

In [29]:
pca_pip.fit(data)

TypeError: fit_transform() missing argument: y

## ACP

In [31]:
acp_data = preprocessor.fit(data)
X_scaled = acp_data.values

TypeError: fit_transform() missing argument: y

In [None]:
# choix du nombre de composantes à calculer: on essaye de se ramener à deux plans maximum.
n_comp = 16

# préparation des données pour l'ACP
names = acp_data.index 
features = acp_data.columns


# Calcul des composantes principales
pca = decomposition.PCA(n_components=n_comp)
pca.fit(X_scaled)

# Eboulis des valeurs propres
display_scree_plot(pca)

# Cercle des corrélations
pcs = pca.components_
display_circles(pcs, n_comp, pca, [(0,1),(2,3),(4,5),(6,7),(8,9),(10,11),(12,13),(14,15)], labels = np.array(features))

plt.show()

In [None]:
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())

In [None]:
# Projection des individus
X_projected = pca.transform(X)
display_factorial_planes(X_projected[np.random.choice(X_projected.shape[0], 200, replace=False), :],
                         12, pca, [(0,1),(2,3),(4,5),(6,7),(8,9),(10,11)], labels = np.array(names))

## K_MEANS

In [None]:
import yellowbrick
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer


In [None]:
from sklearn import cluster
myclust = cluster.KMeans(n_clusters=10)
myclust.fit(X_scaled)

In [None]:
model = cluster.KMeans()
visualizer = KElbowVisualizer(model, k=(1,30))

visualizer.fit(X_scaled)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data

In [None]:
# Instantiate the clustering model and visualizer 
model = cluster.KMeans(10)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_scaled)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data


## hiérarchique

## DBscan

## t-sne