In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score 
# On importe la métrique "silhouette" qui nous servira à déterminer le nombre de clusters optimal.

%config InlineBackend.figure_format = 'png'

In [2]:
# Load the training set from a path relative to the notebook's working directory.
df = pd.read_csv('data/df_train.csv')

In [3]:
# Number of rows in the raw training set.
df.shape[0]

1855314

In [4]:
# Recode two contextid categories (0 -> 1 and 10 -> 3); per the original
# author's note this reorders the categories by increasing engagement.
df["contextid"] = df["contextid"].replace({0: 1, 10: 3})

# Columns whose missing values are replaced by 0: the ltf_* counters first,
# then the short-window display counters and the campaign CTR.
zero_filled_columns = [
    'ltf_lastpartnerclicktimestamp',
    'ltf_nbglobalclick_4w',
    'ltf_nbglobaldisplay_4w',
    'ltf_nbglobaldisplaysincelastpartnerproductview',
    'ltf_nbpartnerdisplayssincelastclick',
    'ltf_nbpartnerclick_4w',
    'ltf_nbpartnerdisplay_4w',
    'ltf_nbpartnersales_4w',
    'ltf_nbpartnerdisplay_90d',
    'ltf_nbpartnerclick_90d',
    'ltf_nbpartnersales_90d',
    'nbdisplay_1hour',
    'nbdisplaypartnerapprox_1d_sum_xdevice',
    'nbdisplayaffiliateapprox_1d_sum_xdevice',
    'nbdisplayglobalapprox_1d_sum_xdevice',
    'campaignctrlast24h',
]
for column in zero_filled_columns:
    df[column] = df[column].fillna(0)

In [5]:
# Remove every row belonging to the device flagged by the author as clicking
# an aberrant number of times. The id is stored in the CSV as the textual repr
# of a bytes object, hence the raw string.
aberrant_device_id = r"b'\x12\xb97|\xbe~\\\x94\xe8\xa7\r\x9d#\x92\x95#\xd1J\xfa\x95G\x93\x13\x0f\x8a9Y\xc7\xb8I\xac\xa8'"
aberrant_rows = df.index[df['hashed_xd_id'] == aberrant_device_id]
df.drop(aberrant_rows, inplace = True)

In [6]:
# Derived feature: product of the display's width and height.
df['display_size'] = df['display_width'].mul(df['display_height'])

In [7]:
# Features kept for clustering. Per the original note, some ltf_* variables
# were left out because of correlations seen in the correlation matrix.
cluster_features = [
    'contextid',
    'dayssincelastvisitdouble',
    'campaignctrlast24h',
    'nbdisplay_1hour',
    'nbdayssincelastclick',
    'display_size',
    'ltf_nbpartnerdisplayssincelastclick',
    'ltf_nbglobaldisplay_4w',
    'ltf_nbpartnerclick_90d',
    'ltf_nbpartnerdisplay_90d',
    'ltf_nbpartnersales_90d',
    'nbdisplayglobalapprox_1d_sum_xdevice',
]
df_clusters = df[cluster_features]

In [8]:
# Goal: group observations with similar characteristics and see whether the
# groups separate clicks from non-clicks.
# Standardise the features first: learn the per-column statistics, then apply them.
sc = StandardScaler()
sc.fit(df_clusters)
Z = sc.transform(df_clusters)
Z

array([[ 0.14873565, -0.40809959, -0.8906595 , ..., -0.07290974,
        -0.01397608, -0.32103262],
       [-1.92104218, -0.44867976, -0.360678  , ..., -0.07290974,
        -0.01397608,  0.8070464 ],
       [ 0.14873565, -0.33748309,  1.82717896, ..., -0.07290974,
        -0.01397608, -0.5009203 ],
       ...,
       [ 0.97664678, -0.36785013, -0.92834265, ..., -0.07290974,
        -0.01397608, -0.51352702],
       [ 1.39060235, -0.4168148 ,  2.20278178, ..., -0.07290974,
        -0.01397608, -0.34134216],
       [ 0.14873565, -0.1664472 , -0.50158415, ..., -0.07290974,
        -0.01397608, -0.45594838]])

In [9]:
# Sanity check: each standardised column should have mean 0 and
# (population, ddof=0) standard deviation 1.
print(Z.mean(axis=0))
print(Z.std(axis=0, ddof=0))

[-4.69520271e-17  2.45057639e-16 -5.90496082e-17 -2.05773590e-17
 -9.95337090e-18  1.23711223e-17  1.66154232e-17 -1.56712648e-17
  9.90263342e-18  4.87962244e-18 -2.46683444e-18 -3.14861388e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [10]:
# Run k-means on the standardised features, assuming 2 clusters as a first guess.
# random_state fixes the seed so the run is reproducible.
# n_init is pinned to 10, the historical scikit-learn default: recent releases
# default to n_init='auto', which performs a single k-means++ initialisation
# and may therefore yield different clusters than the ones shown below.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=146355806)
kmeans.fit(Z)

# Positions of the observations, ordered by their cluster label.
idk = np.argsort(kmeans.labels_)

# Show each observation's index label next to the cluster it was assigned to.
print(pd.DataFrame({"Observation" : df_clusters.index[idk], "Cluster" : kmeans.labels_[idk]}))

         Observation  Cluster
0                  0        0
1            1236366        0
2            1236364        0
3            1236363        0
4            1236362        0
5            1236360        0
6            1236359        0
7            1236358        0
8            1236357        0
9            1236356        0
10           1236368        0
11           1236355        0
12           1236352        0
13           1236350        0
14           1236349        0
15           1236347        0
16           1236346        0
17           1236345        0
18           1236344        0
19           1236343        0
20           1236342        0
21           1236354        0
22           1236369        0
23           1236370        0
24           1236371        0
25           1236392        0
26           1236391        0
27           1236390        0
28           1236389        0
29           1236388        0
...              ...      ...
1610464      1122020        1
1610465   

In [11]:
# Cluster label of every observation, as a plain Python list.
nb_clusters = kmeans.labels_.tolist()
# Print the number of observations in each of the two clusters (labels 0 and 1).
for cluster_label in (0, 1):
    print(nb_clusters.count(cluster_label))

1604453
6041


Le découpage en deux clusters ne nous permettra pas de discriminer les clics des non-clics ; le deuxième cluster est en effet trop petit. 
Essayons avec 3 clusters. 

In [12]:
# Sweep the number of clusters over a plausible range (2 to 10) and record the
# mean silhouette score for each value.
# NOTE(review): this cell is disabled; presumably because silhouette_score
# relies on pairwise distances and is too costly on the full dataset - TODO confirm.
# res = np.arange(9, dtype="double")
# for k in np.arange(9):
    # km = KMeans(n_clusters=k+2)
    # km.fit(Z)
    # res[k] = silhouette_score(Z, km.labels_)
# print(res)

In [13]:
# Disabled plot of the mean silhouette score against the number of clusters;
# it needs `res` from the (also disabled) silhouette sweep cell.
# plt.figure(figsize=(8,4))
# plt.plot(np.arange(2,11,1), res)
# plt.title("Coefficient de silhouette moyen en fonction du nombre de clusters", size=16)
# plt.xlabel("Nombre de clusters", size=10)
# plt.ylabel("Score de silhouette moyen", size=10)
# plt.show()

In [14]:
# Refit k-means with 3 clusters on the standardised features, same seed as before.
# n_init is pinned to 10, the historical scikit-learn default: recent releases
# default to n_init='auto', which performs a single k-means++ initialisation
# and may therefore yield different clusters than the ones shown below.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=146355806)
kmeans.fit(Z)
# Positions of the observations, ordered by their cluster label.
idk = np.argsort(kmeans.labels_)
# Show each observation's index label next to the cluster it was assigned to.
print(pd.DataFrame({"Observation" : df_clusters.index[idk], "Cluster" : kmeans.labels_[idk]}))

         Observation  Cluster
0            1181351        0
1             238204        0
2             238205        0
3             238208        0
4            1463589        0
5            1656917        0
6            1463590        0
7            1146803        0
8            1285265        0
9             835818        0
10           1656908        0
11            464477        0
12           1656903        0
13            238225        0
14           1285262        0
15            464486        0
16            464471        0
17            835808        0
18            672321        0
19            672322        0
20           1463603        0
21           1146819        0
22           1285259        0
23            238239        0
24           1656890        0
25           1463607        0
26            672326        0
27           1146825        0
28            238249        0
29           1656881        0
...              ...      ...
1610464       669199        2
1610465   

In [15]:
# Cluster label of every observation, as a plain Python list.
nb_clusters1 = kmeans.labels_.tolist()
# Print the number of observations in each of the three clusters (labels 0, 1, 2).
for cluster_label in (0, 1, 2):
    print(nb_clusters1.count(cluster_label))

239141
6029
1365324


Étudions les caractéristiques des observations dans chacun des clusters. 

In [16]:
# Table pairing each observation's index label with its cluster, ordered by cluster.
A = pd.DataFrame({"Observation" : df_clusters.index[idk], "Cluster" : kmeans.labels_[idk]})

# Index labels of the observations in each cluster. Boolean masks replace the
# former row-by-row Python loop, which performed several Series lookups per
# row; the resulting lists have the same content in the same order.
L1 = A.loc[A["Cluster"] == 0, "Observation"].tolist()  # cluster 1 (label 0)
L2 = A.loc[A["Cluster"] == 1, "Observation"].tolist()  # cluster 2 (label 1)
L3 = A.loc[A["Cluster"] == 2, "Observation"].tolist()  # cluster 3 (label 2)

In [17]:
# One sub-table of the full data per cluster, selected by observation index label.
df_cluster_1, df_cluster_2, df_cluster_3 = (
    df.loc[labels] for labels in (L1, L2, L3)
)

In [18]:
# Sizes of the three per-cluster label lists, to compare with the counts printed earlier.
tuple(map(len, (L1, L2, L3)))

(239141, 6029, 1365324)

On retrouve bien le nombre d'observations présentes dans chaque cluster. 

In [19]:
# For each cluster, show the count and the proportion of clicked (1) versus
# non-clicked (0) displays.
for cluster_no, cluster_df in enumerate((df_cluster_1, df_cluster_2, df_cluster_3), start=1):
    clicked = cluster_df['is_display_clicked']
    display(pd.DataFrame({
        f'Nb dans cluster {cluster_no}' : clicked.value_counts(),
        f'Prop dans cluster {cluster_no}' : clicked.value_counts(normalize=True),
    }))

Unnamed: 0,Nb dans cluster 1,Prop dans cluster 1
0,230664,0.964552
1,8477,0.035448


Unnamed: 0,Nb dans cluster 2,Prop dans cluster 2
0,5583,0.926024
1,446,0.073976


Unnamed: 0,Nb dans cluster 3,Prop dans cluster 3
0,1282495,0.939334
1,82829,0.060666


In [20]:
# Summary statistics of the numeric columns for the observations in cluster 1.
df_cluster_1.describe()

Unnamed: 0.1,Unnamed: 0,contextid,target_env,rtbtypeid,rtbadvisibility,rtb_detectedlanguage,urlhash2,urlhash3,urlhash4,googleviewability,...,nbdisplaypartnerapprox_1d_sum_xdevice,nbdisplayaffiliateapprox_1d_sum_xdevice,nbdisplayglobalapprox_1d_sum_xdevice,valueperclick,display_width,display_height,display_timestamp,is_display_clicked,zonecostineuro,display_size
count,239141.0,239141.0,239141.0,237865.0,237865.0,237865.0,207539.0,207539.0,207539.0,64935.0,...,239141.0,239141.0,239141.0,239141.0,239141.0,239141.0,239141.0,239141.0,239141.0,239141.0
mean,931869.2,6.03766,1.073275,37.0689,0.729355,94.218649,110581100.0,-176240200.0,-187409300.0,59.410919,...,35.259333,58.694222,126.633865,0.311313,362.944589,250.710196,1602853000.0,0.035448,0.935272,91964.04
std,537753.1,2.455203,0.260588,33.767617,0.791453,1037.636807,1217655000.0,1124695000.0,1153933000.0,34.503522,...,20.218316,87.052084,122.536115,0.189056,277.651188,234.852831,177578.4,0.184909,2.53871,94564.07
min,3.0,1.0,1.0,3.0,0.0,-1.0,-2145916000.0,-2147307000.0,-2147255000.0,-1.0,...,0.0,0.0,13.0,0.034118,0.0,0.0,1602547000.0,0.0,0.0,0.0
25%,466254.0,6.0,1.0,4.0,0.0,12.0,-846280300.0,-1089605000.0,-1089605000.0,30.0,...,21.0,10.826296,47.265278,0.136416,300.0,50.0,1602694000.0,0.0,0.05,16000.0
50%,936287.0,6.0,1.0,29.0,1.0,12.0,364099700.0,-218081400.0,-523754600.0,68.0,...,34.0,28.0,83.75125,0.254042,300.0,250.0,1602849000.0,0.0,0.254518,75000.0
75%,1397287.0,8.0,1.0,73.0,1.0,127.0,1340720000.0,640425400.0,639792700.0,90.0,...,48.868148,67.0,169.42,0.512843,360.0,480.0,1603022000.0,0.0,0.822104,180000.0
max,1856864.0,9.0,2.0,134.0,2.0,31770.0,2146753000.0,2147353000.0,2147359000.0,100.0,...,460.27787,2269.557407,3953.888889,0.790542,1280.0,1286.0,1603152000.0,1.0,100.0,1028800.0


In [21]:
# Summary statistics of the numeric columns for the observations in cluster 2.
df_cluster_2.describe()

Unnamed: 0.1,Unnamed: 0,contextid,target_env,rtbtypeid,rtbadvisibility,rtb_detectedlanguage,urlhash2,urlhash3,urlhash4,googleviewability,...,nbdisplaypartnerapprox_1d_sum_xdevice,nbdisplayaffiliateapprox_1d_sum_xdevice,nbdisplayglobalapprox_1d_sum_xdevice,valueperclick,display_width,display_height,display_timestamp,is_display_clicked,zonecostineuro,display_size
count,6029.0,6029.0,6029.0,6012.0,6012.0,6012.0,5379.0,5379.0,5379.0,1860.0,...,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0
mean,921712.1,7.374026,1.137502,34.008816,0.738689,69.295576,137053400.0,-90721270.0,-138581800.0,60.796237,...,25.175368,30.955367,82.695714,0.306986,327.919721,241.687842,1602856000.0,0.073976,1.567993,88429.49
std,538089.7,1.841765,0.344405,32.836823,0.794307,562.08658,1259048000.0,1138867000.0,1155705000.0,30.823743,...,21.053366,65.393863,99.7675,0.18924,266.864481,231.25126,176735.0,0.261753,3.776451,97440.88
min,94.0,1.0,1.0,3.0,0.0,1.0,-2140960000.0,-2146561000.0,-2146412000.0,-1.0,...,0.0,0.0,0.0,0.034118,0.0,0.0,1602550000.0,0.0,0.008475,0.0
25%,458164.0,6.0,1.0,4.0,0.0,12.0,-899849500.0,-1005008000.0,-1089605000.0,41.0,...,6.901435,2.0,28.0,0.13905,160.0,50.0,1602698000.0,0.0,0.118,16000.0
50%,918842.0,8.0,1.0,16.0,1.0,16.0,364099700.0,-61629820.0,-262796200.0,68.0,...,21.395,10.146481,54.358981,0.245336,300.0,250.0,1602850000.0,0.0,0.471,75000.0
75%,1389878.0,9.0,1.0,73.0,1.0,127.0,1384186000.0,728231300.0,656322500.0,86.0,...,41.0,32.0,98.46875,0.51222,336.0,300.0,1603023000.0,0.0,1.42,153600.0
max,1856705.0,9.0,2.0,134.0,2.0,30724.0,2141561000.0,2146829000.0,2144924000.0,99.0,...,193.195278,1341.353333,1416.062778,0.790542,1280.0,1280.0,1603149000.0,1.0,84.459663,1024000.0


In [22]:
# Summary statistics of the numeric columns for the observations in cluster 3.
df_cluster_3.describe()

Unnamed: 0.1,Unnamed: 0,contextid,target_env,rtbtypeid,rtbadvisibility,rtb_detectedlanguage,urlhash2,urlhash3,urlhash4,googleviewability,...,nbdisplaypartnerapprox_1d_sum_xdevice,nbdisplayaffiliateapprox_1d_sum_xdevice,nbdisplayglobalapprox_1d_sum_xdevice,valueperclick,display_width,display_height,display_timestamp,is_display_clicked,zonecostineuro,display_size
count,1365324.0,1365324.0,1365324.0,1359706.0,1359706.0,1359706.0,1246628.0,1246628.0,1246628.0,434862.0,...,1365324.0,1365324.0,1365324.0,1365324.0,1365324.0,1365324.0,1365324.0,1365324.0,1365324.0,1365324.0
mean,928730.8,5.563513,1.110386,36.55437,0.7276036,94.72602,97444250.0,-72757860.0,-108930300.0,59.22234,...,10.71949,10.62098,29.84569,0.2532534,285.7652,224.1796,1602857000.0,0.06066619,1.140309,80852.65
std,535243.2,2.401246,0.3133698,33.39118,0.8085108,983.741,1260949000.0,1157488000.0,1173646000.0,30.52539,...,13.8488,20.35892,34.24088,0.1837486,249.8844,227.649,174597.4,0.2387171,3.030289,97538.06
min,0.0,1.0,1.0,3.0,0.0,-1.0,-2147479000.0,-2147454000.0,-2147479000.0,-1.0,...,0.0,0.0,0.0,0.03411759,0.0,0.0,1602547000.0,0.0,0.0,0.0
25%,465844.8,5.0,1.0,4.0,0.0,12.0,-971882000.0,-993561600.0,-1017692000.0,37.0,...,1.0,0.0,6.0,0.1029036,0.0,0.0,1602700000.0,0.0,0.079,0.0
50%,927781.5,6.0,1.0,29.0,0.0,16.0,260310700.0,-61629820.0,-285648400.0,66.0,...,5.0,3.0,17.51211,0.183839,300.0,250.0,1602855000.0,0.0,0.3180814,75000.0
75%,1392135.0,7.0,1.0,73.0,1.0,127.0,1384186000.0,834992600.0,783082800.0,85.0,...,14.69028,11.0,41.0,0.4015411,320.0,280.0,1603021000.0,0.0,0.9850001,100000.0
max,1856872.0,9.0,2.0,134.0,2.0,31770.0,2147366000.0,2147451000.0,2147451000.0,100.0,...,189.0,303.6156,305.2078,0.7905422,1374.0,1286.0,1603155000.0,1.0,100.0,1101600.0
