In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *

In [2]:
#reading in sticker type data
feature_data = spark.read.parquet("gs://ds-url-catag/Stickers/stick_statistics/sticker_download/derived_features/")


In [3]:
#convert pyspark to pandas dataframe
import pandas as pd
feature_data_df  = feature_data.toPandas()

In [4]:
#checking size of dataframe
print 'Size of the dataframe: {}'.format(feature_data_df.shape)

Size of the dataframe: (2422112, 9)


In [6]:
#logarithmic transformation of two different variables
import numpy as np
feature_data_df['log_noofdays']=np.log10(1+feature_data_df.numofdays)
feature_data_df['log_avg_con_days']=np.log10(1+feature_data_df.avg_con_days)
feature_data_df['log_sticker_packs_download']=np.log10(1+feature_data_df.sticker_packs_download)
feature_data_df['log_distinct_sticker_packs_download']=np.log10(1+feature_data_df.distinct_sticker_packs_download)
feature_data_df['log_sum_paid']=np.log10(1+feature_data_df.sum_paid)
feature_data_df['log_sum_free']=np.log10(1+feature_data_df.sum_free)
feature_data_df['log_sum_subs']=np.log10(1+feature_data_df.sum_subs)
feature_data_df['log_sum_discont']=np.log10(1+feature_data_df.sum_discont)


In [8]:
#computing PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca_result = pca.fit_transform(feature_data_df[['log_noofdays','log_avg_con_days','log_sticker_packs_download','log_distinct_sticker_packs_download','log_sum_paid','log_sum_free','log_sum_subs','log_sum_discont']].values)

feature_data_df['pca-one'] = pca_result[:,0]
feature_data_df['pca-two'] = pca_result[:,1] 
feature_data_df['pca-three'] = pca_result[:,2]



print 'Explained variation per principal component: {}'.format(pca.explained_variance_ratio_)


Explained variation per principal component: [0.84976502 0.06767649 0.03883627]


In [9]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
model = kmeans.fit(feature_data_df[['pca-one','pca-two','pca-three']])
y_kmeans = kmeans.predict(feature_data_df[['pca-one','pca-two','pca-three']])
centers = kmeans.cluster_centers_

In [10]:
feature_data_df['clusters'] =  kmeans.labels_

In [11]:
original_data = sqlContext.createDataFrame(feature_data_df)

In [12]:
#distributionofclusters
data = original_data.groupby('clusters').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------+------+------------------+
|clusters|   cnt|        percentage|
+--------+------+------------------+
|       2|821266| 33.90701998916648|
|       0|662475|27.351129922976313|
|       4|452372| 18.67675813504908|
|       1|377716|15.594489437317517|
|       3|108283| 4.470602515490613|
+--------+------+------------------+



In [13]:
aggregated_demo_stick = sqlContext.read.parquet("gs://ds-url-catag/clustering/user_profile_data/")

In [14]:
aggregated_demo_stick.columns

['user_id_n',
 'city',
 'Country',
 'age_group',
 'gender',
 'Device_Brand',
 'Device_Model',
 'OS_version',
 'platform',
 'count']

In [15]:
new_data = original_data.join(aggregated_demo_stick,aggregated_demo_stick.user_id_n == original_data.user_id_n).select([original_data.user_id_n,original_data.clusters,original_data.numofdays,original_data.avg_con_days,original_data.sticker_packs_sent,original_data.distinct_sticker_packs_sent,original_data.sum_paid,original_data.sum_free,original_data.sum_subs,original_data.sum_discont]+[aggregated_demo_stick.city,aggregated_demo_stick.Country,aggregated_demo_stick.age_group,aggregated_demo_stick.gender,aggregated_demo_stick.Device_Brand,aggregated_demo_stick.Device_Model,aggregated_demo_stick.OS_version,aggregated_demo_stick.platform])

In [None]:
# cluster_8.groupby('clusters','numofdays','avg_con_days','sticker_packs_sent','distinct_sticker_packs_sent','sum_paid','sum_free','sum_subs').count().sort(col("clusters").desc()).show(1000)

In [147]:
cluster_1 = new_data.where(col('clusters')==1)

In [148]:
from pyspark.sql.functions import mean, min, max

In [149]:
cluster_1.select([mean('numofdays'), min('numofdays'), max('numofdays')]).show()

+-----------------+--------------+--------------+
|   avg(numofdays)|min(numofdays)|max(numofdays)|
+-----------------+--------------+--------------+
|8.457347912529633|             1|            26|
+-----------------+--------------+--------------+



In [150]:
cluster_1.select([mean('avg_con_days'), min('avg_con_days'), max('avg_con_days')]).show()

+-----------------+-----------------+-----------------+
|avg(avg_con_days)|min(avg_con_days)|max(avg_con_days)|
+-----------------+-----------------+-----------------+
| 1.41172437765587|              0.0|             10.0|
+-----------------+-----------------+-----------------+



In [151]:
cluster_1.select([mean('sticker_packs_sent'), min('sticker_packs_sent'), max('sticker_packs_sent')]).show()

+-----------------------+-----------------------+-----------------------+
|avg(sticker_packs_sent)|min(sticker_packs_sent)|max(sticker_packs_sent)|
+-----------------------+-----------------------+-----------------------+
|     23.352051266939537|                      6|                     76|
+-----------------------+-----------------------+-----------------------+



In [152]:
cluster_1.select([mean('distinct_sticker_packs_sent'), min('distinct_sticker_packs_sent'), max('distinct_sticker_packs_sent')]).show()

+--------------------------------+--------------------------------+--------------------------------+
|avg(distinct_sticker_packs_sent)|min(distinct_sticker_packs_sent)|max(distinct_sticker_packs_sent)|
+--------------------------------+--------------------------------+--------------------------------+
|               6.064450519777776|                               3|                              49|
+--------------------------------+--------------------------------+--------------------------------+



In [153]:
cluster_1.select([mean('sum_paid'), min('sum_paid'), max('sum_paid')]).show()

+-------------------+-------------+-------------+
|      avg(sum_paid)|min(sum_paid)|max(sum_paid)|
+-------------------+-------------+-------------+
|0.00873212230263419|            0|           14|
+-------------------+-------------+-------------+



In [154]:
cluster_1.select([mean('sum_free'), min('sum_free'), max('sum_free')]).show()

+-----------------+-------------+-------------+
|    avg(sum_free)|min(sum_free)|max(sum_free)|
+-----------------+-------------+-------------+
|5.091552573750952|            1|           44|
+-----------------+-------------+-------------+



In [155]:
cluster_1.select([mean('sum_subs'), min('sum_subs'), max('sum_subs')]).show()

+--------------------+-------------+-------------+
|       avg(sum_subs)|min(sum_subs)|max(sum_subs)|
+--------------------+-------------+-------------+
|1.311021117062158E-4|            0|            9|
+--------------------+-------------+-------------+



In [156]:
cluster_1.select([mean('sum_discont'), min('sum_discont'), max('sum_discont')]).show()

+-------------------+----------------+----------------+
|   avg(sum_discont)|min(sum_discont)|max(sum_discont)|
+-------------------+----------------+----------------+
|0.06770882822375883|               0|               7|
+-------------------+----------------+----------------+

