In [13]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *

In [2]:
#reading in sticker type data
feature_data = spark.read.parquet("gs://ds-url-catag/Stickers/stick_statistics/sticker_download/derived_features/")


In [3]:
#convert pyspark to pandas dataframe
import pandas as pd
feature_data_df  = feature_data.toPandas()

In [4]:
#checking size of dataframe
print 'Size of the dataframe: {}'.format(feature_data_df.shape)

Size of the dataframe: (2422112, 9)


In [5]:
#logarithmic transformation of two different variables
import numpy as np
feature_data_df['log_noofdays']=np.log10(1+feature_data_df.numofdays)
feature_data_df['log_avg_con_days']=np.log10(1+feature_data_df.avg_con_days)
feature_data_df['log_sticker_packs_download']=np.log10(1+feature_data_df.sticker_packs_download)
feature_data_df['log_distinct_sticker_packs_download']=np.log10(1+feature_data_df.distinct_sticker_packs_download)
feature_data_df['log_sum_paid']=np.log10(1+feature_data_df.sum_paid)
feature_data_df['log_sum_free']=np.log10(1+feature_data_df.sum_free)
feature_data_df['log_sum_subs']=np.log10(1+feature_data_df.sum_subs)
feature_data_df['log_sum_discont']=np.log10(1+feature_data_df.sum_discont)


In [6]:
#computing PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca_result = pca.fit_transform(feature_data_df[['log_noofdays','log_avg_con_days','log_sticker_packs_download','log_distinct_sticker_packs_download','log_sum_paid','log_sum_free','log_sum_subs','log_sum_discont']].values)

feature_data_df['pca-one'] = pca_result[:,0]
feature_data_df['pca-two'] = pca_result[:,1] 
feature_data_df['pca-three'] = pca_result[:,2]



print 'Explained variation per principal component: {}'.format(pca.explained_variance_ratio_)


Explained variation per principal component: [ 0.84976502  0.06767649  0.03883627]


In [7]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
model = kmeans.fit(feature_data_df[['pca-one','pca-two','pca-three']])
y_kmeans = kmeans.predict(feature_data_df[['pca-one','pca-two','pca-three']])
centers = kmeans.cluster_centers_

In [8]:
feature_data_df['clusters'] =  kmeans.labels_

In [9]:
original_data = sqlContext.createDataFrame(feature_data_df)

In [13]:
# original_data.columns

In [10]:
main_data = original_data.select('user_id_n','numofdays','avg_con_days','sticker_packs_download','distinct_sticker_packs_download','sum_paid','sum_free','sum_subs','sum_discont','clusters')

In [12]:
main_data.select('avg_con_days').describe().show()


+-------+-------------------+
|summary|       avg_con_days|
+-------+-------------------+
|  count|            2422112|
|   mean|0.09567270217066759|
| stddev|0.34783412371311795|
|    min|                0.0|
|    max|               48.0|
+-------+-------------------+



In [196]:
# calculate average sticker pack download to distinguish the clusters 
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank, col
window = Window.partitionBy('clusters')

main_data = main_data.withColumn('Avg_stick_download',avg(col('sticker_packs_download')).over(window))

#categorize ability to pay
main_data = main_data.withColumn('Abilty_to_pay',when(col('Avg_stick_download') > 43.0,'Very High').otherwise(when(col('Avg_stick_download') > 7.0,'High').otherwise(when(col('Avg_stick_download') > 3.0,'Medium').otherwise(when(col('Avg_stick_download') > 1.4,'low').otherwise('very low')))))


#get the percentiles for free,paid and subscribed among the clusters
#usage rate of free,paid and subscribed
main_data = main_data.withColumn('Freestickerusagerate',col('sum_free')/col('distinct_sticker_packs_download'))
main_data = main_data.withColumn('paidstickerusagerate',col('sum_paid')/col('distinct_sticker_packs_download'))
main_data = main_data.withColumn('subsstickerusagerate',col('sum_subs')/col('distinct_sticker_packs_download'))



# #calculate the top percent rank for users performing better in the same cluster 
window_freerank = Window.partitionBy('Abilty_to_pay').orderBy(main_data['Freestickerusagerate'].desc())
main_data = main_data.withColumn('Free_Stickers_rank',percent_rank().over(window_freerank).alias('rank'))

window_paidrank = Window.partitionBy('Abilty_to_pay').orderBy(main_data['paidstickerusagerate'].desc())
main_data = main_data.withColumn('Paid_Stickers_rank',percent_rank().over(window_freerank).alias('rank'))

window_subsrank = Window.partitionBy('Abilty_to_pay').orderBy(main_data['subsstickerusagerate'].desc())
main_data = main_data.withColumn('Subscribed_Stickers_rank',percent_rank().over(window_freerank).alias('rank'))


In [None]:
main_data.groupby('Free_sticker_usage').count().show()

In [199]:
main_data = main_data.withColumn('Free_sticker_usage',when(col('Free_Stickers_rank') <= 0.2,'High').otherwise(when( (col('Free_Stickers_rank') > 0.2) & (col('Free_Stickers_rank') < 0.5),'medium').otherwise('low')))
main_data = main_data.withColumn('paid_sticker_usage',when(col('Paid_Stickers_rank') <= 0.2,'High').otherwise(when( (col('Paid_Stickers_rank') > 0.2) & (col('Paid_Stickers_rank') < 0.5),'medium').otherwise('low')))
main_data = main_data.withColumn('Subscribed_sticker_usage',when(col('Subscribed_Stickers_rank') <= 0.2,'High').otherwise(when( (col('Subscribed_Stickers_rank') > 0.2) & (col('Subscribed_Stickers_rank') < 0.5),'medium').otherwise('low')))
main_data = main_data.withColumn('')



In [200]:
main_data.groupby('Abilty_to_pay','paid_sticker_usage').count().show()

+-------------+------------------+------+
|Abilty_to_pay|paid_sticker_usage| count|
+-------------+------------------+------+
|         High|              High| 77300|
|         High|            medium| 93278|
|         High|               low|167365|
|          low|              High|484992|
|    Very High|              High| 20894|
|    Very High|            medium| 31439|
|    Very High|               low| 52125|
|     very low|              High|736838|
|     very low|               low|151303|
|       Medium|              High|138066|
|       Medium|            medium|170949|
|       Medium|               low|297563|
+-------------+------------------+------+



In [202]:
main_data.select('avg_con_days').describe().show()

+-------+-------------------+
|summary|       avg_con_days|
+-------+-------------------+
|  count|            2422112|
|   mean|0.09567270217066759|
| stddev|0.34783412371311795|
|    min|                0.0|
|    max|               48.0|
+-------+-------------------+



In [None]:
main_data.columns