In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
sc.stop()
from pyspark.sql import SQLContext
conf = SparkConf().setAppName("App")
conf = (conf.set('spark.driver.memory', '60G').set("spark.yarn.executor.memoryOverhead", '4096'))
sc =SparkContext(conf=conf)
sqlContext = SQLContext(sc)
import pyspark.sql.functions as func
import sys
from pyspark.sql.functions import countDistinct

In [2]:
#reading in sticker type data
feature_data = sqlContext.read.parquet("gs://ds-url-catag/stick_statistics/derived_features/")


In [3]:
#convert pyspark to pandas dataframe
import pandas as pd
import pyarrow.parquet as pq
feature_data_df  = feature_data.toPandas()

In [4]:
#checking size of dataframe
print 'Size of the dataframe: {}'.format(feature_data_df.shape)

Size of the dataframe: (7495040, 9)


In [5]:
#logarithmic transformation of two different variables
import numpy as np
feature_data_df['log_noofdays']=np.log10(1+feature_data_df.numofdays)
feature_data_df['log_avg_con_days']=np.log10(1+feature_data_df.avg_con_days)
feature_data_df['log_sticker_packs_sent']=np.log10(1+feature_data_df.sticker_packs_sent)
feature_data_df['log_distinct_sticker_packs_sent']=np.log10(1+feature_data_df.distinct_sticker_packs_sent)
feature_data_df['log_sum_paid']=np.log10(1+feature_data_df.sum_paid)
feature_data_df['log_sum_free']=np.log10(1+feature_data_df.sum_free)
feature_data_df['log_sum_subs']=np.log10(1+feature_data_df.sum_subs)
feature_data_df['log_sum_discont']=np.log10(1+feature_data_df.sum_discont)


In [6]:
#computing PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=8)
pca_result = pca.fit_transform(feature_data_df[['log_noofdays','log_avg_con_days','log_sticker_packs_sent','log_distinct_sticker_packs_sent','log_sum_paid','log_sum_free','log_sum_subs','log_sum_discont']].values)

feature_data_df['pca-one'] = pca_result[:,0]
feature_data_df['pca-two'] = pca_result[:,1] 
feature_data_df['pca-three'] = pca_result[:,2]
feature_data_df['pca-four'] = pca_result[:,3]
feature_data_df['pca-five'] = pca_result[:,4]
feature_data_df['pca-six'] = pca_result[:,5]
feature_data_df['pca-seven'] = pca_result[:,6]
feature_data_df['pca-eight'] = pca_result[:,7]


print 'Explained variation per principal component: {}'.format(pca.explained_variance_ratio_)


Explained variation per principal component: [  8.46172910e-01   6.66238711e-02   5.91211599e-02   1.92696980e-02
   4.88561740e-03   3.16695632e-03   7.46680208e-04   1.31073945e-05]


In [9]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=9)
model = kmeans.fit(feature_data_df[['pca-one','pca-two','pca-three','pca-four','pca-five','pca-six','pca-seven']])
y_kmeans = kmeans.predict(feature_data_df[['pca-one','pca-two','pca-three','pca-four','pca-five','pca-six','pca-seven']])
centers = kmeans.cluster_centers_

In [10]:
feature_data_df['clusters'] =  kmeans.labels_

In [13]:
feature_data_df

Unnamed: 0,user_id_n,numofdays,avg_con_days,sticker_packs_sent,distinct_sticker_packs_sent,sum_paid,sum_free,sum_subs,sum_discont,log_noofdays,...,log_sum_discont,pca-one,pca-two,pca-three,pca-four,pca-five,pca-six,pca-seven,pca-eight,clusters
0,100006037187,10,1.0,12,6,0,6,0,0,1.041393,...,0.00000,0.462635,-0.421081,-0.065984,-0.115809,-0.049006,0.007957,-0.000294,-7.658229e-06,7
1,100006265659,11,5.0,34,4,0,3,0,0,1.079181,...,0.00000,0.856933,-0.026602,-0.320885,0.221449,0.010511,-0.025816,-0.004131,-4.551636e-05,6
2,100006468862,5,0.0,6,1,0,0,0,1,0.778151,...,0.30103,-0.275810,0.346488,-0.206023,-0.264017,0.327451,0.130157,-0.018184,-3.296820e-05,5
3,100006620862,2,0.0,2,2,0,2,0,0,0.477121,...,0.00000,-0.540172,-0.166604,0.032574,-0.047862,-0.023205,0.006947,0.000677,1.161227e-05,5
4,100006626295,2,0.0,2,1,0,1,0,0,0.477121,...,0.00000,-0.626555,0.044453,-0.065705,-0.065234,-0.018127,0.011740,0.000940,2.600394e-05,1
5,100006915377,7,0.0,8,3,2,1,0,0,0.903090,...,0.00000,0.008528,0.024133,-0.081163,-0.308850,0.158392,-0.119429,0.465313,-4.415207e-03,3
6,100007210525,7,1.0,17,6,0,5,0,0,0.903090,...,0.00000,0.502657,-0.286960,0.051884,-0.026875,-0.010517,-0.018598,-0.002368,-3.082000e-05,7
7,100007397690,5,1.0,9,5,0,5,0,0,0.778151,...,0.00000,0.230330,-0.350786,0.021934,0.040823,-0.040645,0.006423,0.000032,-1.388719e-05,7
8,100007668849,12,2.0,36,6,0,5,0,0,1.113943,...,0.00000,0.886972,-0.186254,-0.071951,-0.014226,-0.016369,-0.013802,-0.002528,-2.986693e-05,7
9,100007871646,1,0.0,1,1,0,1,0,0,0.301030,...,0.00000,-0.839512,-0.011309,-0.016801,0.040195,-0.012300,0.007812,0.000901,1.733317e-05,1


In [16]:
original_data = sqlContext.createDataFrame(feature_data_df)

In [21]:
#distributionofclusters
data = original_data.groupby('clusters').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------+-------+------------------+
|clusters|    cnt|        percentage|
+--------+-------+------------------+
|       1|2190787| 29.22982399026556|
|       5|1617213|21.577109661856376|
|       3| 890646|11.883138715737342|
|       0| 745652| 9.948606011442234|
|       7| 661970|  8.83210763384852|
|       6| 490909| 6.549784924430023|
|       8| 436002| 5.817207112970711|
|       2| 247189| 3.298034433438647|
|       4| 214672|2.8641875160105883|
+--------+-------+------------------+



In [23]:
aggregated_demo_stick = sqlContext.read.parquet("gs://ds-url-catag/clustering/user_profile_data/")

In [37]:
aggregated_demo_stick.columns

['user_id_n',
 'city',
 'Country',
 'age_group',
 'gender',
 'Device_Brand',
 'Device_Model',
 'OS_version',
 'platform',
 'count']

In [39]:
new_data = original_data.join(aggregated_demo_stick,aggregated_demo_stick.user_id_n == original_data.user_id_n).select([original_data.user_id_n,original_data.clusters]+[aggregated_demo_stick.city,aggregated_demo_stick.Country,aggregated_demo_stick.age_group,aggregated_demo_stick.gender,aggregated_demo_stick.Device_Brand,aggregated_demo_stick.Device_Model,aggregated_demo_stick.OS_version,aggregated_demo_stick.platform])

In [47]:
cluster_0 = new_data.where(col('clusters')==0)

In [50]:
cluster_0.groupby('age_group').count().show()

+---------+------+
|age_group| count|
+---------+------+
|    45-54| 14482|
|    55-64|  2257|
| 18 under|176761|
|      75+|  4930|
|    35-44| 83615|
|    25-34|258467|
|    18-24|316745|
|    65-74|  1353|
|     NULL|    36|
+---------+------+



In [51]:
cluster_0.groupby('gender').count().show()

+------+------+
|gender| count|
+------+------+
|Female|370140|
|  Male|488502|
|  NULL|     4|
+------+------+



In [55]:
cluster_0.groupby('platform').count().show()

+-----------+------+
|   platform| count|
+-----------+------+
|app-android|858634|
|    app-ios|     5|
|       NULL|     7|
+-----------+------+



In [62]:
#distributionbycountry
data = cluster_0.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------------+------+--------------------+
|      country|   cnt|          percentage|
+-------------+------+--------------------+
|           ID|694288|   84.83148263997498|
|         NULL| 71205|   8.700173013762903|
|           SA| 12677|  1.5489374804504221|
|           OM|  3277| 0.40039978886456057|
|           CA|  3123|  0.3815833203002815|
|           MY|  2657|  0.3246451751642164|
|           AE|  2650| 0.32378988113856744|
|           US|  2281| 0.27870366750078196|
|           VE|  2121| 0.25915408977166093|
|           TW|  1874| 0.22897442915233032|
|           NG|  1767|  0.2159006490459806|
|           ZA|  1490| 0.18205544260243978|
|           GB|  1396| 0.17057006568658117|
|           IN|  1307| 0.15969561307475758|
|           HK|  1008| 0.12316233969346263|
|           EG|   872| 0.10654519862370972|
|           JB|   723| 0.08833965436346575|
|           SG|   678| 0.08284133562715046|
|           CO|   625|  0.0763655380043791|
|           JT|   583| 0.0712337

In [66]:
#distributionbycity
data = cluster_0.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|583408|    76.6703901804371|
|       Asia/Makassar|106201|  13.956737150592039|
|         Asia/Riyadh| 13266|  1.7433929533597046|
|   Asia/Kuala_Lumpur|  7157|  0.9405595784106291|
|       Asia/Jayapura|  7099|  0.9329373266923371|
|        Asia/Bangkok|  3433|  0.4511584508430473|
|         Asia/Muscat|  3280| 0.43105147648272507|
|     America/Toronto|  2297| 0.30186745167098156|
|         Asia/Taipei|  2071|  0.2721669535962572|
|      Asia/Hong_Kong|  2060|  0.2707213541324432|
|     America/Caracas|  1986| 0.26099641228496706|
|        Africa/Lagos|  1756| 0.23077024167794674|
| Africa/Johannesburg|  1482|  0.1947616732156703|
|       Asia/Shanghai|  1355|  0.1780715703152721|
|          Asia/Dubai|  1267| 0.16650677460475996|
|       Europe/London|  1192| 0.15665041462420987|
|    America/New_York|  1132| 0

In [64]:
#distributionbyDevice_Brand
data = cluster_0.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|247729|   32.56263004796412|
|                OPPO|135832|  17.854377826879624|
|              Xiaomi|123354|  16.214212574775523|
|                asus| 49303|   6.480611269793908|
|                vivo| 38608|   5.074811672802937|
|              LENOVO| 28155|  3.7008216599608033|
|               ADVAN| 22160|   2.912811507182788|
|               Haier| 15262|  2.0061069143783263|
|              HUAWEI| 14707|  1.9331551821361581|
|                Sony| 12350|  1.6233403480914905|
|             Hisense|  5613|  0.7377983298653876|
|                 LGE|  5548|  0.7292544332964851|
|            EVERCOSS|  3639|  0.4783267632959461|
|             Coolpad|  3535| 0.46465652878570196|
|            POLYTRON|  3070| 0.40353480717739887|
|                LAVA|  3034| 0.39880280292385284|
|               HIMAX|  2562|  

In [65]:
#distributionbyDevice_Brand
data = cluster_0.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|46914|   6.118911445732062|
|            Redmi 4A|25548|  3.3321812170261054|
|            SM-G532G|23727|  3.0946713533888524|
|               A1601|21308|   2.779165389556609|
|            Redmi 4X|20203|  2.6350421609354315|
|        Redmi Note 4|14593|  1.9033396156279143|
|            SM-J111F|13460|  1.7555643956932587|
|            SM-J500G|13446|  1.7537384000365202|
|            SM-G610F|12628|  1.6470480823784897|
|        Redmi Note 3|11543|  1.5055334189812248|
|             CPH1701|11325|  1.4771000580405762|
|                A33w|10111|  1.3187601489490743|
|            SM-J200G|10104|  1.3178471511207048|
|            Redmi 3S| 9844|  1.2839358032098396|
|            SM-J320G| 9619|  1.2545894444408214|
|            SM-G530H| 9611|  1.2535460183512563|
|     Andromax A16C3H| 9600|  1.2521113074781043|


In [67]:
cluster_1 = new_data.where(col('clusters')==1)

In [68]:
#distributionbycountry
data = cluster_1.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------------+-------+--------------------+
|      country|    cnt|          percentage|
+-------------+-------+--------------------+
|           ID|1920780|   86.96356891192085|
|         NULL|  76056|  3.4434454737997333|
|           SA|  65332|   2.957914953380196|
|           NG|  15134|  0.6851938545346214|
|           OM|  14012|  0.6343951559230286|
|           CA|  13278|   0.601163208702967|
|           MY|   9345|  0.4230961127676779|
|           AE|   8730|  0.3952519063094519|
|           VE|   8190|  0.3708033347851559|
|           US|   7321|  0.3314592446840203|
|           ZA|   4783| 0.21655095851982917|
|           TW|   4248| 0.19232876265779514|
|           GB|   3897| 0.17643719116700274|
|           CO|   3476| 0.15737636040454236|
|           EG|   3400| 0.15393545033815995|
|           IN|   3180| 0.14397492119863198|
|           HK|   2548| 0.11536103748871517|
|           MX|   1974| 0.08937311146103757|
|           BH|   1967| 0.08905618553387078|
|         

In [69]:
#distributionbycountry
data = cluster_1.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-------+--------------------+
|                city|    cnt|          percentage|
+--------------------+-------+--------------------+
|        Asia/Jakarta|1672799|   76.22608648895455|
|       Asia/Makassar| 258771|  11.791674090451547|
|         Asia/Riyadh|  69369|  3.1610058313355567|
|   Asia/Kuala_Lumpur|  20555|  0.9366500146045404|
|       Asia/Jayapura|  19384|  0.8832898994451187|
|        Africa/Lagos|  15369|  0.7003344234715244|
|         Asia/Muscat|  14274|  0.6504374754787259|
|     America/Toronto|  10108|  0.4606012331609193|
|        Asia/Bangkok|   9323|  0.4248303617688218|
|     America/Caracas|   7835|  0.3570251940854573|
|      Asia/Hong_Kong|   5378|  0.2450646450276438|
| Africa/Johannesburg|   4854|  0.2211870187735558|
|         Asia/Taipei|   4813| 0.21931873122314052|
|          Asia/Dubai|   4653| 0.21202785297761745|
|    America/New_York|   3790| 0.17270267844082746|
|      America/Bogota|   3722| 0.16960405518648017|
|       Euro

In [70]:
#distributionbycountry
data = cluster_1.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|756778|   34.51731594524687|
|                OPPO|367163|  16.746630153631152|
|              Xiaomi|356824|  16.275059191528783|
|                asus|139088|   6.343927070015905|
|                vivo|106239|   4.845655038475064|
|              LENOVO| 81032|   3.695941406429949|
|              HUAWEI| 50071|  2.2837827298024727|
|               ADVAN| 48819|  2.2266778991078056|
|                Sony| 40226|   1.834743545945443|
|               Haier| 39229|   1.789269491470536|
|                 LGE| 18118|  0.8263780531357713|
|             Hisense| 15820|  0.7215642344965174|
|             Coolpad|  9375|  0.4276020669029615|
|            EVERCOSS|  9146|  0.4171571737487451|
|             Infinix|  9096|  0.4148766293919293|
|                LAVA|  7754|  0.3536668188549934|
|            POLYTRON|  7549| 0

In [71]:
#distributionbycountry
data = cluster_1.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Model|   cnt|          percentage|
+--------------------+------+--------------------+
|                A37f|118327|      5.395362961693|
|            Redmi 4A| 67256|  3.0666756644859112|
|            SM-G532G| 60561|   2.761403367981017|
|               A1601| 58718|   2.677367991960327|
|            Redmi 4X| 57945|  2.6421214669120396|
|        Redmi Note 4| 45114|  2.0570656287560576|
|            SM-G610F| 38651|  1.7623718494713478|
|            SM-J500G| 37053|  1.6895077524116282|
|        Redmi Note 3| 33255|    1.51633012998809|
|            SM-J111F| 32836|   1.497224963111981|
|             CPH1701| 32223|  1.4692739671810622|
|            Redmi 3S| 27247|  1.2423830116308974|
|            SM-G530H| 27038|  1.2328532267213346|
|             Redmi 3| 25719|  1.1727107085600268|
|                A33w| 25711|  1.1723459321041583|
|            SM-J200G| 25647|  1.1694277204572108|
|            SM-J320G| 25200|  

In [72]:
cluster_2 = new_data.where(col('clusters')==2)

In [73]:
#distributionbycountry
data = cluster_2.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+---------+------+--------------------+
|  country|   cnt|          percentage|
+---------+------+--------------------+
|       ID|211786|   63.51833245857389|
|     NULL| 76090|  22.820724300817275|
|       SA|  9480|    2.84321811501837|
|       OM|  6880|  2.0634325560470868|
|       AE|  2931|  0.8790582589787809|
|       US|  1925|  0.5773412311614305|
|       ZA|  1793|  0.5377521181675039|
|       IN|  1584|  0.4750693559271201|
|       TW|  1544| 0.46307265501986955|
|       HN|  1413|  0.4237834595486241|
|       MY|  1398|  0.4192846967084052|
|       HK|  1350| 0.40488865561970455|
|       GB|  1042| 0.31251405863387566|
|       CA|  1018|  0.3053160380895254|
|       EG|   865|  0.2594286571192922|
|       SG|   758|  0.2273374821923971|
|       VE|   672| 0.20154457524180852|
|       CO|   484| 0.14516008097773114|
|       MX|   473| 0.14186098822823723|
|       AU|   438| 0.13136387493439305|
|       BH|   351| 0.10527105046112319|
|       LB|   324|  0.0971732773487291|


In [74]:
#distributionbycountry
data = cluster_2.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|174284|   65.77400046797044|
|       Asia/Makassar| 40460|  15.269422660336485|
|         Asia/Riyadh|  9510|   3.589031376663371|
|         Asia/Muscat|  6790|  2.5625155675651197|
|   Asia/Kuala_Lumpur|  3313|   1.250311351302392|
|       Asia/Jayapura|  2001|  0.7551684316197061|
|      Asia/Hong_Kong|  1855|  0.7000686859842852|
| Africa/Johannesburg|  1770|  0.6679900669499649|
|          Asia/Dubai|  1744|  0.6581777834806434|
|         Asia/Taipei|  1615|  0.6094937616520867|
| America/Tegucigalpa|  1397|  0.5272215387170062|
|        Asia/Bangkok|  1302| 0.49136896450217754|
|       Asia/Calcutta|  1218|   0.459667740985908|
|      Asia/Singapore|   899|   0.339278570727694|
|     America/Toronto|   769| 0.29021715338108645|
|       Europe/London|   751| 0.28342403405617156|
|    Antarctica/Davis|   719|  

In [75]:
#distributionbycountry
data = cluster_2.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())


+--------------------+-----+--------------------+
|        Device_Brand|  cnt|          percentage|
+--------------------+-----+--------------------+
|             samsung|88952|   34.64969343793579|
|                OPPO|46058|  17.941087107253875|
|              Xiaomi|43559|  16.967645431952572|
|                asus|15965|   6.218886092911288|
|                vivo|13113|   5.107939451070824|
|              HUAWEI| 8473|  3.3005087294229463|
|              LENOVO| 7780|  3.0305627186251063|
|                Sony| 5215|  2.0314118994382944|
|               ADVAN| 4170|  1.6243504545844076|
|               Haier| 2549|  0.9929182994569917|
|                 LGE| 2458|   0.957470843493639|
|             Hisense| 1391|  0.5418396840112497|
|                 HTC| 1124|   0.437834511019874|
|             Coolpad| 1107|  0.4312124588069399|
|                LAVA|  795| 0.30967832407544466|
|            EVERCOSS|  747| 0.29098076488598384|
|            POLYTRON|  646|  0.2516379840914934|


In [76]:
#distributionbycountry
data = cluster_2.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Brand|  cnt|          percentage|
+--------------------+-----+--------------------+
|             samsung|88952|   34.64969343793579|
|                OPPO|46058|  17.941087107253875|
|              Xiaomi|43559|  16.967645431952572|
|                asus|15965|   6.218886092911288|
|                vivo|13113|   5.107939451070824|
|              HUAWEI| 8473|  3.3005087294229463|
|              LENOVO| 7780|  3.0305627186251063|
|                Sony| 5215|  2.0314118994382944|
|               ADVAN| 4170|  1.6243504545844076|
|               Haier| 2549|  0.9929182994569917|
|                 LGE| 2458|   0.957470843493639|
|             Hisense| 1391|  0.5418396840112497|
|                 HTC| 1124|   0.437834511019874|
|             Coolpad| 1107|  0.4312124588069399|
|                LAVA|  795| 0.30967832407544466|
|            EVERCOSS|  747| 0.29098076488598384|
|            POLYTRON|  646|  0.2516379840914934|


In [77]:
cluster_3 = new_data.where(col('clusters')==3)

In [78]:
#distributionbycountry
data = cluster_3.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+---------+------+--------------------+
|  country|   cnt|          percentage|
+---------+------+--------------------+
|       ID|796069|   79.67454368759076|
|     NULL| 96731|   9.681319440204735|
|       SA| 32028|   3.205521487743094|
|       OM| 10891|   1.090025431591421|
|       AE|  5208|  0.5212425349121405|
|       CA|  4611| 0.46149180654375566|
|       MY|  4131|  0.4134510199159086|
|       US|  3580| 0.35830420026602583|
|       VE|  3093| 0.30956281883318937|
|       NG|  3033| 0.30355772050470853|
|       TW|  2718| 0.27203095428018387|
|       ZA|  2539| 0.25411574426688255|
|       GB|  1947| 0.19486544075920456|
|       EG|  1903| 0.19046170198498527|
|       HK|  1892| 0.18936076729143042|
|       CO|  1843| 0.18445660365650438|
|       IN|  1721|  0.1722462370552599|
|       SG|  1232| 0.12330468567814074|
|       MX|  1056| 0.10568973058126349|
|       BH|   976| 0.09768293280995564|
|       JB|   970| 0.09708242297710756|
|       JT|   857| 0.08577282112513525|


In [79]:
#distributionbycountry
data = cluster_3.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|674283|   73.49261676497902|
|       Asia/Makassar|118617|  12.928508835031455|
|         Asia/Riyadh| 32566|  3.5494896913733642|
|         Asia/Muscat| 10802|   1.177350231720662|
|   Asia/Kuala_Lumpur|  9589|   1.045140841693152|
|       Asia/Jayapura|  7035|  0.7667708646690297|
|        Asia/Bangkok|  3994|  0.4353209429265251|
|     America/Toronto|  3466|  0.3777722554289775|
|      Asia/Hong_Kong|  3278| 0.35728143488060826|
|         Asia/Taipei|  2990| 0.32589124170012773|
|        Africa/Lagos|  2971|  0.3238203609000266|
|     America/Caracas|  2886|   0.314555894162732|
| Africa/Johannesburg|  2509|  0.2734652593396724|
|          Asia/Dubai|  2490| 0.27139437853957127|
|      America/Bogota|  1975|  0.2152626094841981|
|    America/New_York|  1724| 0.18790518417759874|
|      Asia/Singapore|  1628|  

In [80]:
#distributionbycountry
data = cluster_3.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|311294|  34.268572592319416|
|                OPPO|160130|  17.627794076365458|
|              Xiaomi|151356|  16.661914695699558|
|                asus| 56816|   6.254547856384061|
|                vivo| 45964|   5.059913363679898|
|              LENOVO| 31457|  3.4629208659228636|
|              HUAWEI| 23791|    2.61901485587217|
|               ADVAN| 17671|  1.9452991264813215|
|                Sony| 17020|  1.8736342670314126|
|               Haier| 13411|  1.4763401383759267|
|                 LGE|  7737|  0.8517219931857837|
|             Hisense|  5908|  0.6503778642550873|
|             Coolpad|  3872|  0.4262462915361709|
|            EVERCOSS|  3461|  0.3810016567682561|
|                LAVA|  3202| 0.35248983096560416|
|                 HTC|  2946| 0.32430825797147717|
|             Infinix|  2885| 0

In [81]:
#distributionbycountry
data = cluster_3.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|51361|   5.606618650376933|
|            Redmi 4A|28894|   3.154098231809955|
|               A1601|26771|   2.922349406928231|
|            SM-G532G|26513|  2.8941858662690296|
|            Redmi 4X|25038|    2.73317337606623|
|        Redmi Note 4|19823|  2.1638987073153158|
|            SM-G610F|17568|    1.91774062907307|
|            SM-J500G|16215|  1.7700457821277227|
|             CPH1701|14684|  1.6029202753477325|
|        Redmi Note 3|14650|  1.5992088009972951|
|            SM-J111F|12973|  1.4161457867124851|
|            SM-G530H|11903|  1.2993435056840137|
|            Redmi 3S|11197|  1.2222758324072842|
|             Redmi 3|11157|  1.2179093919950048|
|                A33w|11020|  1.2029543335829482|
|           vivo 1606|10769|  1.1755549199958957|
|            SM-J200G|10733|   1.171625123624844|


In [82]:
cluster_4 = new_data.where(col('clusters')==4)

In [83]:
#distributionbycountry
data = cluster_4.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------+------+--------------------+
|country|   cnt|          percentage|
+-------+------+--------------------+
|     ID|204935|   84.32845033330591|
|   NULL| 28198|   11.60316023372562|
|     SA|  3487|  1.4348613282857379|
|     AE|   908| 0.37363179985186407|
|     OM|   700|  0.2880421364496749|
|     MY|   370| 0.15225084355197102|
|     EG|   369| 0.15183935478561436|
|     US|   337| 0.13867171426220065|
|     IN|   273| 0.11233643321537322|
|     VE|   232|  0.0954653937947494|
|     GB|   211| 0.08682412970125916|
|     CA|   210| 0.08641264093490247|
|     ZA|   185| 0.07612542177598551|
|     TW|   153|  0.0629577812525718|
|     JB|   115|0.047321208131018026|
|     JT|   114|0.046909719364661345|
|     CO|   106| 0.04361780923380792|
|     SG|    96| 0.03950292157024113|
|     NG|    84| 0.03456505637396099|
|     HN|    82| 0.03374207884124763|
|     HK|    78| 0.03209612377582092|
|     BH|    76|0.031273146243107564|
|     JO|    74|0.030450168710394205|
|     JI|   

In [84]:
#distributionbycountry
data = cluster_4.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|153807|    69.6715890559884|
|       Asia/Makassar| 50056|  22.674397535785467|
|         Asia/Riyadh|  3753|  1.7000362384489944|
|       Asia/Jayapura|  3096|  1.4024279760826235|
|   Asia/Kuala_Lumpur|  1756|  0.7954339554267078|
|        Asia/Bangkok|   704| 0.31889835115057075|
|         Asia/Muscat|   704| 0.31889835115057075|
|       Asia/Shanghai|   437| 0.19795252763181734|
|    Antarctica/Davis|   369| 0.16714984598659177|
|      Asia/Hong_Kong|   340|  0.1540134082261279|
|      Europe/Andorra|   275|  0.1245696684181917|
|          Asia/Dubai|   270| 0.12230476535604276|
|     America/Caracas|   250|   0.113245153107447|
|         Asia/Taipei|   230| 0.10418554085885123|
|       Asia/Calcutta|   228| 0.10327957963399167|
|    America/New_York|   193| 0.08742525819894909|
| Africa/Johannesburg|   182| 0

In [85]:
#distributionbycountry
data = cluster_4.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Brand|  cnt|          percentage|
+--------------------+-----+--------------------+
|             samsung|69742|  32.077233360469876|
|                OPPO|37467|  17.232624563630594|
|              Xiaomi|30235|  13.906328333770276|
|                asus|15693|    7.21786044457936|
|                vivo|11228|   5.164222078107249|
|               ADVAN|10049|    4.62195116342178|
|              LENOVO| 7315|   3.364471366347927|
|               Haier| 5220|  2.4008941260883363|
|              HUAWEI| 4501|  2.0701962569968586|
|                Sony| 2520|  1.1590523367323005|
|             Hisense| 1607|  0.7391258353685741|
|            EVERCOSS| 1516|  0.6972711676532409|
|            POLYTRON| 1319|  0.6066627111705969|
|                LAVA| 1150|  0.5289326139849784|
|             Coolpad| 1119|  0.5146744304775572|
|               HIMAX|  925|  0.4254457982053086|
|                alps|  923| 0.42452591539837825|


In [86]:
#distributionbycountry
data = cluster_4.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|15045|    6.89056617599912|
|            SM-G532G| 9410|   4.309752589973527|
|            Redmi 4A| 9158|    4.19433732401462|
|            SM-J111F| 6529|   2.990262981927435|
|            Redmi 4X| 6005|  2.7502725082668475|
|          ASUS_X014D| 4139|   1.895649943666358|
|               A1601| 3921|  1.7958065786701596|
|           vivo 1606| 3576|  1.6377975836073682|
|     Andromax A16C3H| 3377|  1.5466561632668014|
|            SM-J200G| 3101|  1.4202489672165686|
|             CPH1701| 3093|  1.4165849905194603|
|            SM-J500G| 3048|  1.3959751215982266|
|                A33w| 2885|   1.321321596394647|
|           SM-G313HZ| 2867|  1.3130776488261533|
|        Redmi Note 4| 2759|  1.2636139634151926|
|           ASUS_Z007| 2703|  1.2379661265354354|
|            SM-J320G| 2663|  1.2196462430498942|


In [87]:
cluster_5 = new_data.where(col('clusters')==5)

In [88]:
#distributionbycountry
data = cluster_5.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------------+-------+--------------------+
|      country|    cnt|          percentage|
+-------------+-------+--------------------+
|           ID|1464566|   85.30670587084057|
|         NULL| 103330|   6.018671686789094|
|           SA|  43126|  2.5119639520416768|
|           OM|  10931|  0.6366989277875892|
|           CA|   8388|  0.4885765809424845|
|           NG|   7358| 0.42858207946766824|
|           MY|   6958|  0.4052832439434677|
|           AE|   6541|  0.3809942079094887|
|           US|   5477| 0.31901930541511536|
|           VE|   5160| 0.30055497826218647|
|           TW|   4149| 0.24166717147476968|
|           ZA|   3478| 0.20258337488292333|
|           GB|   2919| 0.17002325223785317|
|           IN|   2410| 0.14037548403330802|
|           EG|   2317|  0.1349585047739314|
|           HK|   2287| 0.13321109210961635|
|           CO|   2214| 0.12895905462644977|
|           JB|   1638| 0.09540873147160105|
|           SG|   1572| 0.09156442361010796|
|         

In [89]:
#distributionbycountry
data = cluster_5.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-------+--------------------+
|                city|    cnt|          percentage|
+--------------------+-------+--------------------+
|        Asia/Jakarta|1249360|   76.12659574532907|
|       Asia/Makassar| 206196|   12.56403241363888|
|         Asia/Riyadh|  44716|  2.7246565084108143|
|   Asia/Kuala_Lumpur|  16235|  0.9892387157628045|
|       Asia/Jayapura|  14157|  0.8626210347430875|
|         Asia/Muscat|  10991|  0.6697088219863865|
|        Asia/Bangkok|   7419| 0.45205802477636264|
|        Africa/Lagos|   7274| 0.44322281604303293|
|     America/Toronto|   6317| 0.38491043840305733|
|     America/Caracas|   4844| 0.29515690416723284|
|      Asia/Hong_Kong|   4809| 0.29302426757642913|
|         Asia/Taipei|   4571|  0.2785223387589639|
| Africa/Johannesburg|   3467| 0.21125288743761278|
|          Asia/Dubai|   3269|  0.1991882575810661|
|    America/New_York|   2653| 0.16165385358292086|
|       Asia/Shanghai|   2563| 0.15616993092085418|
|       Euro

In [90]:
#distributionbycountry
data = cluster_5.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|553058|  33.742924185831406|
|                OPPO|287376|  17.533254343717093|
|              Xiaomi|269792|  16.460427300470887|
|                asus|104752|   6.391081576099092|
|                vivo| 81648|   4.981470793162314|
|              LENOVO| 59905|   3.654896725754316|
|               ADVAN| 37493|  2.2875059333729504|
|              HUAWEI| 35974|   2.194829393410997|
|                Sony| 29404|  1.7939835293227597|
|               Haier| 28794|  1.7567664856250695|
|                 LGE| 13142|  0.8018137512705654|
|             Hisense| 11584|  0.7067577609738419|
|             Coolpad|  7083|  0.4321447877225244|
|            EVERCOSS|  6604| 0.40292025668778075|
|                LAVA|  5897|  0.3597850929266873|
|             Infinix|  5854| 0.35716159640373535|
|            POLYTRON|  5776| 0

In [91]:
#distributionbycountry
data = cluster_5.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|93891|   5.699136853095068|
|            Redmi 4A|51227|   3.109453340293543|
|            SM-G532G|47425|  2.8786738373010574|
|               A1601|46981|  2.8517232588348125|
|            Redmi 4X|43914|  2.6655578891141514|
|        Redmi Note 4|33965|   2.061658553166693|
|            SM-G610F|29426|   1.786143517900283|
|            SM-J500G|28498|  1.7298143809257889|
|        Redmi Note 3|25573|  1.5522683403542423|
|            SM-J111F|25417|  1.5427992181904264|
|             CPH1701|25126|   1.525135663384847|
|            Redmi 3S|20862|  1.2663129909072148|
|            SM-G530H|20824|   1.264006409867311|
|                A33w|20364|  1.2360846393842642|
|            SM-J200G|20308|  1.2326854673254586|
|             Redmi 3|20155|  1.2233984436647931|
|            SM-J320G|19356|  1.1748995423257622|


In [92]:
cluster_6 = new_data.where(col('clusters')==6)

In [93]:
#distributionbycountry
data = cluster_6.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------+------+--------------------+
|country|   cnt|          percentage|
+-------+------+--------------------+
|     ID|423822|   70.85179903741509|
|   NULL| 93706|  15.665158204623683|
|     SA| 25007|   4.180507237775857|
|     OM| 10439|  1.7451239674947883|
|     AE|  4888|  0.8171439748169869|
|     US|  2529| 0.42278173328808505|
|     MY|  2399|  0.4010491807663567|
|     CA|  2105| 0.35190017737106327|
|     VE|  2016| 0.33702173756772613|
|     ZA|  1957| 0.32715850219248016|
|     TW|  1745|  0.2917177242339693|
|     EG|  1733| 0.28971164246273284|
|     IN|  1595|  0.2666417020935135|
|     HK|  1543| 0.25794868108482216|
|     CO|  1458| 0.24373893520523052|
|     NG|  1226|  0.2049546876279922|
|     GB|  1211| 0.20244708541394663|
|     MX|   868|   0.145106581452771|
|     SG|   863|  0.1442707140480891|
|     BH|   830| 0.13875398917718884|
|     LB|   615| 0.10281169077586885|
|     KW|   575| 0.09612475153841396|
|     DE|   572| 0.09562323109560485|
|     JT|   

In [94]:
#distributionbycountry
data = cluster_6.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|357388|   69.36663457656594|
|       Asia/Makassar| 68316|   13.25968137635477|
|         Asia/Riyadh| 25175|   4.886300114903264|
|         Asia/Muscat| 10319|   2.002849290394708|
|   Asia/Kuala_Lumpur|  5496|  1.0667370578553461|
|       Asia/Jayapura|  3488|   0.676997608769914|
|      Asia/Hong_Kong|  2415| 0.46873544299866465|
|          Asia/Dubai|  2323|  0.4508788546939536|
|        Asia/Bangkok|  2315|  0.4493261078848483|
| Africa/Johannesburg|  1926|  0.3738237942921027|
|     America/Caracas|  1888| 0.36644824694885253|
|         Asia/Taipei|  1881|  0.3650895934908854|
|     America/Toronto|  1578| 0.30627930809602183|
|      America/Bogota|  1549| 0.30065060091301515|
|       Asia/Calcutta|  1223| 0.23737616844197387|
|        Africa/Lagos|  1205|  0.2338824881214869|
|    Antarctica/Davis|  1126|  

In [95]:
#distributionbycountry
data = cluster_6.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|176076|   34.97818779947078|
|                OPPO| 89551|  17.789657282255437|
|              Xiaomi| 83490|   16.58561586688598|
|                asus| 30299|   6.019015153321097|
|                vivo| 24954|   4.957209945409902|
|              HUAWEI| 16663|   3.310170286141108|
|              LENOVO| 16014|  3.1812438913919285|
|                Sony|  9501|  1.8874109037164175|
|               ADVAN|  8370|  1.6627333190302511|
|               Haier|  6561|  1.3033683758850032|
|                 LGE|  4512|  0.8963264916922931|
|             Hisense|  2848|  0.5657663670965538|
|                 HTC|  2194|  0.4358467027422187|
|             Coolpad|  2122|    0.42154362042798|
|            EVERCOSS|  1714|  0.3404928206472939|
|                LAVA|  1590|  0.3158597344394384|
|            POLYTRON|  1388| 0

In [96]:
#distributionbycountry
data = cluster_6.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|28700|   5.632553411129297|
|            Redmi 4A|16027|  3.1453983804937025|
|               A1601|15238|  2.9905522257417503|
|            SM-G532G|14391|  2.8243232104376905|
|            Redmi 4X|14000|  2.7475870298191696|
|        Redmi Note 4|11135|  2.1853129697883182|
|            SM-G610F|10741|    2.10798802052055|
|            SM-J500G| 8949|  1.7562968807036963|
|             CPH1701| 8754|  1.7180269185026438|
|        Redmi Note 3| 8206|  1.6104785119068648|
|             CPH1609| 6467|  1.2691889515600407|
|            SM-J111F| 6426|   1.261142446686999|
|            SM-G530H| 6386|  1.2532921980303726|
|             Redmi 3| 6331|  1.2424981061275115|
|            Redmi 3S| 6160|  1.2089382931204344|
|           vivo 1606| 5903|  1.1585004455016112|
|                A33w| 5800|  1.1382860552107987|


In [97]:
cluster_7 = new_data.where(col('clusters')==7)

In [98]:
#distributionbycountry
data = cluster_7.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------------+------+--------------------+
|      country|   cnt|          percentage|
+-------------+------+--------------------+
|           ID|618114|    80.4223345498546|
|         NULL| 96955|  12.614740074292369|
|           SA|  8672|  1.1283072139060741|
|           CA|  3999|  0.5203067975565487|
|           OM|  3923|   0.510418496327667|
|           US|  2981| 0.38785560478021297|
|           MY|  2643|  0.3438786861570288|
|           TW|  2638|  0.3432281400235498|
|           AE|  2099|  0.2730992668345076|
|           GB|  2059| 0.26789489776667513|
|           ZA|  2052| 0.26698413317980446|
|           HK|  1517| 0.19737569689754547|
|           VE|  1387| 0.18046149742709006|
|           IN|  1306| 0.16992265006472934|
|           NG|  1229|  0.1599042396091519|
|           AU|  1131|  0.1471535353929624|
|           SG|   988|  0.1285479159754614|
|           JB|   802| 0.10434759981004053|
|           JT|   710|  0.0923775509540259|
|           DE|   671| 0.0873032

In [99]:
#distributionbycountry
data = cluster_7.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|521111|   76.02398986951752|
|       Asia/Makassar| 94723|  13.818975980952825|
|         Asia/Riyadh|  8849|  1.2909654303120843|
|   Asia/Kuala_Lumpur|  7538|  1.0997058892182723|
|       Asia/Jayapura|  5533|  0.8071998786209473|
|         Asia/Muscat|  3922|  0.5721738521509769|
|        Asia/Bangkok|  3579|  0.5221341705375692|
|     America/Toronto|  2965|  0.4325587638010317|
|         Asia/Taipei|  2812|   0.410237856259191|
|      Asia/Hong_Kong|  2776|   0.404985878014052|
| Africa/Johannesburg|  2040|  0.2976121005578768|
|       Europe/London|  1792| 0.26143180598025256|
|    America/New_York|  1333|  0.1944690833547303|
|      Asia/Singapore|  1332|  0.1943231950701431|
|     America/Caracas|  1279| 0.18659111598702177|
|    Antarctica/Davis|  1230| 0.17944259004224924|
|       Asia/Shanghai|  1220| 0

In [100]:
#distributionbycountry
data = cluster_7.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|222247|  32.417983946135095|
|                OPPO|128978|  18.813332613734325|
|              Xiaomi|119344|   17.40807244222665|
|                asus| 45153|  6.5862271667101835|
|                vivo| 35698|    5.20707677003123|
|              LENOVO| 24830|  3.6218196033356334|
|               ADVAN| 14179|  2.0682150686949634|
|              HUAWEI| 12634|  1.8428541630504385|
|                Sony| 12250|  1.7868421321329642|
|               Haier| 10800|  1.5753383695539604|
|                 LGE|  5862|  0.8550586594745663|
|             Hisense|  4833|  0.7049639203753972|
|             Coolpad|  3377|  0.4925849698133078|
|            EVERCOSS|  2499| 0.36451579495512476|
|                LAVA|  2420|  0.3529924865111652|
|            POLYTRON|  2289| 0.33388421554713105|
|             Infinix|  2094| 0

In [101]:
#distributionbycountry
data = cluster_7.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|43296|   6.221234907082374|
|               A1601|22921|  3.2935357840270485|
|            Redmi 4A|22524|  3.2364905544882525|
|            SM-G532G|20411|  2.9328719902175333|
|            Redmi 4X|18288|  2.6278165184017563|
|        Redmi Note 4|15246|   2.190709243195165|
|            SM-J500G|14088|   2.024315349477469|
|            SM-G610F|13547|    1.94657865128984|
|        Redmi Note 3|12534|   1.801019916975482|
|             CPH1701|10692|  1.5363415471758302|
|            SM-J111F|10114|  1.4532882910714875|
|            Redmi 3S| 9821|   1.411186900001293|
|            SM-J200G| 9509|   1.366355384595489|
|            SM-G530H| 9273|  1.3324443665321242|
|             Redmi 3| 9246|  1.3285647161604681|
|                A33w| 9170|  1.3176442188180286|
|            SM-J320G| 8838|  1.2699388883221088|


In [104]:
cluster_8 = new_data.where(col('clusters')==8)

In [105]:
#distributionbycountry
data = cluster_8.groupby('country').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+-------+------+--------------------+
|country|   cnt|          percentage|
+-------+------+--------------------+
|     ID|397784|    72.8929284784959|
|   NULL| 98710|  18.088361950486522|
|     SA|  7507|  1.3756390756995474|
|     OM|  4785|  0.8768393469058657|
|     US|  2863|  0.5246376280441993|
|     CA|  2711| 0.49678400615711643|
|     ZA|  2380| 0.43612907954774516|
|     TW|  2261| 0.41432262557035787|
|     AE|  2204|  0.4038775173627018|
|     MY|  1963|  0.3597148668706822|
|     GB|  1932| 0.35403419398581665|
|     HK|  1622| 0.29722746513716075|
|     IN|  1573| 0.28824833702882485|
|     VE|  1100| 0.20157226365652087|
|     AU|  1074| 0.19680782833373037|
|     SG|  1001|   0.183430759927434|
|     NG|   649| 0.11892763555734732|
|     DE|   571| 0.10463432958897584|
|     EG|   571| 0.10463432958897584|
|     JB|   545| 0.09986989426618534|
|     HN|   537| 0.09840391416686518|
|     JT|   521| 0.09547195396822487|
|     MX|   477| 0.08740906342196404|
|  31030|   

In [106]:
#distributionbycountry
data = cluster_8.groupby('city').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|                city|   cnt|          percentage|
+--------------------+------+--------------------+
|        Asia/Jakarta|333182|    72.4910686700695|
|       Asia/Makassar| 66662|  14.503783576796383|
|         Asia/Riyadh|  7560|  1.6448441967024792|
|   Asia/Kuala_Lumpur|  5480|  1.1922944706256065|
|         Asia/Muscat|  4699|  1.0223707513630884|
|       Asia/Jayapura|  3346|  0.7279958574294305|
|      Asia/Hong_Kong|  2556|  0.5561139903136952|
|         Asia/Taipei|  2384|   0.518691609118877|
|        Asia/Bangkok|  2381|  0.5180388931678046|
| Africa/Johannesburg|  2351|  0.5115117336570805|
|     America/Toronto|  2009| 0.43710211523482545|
|       Europe/London|  1641| 0.35703562523660953|
|          Asia/Dubai|  1351|  0.2939397499662763|
|    America/New_York|  1254| 0.27283526754826837|
|      Asia/Singapore|  1237| 0.26913654382552465|
|       Asia/Calcutta|  1193|  0.2595633765431293|
|    Antarctica/Davis|  1070| 0

In [107]:
#distributionbycountry
data = cluster_8.groupby('Device_Brand').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+------+--------------------+
|        Device_Brand|   cnt|          percentage|
+--------------------+------+--------------------+
|             samsung|148602|   32.70643181152897|
|                OPPO| 86282|  18.990163992155846|
|              Xiaomi| 80199|   17.65133123950426|
|                asus| 29294|  6.4474382140679785|
|                vivo| 23998|   5.281819562408799|
|              LENOVO| 15304|   3.368320967709986|
|              HUAWEI|  9655|  2.1250090788839464|
|                Sony|  8551|  1.8820251303507642|
|               ADVAN|  8083|  1.7790210652116976|
|               Haier|  5963|  1.3124214538979777|
|                 LGE|  4338|  0.9547684499428855|
|             Hisense|  2802|  0.6167038258967186|
|             Coolpad|  2303|  0.5068768419129703|
|                LAVA|  1495|  0.3290407636386846|
|            EVERCOSS|  1486|  0.3270599162321641|
|            POLYTRON|  1291| 0.28414155575755307|
|                 HTC|  1228|  

In [108]:
#distributionbycountry
data = cluster_8.groupby('Device_Model').agg(countDistinct('user_id_n').alias('cnt'))
total = data.select("cnt").agg({"cnt": "sum"}).collect().pop()['sum(cnt)']
data = data.withColumn("percentage", (data['cnt']/total) * 100)
data.sort(col('percentage').desc()).show(data.count())

+--------------------+-----+--------------------+
|        Device_Model|  cnt|          percentage|
+--------------------+-----+--------------------+
|                A37f|28000|  6.0433524202547275|
|               A1601|16502|   3.561692915680125|
|            Redmi 4A|14999|   3.237294391121452|
|            SM-G532G|13479|   2.909226688307624|
|            Redmi 4X|12376|  2.6711617697525893|
|        Redmi Note 4|10649|   2.298416425831878|
|            SM-G610F|10138|  2.1881252441622294|
|            SM-J500G| 9289|   2.004882165419506|
|        Redmi Note 3| 8654|   1.867827565888729|
|             CPH1701| 7738|  1.6701236081403958|
|            Redmi 3S| 6559|  1.4156553044446698|
|             Redmi 3| 6182|    1.33428588078624|
|            SM-J111F| 5954|  1.2850757253641658|
|             CPH1609| 5942|   1.282485717184057|
|            SM-G530H| 5852|   1.263060655833238|
|            SM-J200G| 5797|  1.2511897850077376|
|                A33w| 5618|  1.2125554963211094|
