# Step1. Install PySpark, Java, Hadoop (required to run PySpark)  

In [1]:
import os
import pandas as pd

In [2]:
os.getcwd()

'/Users/ZiweiMengyang/Desktop/Python & Machine Learning/Tiger/Capstone/Codes'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql.types import *
from pyspark.sql.functions import * 

In [None]:
# PySpark is the Python API for Spark
# SparkContext is a public class in PySpark. It is the main entry point for Spark functionality in Python
# .getOrCreate() is a classmethod that instantiate a SparkContext

from pyspark import SparkContext 
sc = SparkContext.getOrCreate()
# i.e. sc is created as a SparkContext class

####  Next, create pyspark.sql.SparkSession, which is the main entry point for DataFrame and SQL functionality

In [5]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Step 2. Parse log file into RDD then into DataFrame

In [108]:
# textFile() reads a text from HDFS, a local file system, or Hadoop-supported file system,
# and return it as an RDD of strings

rdd = sc.textFile(name="../Data/all_play_log.log.fn")

In [37]:
# takes 4 minutes to run.... cautious
rdd.count()

164651375

In [110]:
# read a sample log, get a sense how to create a schema for parsing log file into DataFrame
# schema_orig = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag']
# df = pd.read_csv('../Data/Play/20170331_1_play.log',delimiter='\t',header=None,index_col=None,names=schema)
# df.info()


In [111]:
def parseLine(line):
    fields = line.split("\t")
    if len(fields) == 10:
        try: 
            uid = float(fields[0])
            device = str(fields[1])
            song_id = float(fields[2])
            song_type = float(fields[3])
            song_name = str(fields[4])
            singer = str(fields[5])
            play_time = float(fields[6])
            song_length = float(fields[7])
            paid_flag = float(fields[8])
            file_name = str(fields[9])
            return Row(uid, device, song_id, song_type, song_name, singer, play_time, song_length, paid_flag, file_name)
        except:
            return Row(None)
    else:
        return Row(None)
    
    
    
    
# Create Row entries that specify column name, to prepare the RDD to convert it to a DataFrame
# Always important to filter on field length after splitting, to avoid "index out of range error"


In [112]:
# Provide schema in order to create DataFrame
# Spark SQL StructType is the data type representing rows. 
# A StructType object comprises a list of StructField, which represents a field in a StructType
# StructField(name of this field, dataType, nullable)


schema = StructType([StructField('uid', FloatType(), False),
                     StructField('device', StringType(), True),
                     StructField('song_id', FloatType(), False),
                     StructField('song_type', FloatType(), True),
                     StructField('song_name', StringType(), True),
                     StructField('singer', StringType(), True),
                     StructField('play_time', FloatType(), False),
                     StructField('song_length', FloatType(), True),
                     StructField('paid_flag', FloatType(), True),
                     StructField('file_name', StringType(), True),])

In [9]:
len(schema)

10

In [113]:
songs = rdd.map(parseLine).filter(lambda x: len(x) == len(schema))

In [169]:
song_df = spark.createDataFrame(songs, schema).cache()

In [12]:
pd.DataFrame(song_df.take(5), columns=song_df.columns)

Unnamed: 0,uid,device,song_id,song_type,song_name,singer,play_time,song_length,paid_flag,file_name
0,154422688.0,ar,20870992.0,1.0,用情,狮子合唱团,22013.0,332.0,0.0,20170301_play.log
1,154421904.0,ip,6560858.0,0.0,表情不要悲伤,伯贤&D.O.&张艺兴&朴灿烈,96.0,161.0,0.0,20170301_play.log
2,154422624.0,ar,3385963.0,1.0,"Baby, Don't Cry(人鱼的眼泪)",EXO,235868.0,235.0,0.0,20170301_play.log
3,154410272.0,ar,6777172.0,0.0,3D-环绕音律1(3D Mix),McTaiM,164.0,237.0,0.0,20170301_play.log
4,154407792.0,ar,19472464.0,0.0,刚好遇见你,曲肖冰,24.0,201.0,0.0,20170301_play.log


# Step 3. Sanity Check 

In [15]:
song_df.groupBy('uid').count().orderBy('count', ascending = False).show(truncate=False)
# takes hours to run... DO NOT rerun!

+------------+-------+
|uid         |count  |
+------------+-------+
|1685126.0   |8123179|
|3.7025504E7 |5903384|
|751824.0    |4554030|
|1791497.0   |3375423|
|497685.0    |3031075|
|1062806.0   |2354473|
|736305.0    |1848836|
|0.0         |1201066|
|1749320.0   |835075 |
|4.6532272E7 |500025 |
|1679121.0   |488562 |
|2.8638488E7 |469612 |
|637650.0    |243074 |
|1.5594824E8 |217988 |
|533817.0    |173401 |
|3.2166204E7 |156591 |
|6.4268008E7 |150167 |
|2.6036032E7 |114145 |
|3.2104144E7 |99175  |
|1.67982848E8|82687  |
+------------+-------+
only showing top 20 rows



In [115]:
song_df.select('play_time', 'song_length', 'paid_flag').describe().show()

+-------+--------------+------------------+---------+
|summary|     play_time|       song_length|paid_flag|
+-------+--------------+------------------+---------+
|  count|     163014069|         163014069|163014069|
|   mean|           NaN|-260.0733923612918|      0.0|
| stddev|           NaN|1070712.7989508465|      0.0|
|    min|-2.61648826E11|     -2.14748365E9|      0.0|
|    max|           NaN|      1.34396621E9|      0.0|
+-------+--------------+------------------+---------+



### Data cleaning: play_time>0, song_length < 0.  song_length if NaN, impute using play_time

In [170]:
song_df = song_df.filter(song_df.play_time >= 0) \
                 .filter(song_df.song_length > 0) \
                 .dropna(how='any', subset=['play_time']) \
                 .cache()

In [183]:
song_df.approxQuantile("play_time", [0.95], 0)

Py4JJavaError: An error occurred while calling o1373.approxQuantile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 513.0 failed 1 times, most recent failure: Lost task 3.0 in stage 513.0 (TID 19424, localhost, executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$aggregate$1.apply(RDD.scala:1115)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1108)
	at org.apache.spark.sql.execution.stat.StatFunctions$.multipleApproxQuantiles(StatFunctions.scala:98)
	at org.apache.spark.sql.DataFrameStatFunctions.approxQuantile(DataFrameStatFunctions.scala:100)
	at org.apache.spark.sql.DataFrameStatFunctions.approxQuantile(DataFrameStatFunctions.scala:115)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [171]:
pd.DataFrame(song_df.take(5), columns=song_df.columns)

Unnamed: 0,uid,device,song_id,song_type,song_name,singer,play_time,song_length,paid_flag,file_name
0,154422688.0,ar,20870992.0,1.0,用情,狮子合唱团,22013.0,332.0,0.0,20170301_play.log
1,154421904.0,ip,6560858.0,0.0,表情不要悲伤,伯贤&D.O.&张艺兴&朴灿烈,96.0,161.0,0.0,20170301_play.log
2,154422624.0,ar,3385963.0,1.0,"Baby, Don't Cry(人鱼的眼泪)",EXO,235868.0,235.0,0.0,20170301_play.log
3,154410272.0,ar,6777172.0,0.0,3D-环绕音律1(3D Mix),McTaiM,164.0,237.0,0.0,20170301_play.log
4,154407792.0,ar,19472464.0,0.0,刚好遇见你,曲肖冰,24.0,201.0,0.0,20170301_play.log


## **  Finding: Obviously some uid are testing accounts (i.e. robots) and should be excluded from the sample

In [14]:
uid_count = song_df.groupBy('uid').count().orderBy('count', ascending = False).cache()

In [15]:
# calculate the 95 percentile to be 2965
count_ceiling = uid_count.approxQuantile("count", [0.95], 0)

In [17]:
print("95 percentile of play counts is {:0}".format(count_ceiling[0]))

95 percentile of play counts is 2965.0


In [18]:
uid_count.printSchema()
valid_uid = uid_count.filter(uid_count['count'] <= count_ceiling[0])
# .toPandas() removed,  for join df purpose below 

root
 |-- uid: float (nullable = false)
 |-- count: long (nullable = false)



In [55]:
print("number of valid users = {0:0}, \n number of valid plays = {1:.2e}"
      .format(valid_uid.shape[0], valid_uid["count"].sum()))


number of valid users = 135543, 
 number of valid plays = 9.84e+07


## ** Save valid_uid to a local csv file !!

In [26]:
valid_uid.repartition(1).write.csv('../Data/valid_uid', header=True)

In [142]:
valid_uid.show()

+------------+-----+
|         uid|count|
+------------+-----+
| 9.4891472E7| 2965|
| 1.6849104E8| 2965|
|1.68749728E8| 2965|
|1.68148144E8| 2965|
| 1.4707248E8| 2965|
|1.67592272E8| 2965|
|1.68414112E8| 2965|
|1.68193264E8| 2964|
|1.68230784E8| 2964|
|1.66444672E8| 2963|
| 7.2036656E7| 2963|
|1.68756624E8| 2963|
|1.67647824E8| 2963|
|1.68999216E8| 2963|
|1.68460208E8| 2962|
|1.67875504E8| 2962|
|1.67628128E8| 2962|
|1.68355408E8| 2962|
|1.67890272E8| 2962|
|1.68555648E8| 2962|
+------------+-----+
only showing top 20 rows



#### remain only valid (non-robot) uid

In [172]:
song_df_valid = song_df.join(valid_uid, on='uid', how='inner') \
                        .select('uid', 'device', 'song_id', 'song_type', 'play_time', 'song_length', 'paid_flag', 'file_name') \
                        .cache()
    

In [173]:
song_df_valid.show()

+------------+------+-----------+---------+---------+-----------+---------+------------------+
|         uid|device|    song_id|song_type|play_time|song_length|paid_flag|         file_name|
+------------+------+-----------+---------+---------+-----------+---------+------------------+
|1.54422688E8|   ar |2.0870992E7|      1.0|  22013.0|      332.0|      0.0| 20170301_play.log|
|1.54421904E8|   ip |  6560858.0|      0.0|     96.0|      161.0|      0.0| 20170301_play.log|
|1.54422624E8|   ar |  3385963.0|      1.0| 235868.0|      235.0|      0.0| 20170301_play.log|
|1.54410272E8|   ar |  6777172.0|      0.0|    164.0|      237.0|      0.0| 20170301_play.log|
|1.54407792E8|   ar |1.9472464E7|      0.0|     24.0|      201.0|      0.0| 20170301_play.log|
|1.54422688E8|   ar |   891952.0|      0.0|    300.0|      300.0|      0.0| 20170301_play.log|
|1.54408096E8|   ar |  4623962.0|      0.0|    243.0|      243.0|      0.0| 20170301_play.log|
|1.54422576E8|   ar |   703750.0|      0.0|    207

# Step 4. create churn label 

In [174]:
song_df_valid_2 = song_df_valid.withColumn("device", trim(song_df_valid.device)) \
                               .withColumn('date_str', trim(song_df_valid.file_name.substr(1,9))) \
                               .withColumn('date_string', regexp_replace('date_str', '20170339', '20170329')) \
                               .withColumn("unix_date", unix_timestamp('date_string', 'yyyyMMdd')) \
                               .withColumn("date", from_unixtime('unix_date').cast(DateType())) \
                               .drop('date_str') \
                               .drop('date_string') \
                               .drop('unix_date')
                            
                                
                              

In [175]:
song_df_valid_2.show()

+------------+------+-----------+---------+---------+-----------+---------+------------------+----------+
|         uid|device|    song_id|song_type|play_time|song_length|paid_flag|         file_name|      date|
+------------+------+-----------+---------+---------+-----------+---------+------------------+----------+
|1.54422688E8|    ar|2.0870992E7|      1.0|  22013.0|      332.0|      0.0| 20170301_play.log|2017-03-01|
|1.54421904E8|    ip|  6560858.0|      0.0|     96.0|      161.0|      0.0| 20170301_play.log|2017-03-01|
|1.54422624E8|    ar|  3385963.0|      1.0| 235868.0|      235.0|      0.0| 20170301_play.log|2017-03-01|
|1.54410272E8|    ar|  6777172.0|      0.0|    164.0|      237.0|      0.0| 20170301_play.log|2017-03-01|
|1.54407792E8|    ar|1.9472464E7|      0.0|     24.0|      201.0|      0.0| 20170301_play.log|2017-03-01|
|1.54422688E8|    ar|   891952.0|      0.0|    300.0|      300.0|      0.0| 20170301_play.log|2017-03-01|
|1.54408096E8|    ar|  4623962.0|      0.0|   

In [181]:
# use pandas DataFrame to perform quick sanity check 
pd.DataFrame(song_df_valid_2.take(5), columns=song_df_valid_2.columns)

Unnamed: 0,uid,device,song_id,song_type,play_time,song_length,paid_flag,file_name,date
0,154422688.0,ar,20870992.0,1.0,22013.0,332.0,0.0,20170301_play.log,2017-03-01
1,154421904.0,ip,6560858.0,0.0,96.0,161.0,0.0,20170301_play.log,2017-03-01
2,154422624.0,ar,3385963.0,1.0,235868.0,235.0,0.0,20170301_play.log,2017-03-01
3,154410272.0,ar,6777172.0,0.0,164.0,237.0,0.0,20170301_play.log,2017-03-01
4,154407792.0,ar,19472464.0,0.0,24.0,201.0,0.0,20170301_play.log,2017-03-01


In [182]:
# play log dated from 20170301 to 20170512, 
# use last 2 week as churn window 
active_uid = song_df_valid_2.filter(song_df_valid_2.date >= '2017-04-29') \
                            .select(song_df_valid_2.uid.alias('active_uid')) \
                            .distinct()

active_uid.repartition(1).write.csv('../Data/active_uid', header=True)

In [62]:
active_uid.schema

StructType(List(StructField(active_uid,FloatType,false)))

In [None]:
# in case session is terminated, to save time, just read from csv
# active_uid = spark.read.csv('../Data/active_uid.csv')
# valid_uid = spark.read.csv('../Data/valid_uid.csv')

In [None]:
# song_df.describe().show()
# takes forever to run... 

In [184]:
uid_label = valid_uid.join(active_uid, valid_uid.uid == active_uid.active_uid, 'left_outer') 
uid_label = uid_label.withColumn('churn', uid_label.active_uid.isNull().astype(IntegerType())).cache()

In [64]:
# sanity check
pd.DataFrame(uid_label.take(5), columns=uid_label.columns)

Unnamed: 0,uid,count,active_uid,churn
0,13586118.0,445,,1
1,16844004.0,2,,1
2,22030996.0,347,22030996.0,0
3,23232528.0,181,,1
4,23885908.0,45,,1


In [65]:
# write output to csv, share with team members
uid_label.select('uid', 'churn').repartition(1).write.csv('../Data/uid_label', header=True)

# Step 5. Downsample such that churn categories (0, 1) weigh equally

In [185]:
uid_label.groupBy('churn').count().show()

KeyboardInterrupt: 

#### find out churn weight of original data 

In [71]:
51541/83885

0.6144245097454849

In [72]:
sampled_uid = uid_label.drop('active_uid', 'count').sampleBy('churn', fractions={1:0.05, 0:0.03}, seed=0)
sampled_uid.groupBy('churn').count().show()

+-----+-----+
|churn|count|
+-----+-----+
|    1| 2610|
|    0| 2520|
+-----+-----+



In [78]:
sampled_uid.cache()

DataFrame[uid: float, churn: int]

# Step 6. Read logs (Play, Search, Download) that are in the sample selected 

In [97]:
play_sample = song_df_valid_2.filter(song_df_valid_2.date < 20170429) \
                             .join(sampled_uid, sampled_uid.uid == song_df_valid_2.uid, 'inner') \
                             .drop(sampled_uid.uid) \
                             .cache()

In [103]:
play_sample.select('play_time', 'song_length').describe().show()

+-------+-----------+-----------------+
|summary|  play_time|      song_length|
+-------+-----------+-----------------+
|  count|    2720028|          2720028|
|   mean|        NaN|247.7157280182452|
| stddev|        NaN|299.5685704649973|
|    min|-0.07745105|             -1.0|
|    max|        NaN|          32757.0|
+-------+-----------+-----------------+



### Finding: delete play_time = NaN, play_time < 0,  
### If song_length < play_time, then impute it to play_time

In [101]:
play_sample.groupBy('date').count().orderBy('count').show(90)

+--------+------+
|    date| count|
+--------+------+
|20170424| 24675|
|20170414| 37302|
|20170428| 42661|
|20170411| 42966|
|20170308| 47115|
|20170426| 47236|
|20170309| 47630|
|20170423| 48384|
|20170425| 49475|
|20170420| 49599|
|20170422| 50720|
|20170307| 51115|
|20170427| 51433|
|20170421| 51708|
|20170418| 52080|
|20170419| 52158|
|20170417| 53383|
|20170306| 54877|
|20170339| 55152|
|20170413| 59191|
|20170415| 60249|
|20170403| 61104|
|20170416| 62363|
|20170412| 62977|
|20170305| 63909|
|20170410| 66176|
|20170408| 69513|
|20170409| 72411|
|20170407| 72465|
|20170304| 72795|
|20170405| 72981|
|20170406| 73693|
|20170303| 76124|
|20170404| 94512|
|20170302|102326|
|20170402|111841|
|20170401|124312|
|20170330|130265|
|20170301|147427|
|20170331|153725|
+--------+------+



In [102]:
play_sample.groupBy('date').count().orderBy('date').show(90)

+--------+------+
|    date| count|
+--------+------+
|20170301|147427|
|20170302|102326|
|20170303| 76124|
|20170304| 72795|
|20170305| 63909|
|20170306| 54877|
|20170307| 51115|
|20170308| 47115|
|20170309| 47630|
|20170330|130265|
|20170331|153725|
|20170339| 55152|
|20170401|124312|
|20170402|111841|
|20170403| 61104|
|20170404| 94512|
|20170405| 72981|
|20170406| 73693|
|20170407| 72465|
|20170408| 69513|
|20170409| 72411|
|20170410| 66176|
|20170411| 42966|
|20170412| 62977|
|20170413| 59191|
|20170414| 37302|
|20170415| 60249|
|20170416| 62363|
|20170417| 53383|
|20170418| 52080|
|20170419| 52158|
|20170420| 49599|
|20170421| 51708|
|20170422| 50720|
|20170423| 48384|
|20170424| 24675|
|20170425| 49475|
|20170426| 47236|
|20170427| 51433|
|20170428| 42661|
+--------+------+



In [84]:
pd.DataFrame(play_sample.take(5), columns = play_sample.columns)

Unnamed: 0,uid,device,song_id,song_type,play_time,song_length,paid_flag,date,churn
0,71357496.0,ar,23498554.0,0.0,0.0,230.0,0.0,20170404,1
1,71357496.0,ar,20274308.0,0.0,0.0,226.0,0.0,20170404,1
2,71357496.0,ar,10104502.0,0.0,0.0,291.0,0.0,20170404,1
3,71357496.0,ar,9871706.0,0.0,0.0,319.0,0.0,20170404,1
4,71357496.0,ar,23498554.0,0.0,0.0,230.0,0.0,20170404,1


In [90]:
play_sample.groupBy(play_sample.uid) \
           .agg({"play_time": "avg", "play_time": "min", "play_time": "max", \
                 "song_length": "avg", "song_length": "min", "song_length": "max"}) \
           .orderBy('max(play_time)') \
           .show()

+------------+----------------+--------------+
|         uid|max(song_length)|max(play_time)|
+------------+----------------+--------------+
| 9.0333984E7|             0.0|           0.0|
| 1.1485704E8|             0.0|           0.0|
| 5.2824928E7|             0.0|           0.0|
|  2.347707E7|           466.0|           0.0|
| 1.6914496E8|             0.0|           0.0|
|1.69146096E8|             0.0|           0.0|
|1.65142592E8|             0.0|           0.0|
| 1.5493048E8|           299.0|           0.0|
|    400904.0|           372.0|           0.0|
| 7.8326016E7|             0.0|           0.0|
| 1.5685424E8|             0.0|           0.0|
|1.55374192E8|          5038.0|           0.0|
|1.66336304E8|           327.0|           0.0|
|1.11451968E8|           294.0|           0.0|
|1.54030576E8|           280.0|           0.0|
| 7.1357496E7|           319.0|           0.0|
| 1.5377896E8|             0.0|           0.0|
| 9.4849952E7|             0.0|           0.0|
| 1.6782832E8

In [91]:
play_sample.groupBy(play_sample.churn) \
           .agg({"play_time": "avg", "play_time": "sum", "play_time": "min", "play_time": "max", \
                 "song_length": "avg", "song_length": "sum", "song_length": "min", "song_length": "max", \
                 "song_type": "avg", \
                 "paid_flag": "avg"}) \
           .show()

+-----+----------------+-------------------+--------------+--------------+
|churn|max(song_length)|     avg(song_type)|max(play_time)|avg(paid_flag)|
+-----+----------------+-------------------+--------------+--------------+
|    1|         14724.0|0.09861014618311559|           NaN|           0.0|
|    0|         32757.0|0.10539881703994151|  3.39338957E9|           0.0|
+-----+----------------+-------------------+--------------+--------------+



In [98]:
play_sample.crosstab('churn', 'device').show()

+------------+-------+------+
|churn_device|     ar|    ip|
+------------+-------+------+
|           1| 719224|119929|
|           0|1554978|325897|
+------------+-------+------+



#### churn - 14% use iphone
#### not churn - 17% use iphone

###  *** Finding: device field is not clean, needs trimming 

In [None]:
uid_set = set(sampled_uid.select('uid'))
with open("../Data/all_play_log.log.fn") as infile:
    for line in infile:
        if line.splint('\t')

### Convert RDD to DataFrame, from YouTube Video:
https://www.youtube.com/watch?v=dzYEWULDIAQ&list=PLE50-dh6JzC5zo2whIGqJ02CIhP3ysQLX&index=5

In [46]:
# map RDD to a DataFrame
# important to filter away lines that do not contain 10 elements
# otherwise, get "Index out of range error"

df = rdd.map(lambda line: line.split("\t")).filter(lambda line: len(line) == 10)
df = (df.map(lambda line: Row(uid = line[0], 
                             device = line[1], 
                             song_id = line[2], 
                             song_type = line[3], 
                             singer = line[5], 
                             play_time = line[6], 
                             song_length = line[7], 
                             paid_flag = line[8], 
                             file_name = line[9]))
      .toDF())

KeyboardInterrupt: 

In [None]:
df.columns

In [19]:
df.show(10)

+------+------------------+---------+---------+--------------------+---------+-----------+--------------------+---------+----------+
|device|         file_name|paid_flag|play_time|              singer|  song_id|song_length|           song_name|song_type|       uid|
+------+------------------+---------+---------+--------------------+---------+-----------+--------------------+---------+----------+
|   ar | 20170301_play.log|       0 |   22013 |              狮子合唱团 |20870993 |       332 |                 用情 |       1 |154422682 |
|   ip | 20170301_play.log|       0 |      96 |    伯贤&D.O.&张艺兴&朴灿烈 | 6560858 |       161 |             表情不要悲伤 |       0 |154421907 |
|   ar | 20170301_play.log|       0 |  235868 |                EXO | 3385963 |       235 |Baby, Don't Cry(人...|       1 |154422630 |
|   ar | 20170301_play.log|       0 |     164 |             McTaiM | 6777172 |       237 |   3D-环绕音律1(3D Mix) |       0 |154410267 |
|   ar | 20170301_play.log|       0 |      24 |                曲肖冰 |1

In [None]:
df.describe().show()