In [2]:
from os.path import abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler

import datetime
import os
from time import gmtime, strftime

pyspark_submit_args = "--driver-memory 4g "
pyspark_submit_args += "--executor-memory 4g "
pyspark_submit_args += "pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS'] = pyspark_submit_args

warehouse_location = abspath('spark-warehouse')
spark = SparkSession.builder.master("local[*]").appName("SparkSQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .getOrCreate()
sc = spark.sparkContext

In [3]:
log_df = (
    spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .csv("Data/log.csv")
        .withColumn("enrollment_id", col("enrollment_id").cast("int"))
        .withColumn("date", to_timestamp(substring(col("time"), 0, 10), "yyyy-MM-dd"))
        .withColumn("time", to_timestamp(substring(col("time"), 12, 8), "HH:mm:ss"))
        .withColumnRenamed("object", "object_id")
)

In [4]:
truth_df = (
    spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .schema(StructType([
            StructField("eid", IntegerType()),
            StructField("result", IntegerType()),
        ]))
        .csv("Data/truth_train.csv")
        .withColumn("result", col("result").cast("boolean"))
        .toDF("enrollment_id", "dropout")
)

In [5]:
date_df = (
    spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .csv("Data/date.csv")
        .withColumn("from", to_timestamp(col("from"), "yyyy-MM-dd"))
        .withColumn("to", to_timestamp(col("to"), "yyyy-MM-dd"))
        .withColumnRenamed("from", "fromdate")
        .withColumnRenamed("to", "todate")
)

+-----------+-----------+
|max(period)|min(period)|
+-----------+-----------+
|         29|         29|
+-----------+-----------+



In [6]:
enrollment_df = (
    spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .csv("Data/enrollment.csv")
        .withColumn("enrollment_id", col("enrollment_id").cast("int"))
)

In [47]:
(
    spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .csv("Data/object.csv")
#         .groupBy("course_id").count()
        .orderBy("course_id")
        .show(100)
)

+--------------------+--------------------+-----------+--------------------+-------------------+
|           course_id|           module_id|   category|            children|              start|
+--------------------+--------------------+-----------+--------------------+-------------------+
|1pvLqtotBsKv7QSOs...|HE4yR5LZM1Wq9Lyz5...|      about|                null|               null|
|1pvLqtotBsKv7QSOs...|96Guczn93ROOfoIch...|   vertical|P6VxrSKNtK4nXYxhQ...|               null|
|1pvLqtotBsKv7QSOs...|pqdF2tHTCvAs8kkPp...|      about|                null|               null|
|1pvLqtotBsKv7QSOs...|RQMvncYmORdDO0dFS...|      about|                null|               null|
|1pvLqtotBsKv7QSOs...|Q5onrB70aCDgNxVLL...|    chapter|WEqmLJxWJOF48GDUy...|2013-12-13T04:00:00|
|1pvLqtotBsKv7QSOs...|1Wriu8gSdyrWpIjiU...|    chapter|x5e99zSxOsvLn4m7M...|2013-12-27T06:00:00|
|1pvLqtotBsKv7QSOs...|PlPPm19PUDYQgE8Vp...|    chapter|LkIQNkyK0m7rBrJvx...|2013-12-20T04:00:00|
|1pvLqtotBsKv7QSOs...|xnE1ULAW

In [8]:
def agg_count_by_cols(input_col, output_cols):
    return [
        sum(when(col(input_col) == output_col, col("count")).otherwise(lit(0))).alias("c_" + output_col) 
        for output_col in output_cols
    ]

In [39]:
event_cols = ["access", "discussion", "navigate", "page_close", "problem", "video", "wiki"]  
result1 = (
    log_df
        .groupBy("enrollment_id", "event").count()
        .groupBy("enrollment_id")
        .agg(*agg_count_by_cols("event", event_cols))
)
result1.show(5)

+-------------+--------+------------+----------+------------+---------+-------+------+
|enrollment_id|c_access|c_discussion|c_navigate|c_page_close|c_problem|c_video|c_wiki|
+-------------+--------+------------+----------+------------+---------+-------+------+
|        35071|     176|           2|        20|          59|        0|     58|     0|
|        87120|      12|           0|         5|           6|        0|      3|     0|
|       126365|      80|          16|        14|          39|       32|     34|     0|
|       128367|       2|           0|         2|           1|        0|      1|     0|
|       140266|      16|           0|         5|          10|        2|      5|     0|
+-------------+--------+------------+----------+------------+---------+-------+------+
only showing top 5 rows



In [40]:
source_cols = ["browser", "server"]   
result2 = (
    log_df
        .groupBy("enrollment_id", "source").count()
        .groupBy("enrollment_id")
        .agg(*agg_count_by_cols("source", source_cols))
)
result2.show(5)

+-------------+---------+--------+
|enrollment_id|c_browser|c_server|
+-------------+---------+--------+
|        20683|       29|      34|
|        23271|        9|       8|
|        29228|       61|      78|
|        29744|       67|     125|
|        33717|        6|      18|
+-------------+---------+--------+
only showing top 5 rows



In [11]:
result3 = (
    enrollment_df
        .groupBy("course_id")
        .agg(count("enrollment_id").alias("count_eid_per_course"))
        .orderBy("course_id")
)
result3.show(5)

+--------------------+--------------------+
|           course_id|count_eid_per_course|
+--------------------+--------------------+
|1pvLqtotBsKv7QSOs...|                2392|
|3VkHkmOtom3jM2wCu...|                2008|
|3cnZpv6ReApmCaZya...|                2207|
|5Gyp41oLVo7Gg7vF4...|                2992|
|5X6FeZozNMgE2VRi3...|                 898|
+--------------------+--------------------+
only showing top 5 rows



In [12]:
dropout_df = (
    enrollment_df
        .join(truth_df, ["enrollment_id"])
        .filter(col("dropout") == 1)
        .groupBy("course_id")
        .agg(count("enrollment_id").alias("dropout_per_course"))
        .orderBy("course_id")
)
dropout_df.cache()
dropout_df.show(5)

+--------------------+------------------+
|           course_id|dropout_per_course|
+--------------------+------------------+
|1pvLqtotBsKv7QSOs...|              1383|
|3VkHkmOtom3jM2wCu...|              1217|
|3cnZpv6ReApmCaZya...|              1290|
|5Gyp41oLVo7Gg7vF4...|              1930|
|5X6FeZozNMgE2VRi3...|               603|
+--------------------+------------------+
only showing top 5 rows



In [13]:
droupout_rate_df = (
    course_df
        .join(dropout_df, ["course_id"])
        .select(
            col("course_id"), 
            (col("dropout_per_course") / col("count_eid_per_course")).alias("DropoutRate")
        )
        .orderBy("course_id")
)
droupout_rate_df.show(5)

+--------------------+------------------+
|           course_id|       DropoutRate|
+--------------------+------------------+
|1pvLqtotBsKv7QSOs...|0.5781772575250836|
|3VkHkmOtom3jM2wCu...|0.6060756972111554|
|3cnZpv6ReApmCaZya...|0.5845038513819665|
|5Gyp41oLVo7Gg7vF4...|0.6450534759358288|
|5X6FeZozNMgE2VRi3...|0.6714922048997772|
+--------------------+------------------+
only showing top 5 rows



In [14]:
user_enrollment_count_df = (
    enrollment_df
        .groupBy("username")
        .agg(count("enrollment_id").alias("enrollment_count"))
        .orderBy("username")
)
user_enrollment_count_df.show(5)

+--------------------+----------------+
|            username|enrollment_count|
+--------------------+----------------+
|00038q9llTDdhWUJP...|               1|
|001Wosm650x4ktE3N...|               2|
|001tRjfJQIzbegatO...|               1|
|0089b3aJIRi14gwpk...|               2|
|008XUUt5rc6hUrg7S...|               1|
+--------------------+----------------+
only showing top 5 rows



In [15]:
user_dropout_count_df = (
    enrollment_df
        .join(truth_df, ["enrollment_id"])
        .filter(col("dropout"))
        .groupBy("username").count()
        .withColumnRenamed("count", "dropout_count")
        .orderBy("username")
)
user_dropout_count_df.show(5)

+--------------------+-------------+
|            username|dropout_count|
+--------------------+-------------+
|00038q9llTDdhWUJP...|            1|
|001Wosm650x4ktE3N...|            2|
|0089b3aJIRi14gwpk...|            2|
|00DCGVn7t4aRvR2Cs...|            3|
|00DkxnJmW7N1BC73X...|            1|
+--------------------+-------------+
only showing top 5 rows



In [16]:
(
    user_enrollment_count_df
        .join(user_dropout_count_df, ["username"])
        .select(
            col("username"),
            (col("dropout_count") / col("enrollment_count")).alias("DropoutRate")
        )
        .show(5)
)

+--------------------+-----------+
|            username|DropoutRate|
+--------------------+-----------+
|00038q9llTDdhWUJP...|        1.0|
|001Wosm650x4ktE3N...|        1.0|
|0089b3aJIRi14gwpk...|        1.0|
|00DCGVn7t4aRvR2Cs...|        1.0|
|00DkxnJmW7N1BC73X...|        1.0|
+--------------------+-----------+
only showing top 5 rows



In [19]:
(
    enrollment_df
        .join(log_df, ["enrollment_id"])
        .groupBy("course_id", "username")
        .agg(min("date").alias("first_log_time"), max("date").alias("last_log_time"))
        .withColumn("period", datediff(col("last_log_time"), col("first_log_time")))
        .orderBy("course_id")
        .show()
)

+--------------------+--------------------+-------------------+-------------------+------+
|           course_id|            username|     first_log_time|      last_log_time|period|
+--------------------+--------------------+-------------------+-------------------+------+
|1pvLqtotBsKv7QSOs...|m0g02LLsbqZWAE9Co...|2013-11-26 00:00:00|2013-11-29 00:00:00|     3|
|1pvLqtotBsKv7QSOs...|bvSdd28WpB7SHMz1h...|2013-11-27 00:00:00|2013-12-03 00:00:00|     6|
|1pvLqtotBsKv7QSOs...|EaDnLkVMEsYfdUTQm...|2013-12-18 00:00:00|2013-12-18 00:00:00|     0|
|1pvLqtotBsKv7QSOs...|hdHnMG0t4845BoJsf...|2013-11-27 00:00:00|2013-12-23 00:00:00|    26|
|1pvLqtotBsKv7QSOs...|TcKNtfskDez5flvkR...|2013-11-30 00:00:00|2013-11-30 00:00:00|     0|
|1pvLqtotBsKv7QSOs...|L5ih0rvnVHl900daO...|2013-11-27 00:00:00|2013-12-25 00:00:00|    28|
|1pvLqtotBsKv7QSOs...|PZDWz87kpnLsW0EEs...|2013-11-27 00:00:00|2013-12-11 00:00:00|    14|
|1pvLqtotBsKv7QSOs...|GGioMvSJ2TI5A4koI...|2013-11-27 00:00:00|2013-12-19 00:00:00|    22|

In [54]:
(
    enrollment_df
        .join(log_df, ["enrollment_id"])
        .groupBy("course_id", "username")
        .agg(countDistinct(col("date")).alias("effective_study_days"))
        .orderBy(desc("effective_study_days"))
        .show()
)

+--------------------+--------------------+--------------------+
|           course_id|            username|effective_study_days|
+--------------------+--------------------+--------------------+
|q6A6QG7qMpyNcznyT...|bhmI4wUi4dYE8RHNn...|                  30|
|shM3Yy9vxHn2aqjSY...|SSpeewBDHUNKkvUJM...|                  30|
|mTmmr5zd8l4wXhwiU...|USLTnsAwagwK2c83K...|                  30|
|9Bd26pfDLvkPINwLn...|7m4aBRXMcWXy37TP9...|                  30|
|HbeAZjZFFQUe90oTP...|PNXl2SBlhwK5mHxVh...|                  30|
|9Bd26pfDLvkPINwLn...|USLTnsAwagwK2c83K...|                  30|
|9Bd26pfDLvkPINwLn...|MLIbW3bL7O6QPslL7...|                  30|
|bWdj2GDclj5ofokWj...|rT1OrFQC18rdvYkTG...|                  30|
|I7Go4XwWgpjRJM8EZ...|OTXI4wiLpFntQK2eN...|                  30|
|RXDvfPUBYFlVdlueB...|XjrP1VAGwJCwkRm4K...|                  30|
|shM3Yy9vxHn2aqjSY...|Oiijrt01HwezoanXS...|                  30|
|ykoe1cCWK134BJmfb...|oSETRB1geWgwDVgBT...|                  30|
|5Gyp41oLVo7Gg7vF4...|ZhT

In [33]:
(
    enrollment_df
        .join(log_df, ["enrollment_id"])
        .join(truth_df, ["enrollment_id"])
        .filter(~col("dropout"))
        .groupBy("course_id", "enrollment_id", "event").count()
        .withColumnRenamed("count", "count_event")
        .orderBy("course_id")
        .show()
)

+--------------------+-------------+----------+-----------+
|           course_id|enrollment_id|     event|count_event|
+--------------------+-------------+----------+-----------+
|1pvLqtotBsKv7QSOs...|       115219|discussion|        168|
|1pvLqtotBsKv7QSOs...|       199619|  navigate|          3|
|1pvLqtotBsKv7QSOs...|       171384|page_close|          9|
|1pvLqtotBsKv7QSOs...|       111107|  navigate|         61|
|1pvLqtotBsKv7QSOs...|       113468|     video|         36|
|1pvLqtotBsKv7QSOs...|       111238|page_close|         22|
|1pvLqtotBsKv7QSOs...|        13886|  navigate|         26|
|1pvLqtotBsKv7QSOs...|       112390|      wiki|          4|
|1pvLqtotBsKv7QSOs...|       123136|      wiki|          2|
|1pvLqtotBsKv7QSOs...|       126940|discussion|          5|
|1pvLqtotBsKv7QSOs...|       128696|page_close|         41|
|1pvLqtotBsKv7QSOs...|       133083|     video|         69|
|1pvLqtotBsKv7QSOs...|       152279|     video|         26|
|1pvLqtotBsKv7QSOs...|       111178|disc

In [36]:
(
    enrollment_df
        .join(log_df, ["enrollment_id"])
        .join(truth_df, ["enrollment_id"])
        .filter(~col("dropout"))
        .groupBy("course_id", "enrollment_id", "event").count()
        .withColumnRenamed("count", "count_event")
        .groupBy(col("course_id").alias("c_id"), col("event").alias("sq_event")).agg(avg("count_event"))
        .orderBy("c_id")
        .show()
)

+--------------------+----------+------------------+
|                c_id|  sq_event|  avg(count_event)|
+--------------------+----------+------------------+
|1pvLqtotBsKv7QSOs...|   problem|32.845697329376854|
|1pvLqtotBsKv7QSOs...|  navigate|21.324257425742573|
|1pvLqtotBsKv7QSOs...|page_close|26.376288659793815|
|1pvLqtotBsKv7QSOs...|discussion| 18.91830065359477|
|1pvLqtotBsKv7QSOs...|      wiki|2.7465437788018434|
|1pvLqtotBsKv7QSOs...|     video| 40.42368421052632|
|1pvLqtotBsKv7QSOs...|    access| 94.90306122448979|
|3VkHkmOtom3jM2wCu...|discussion|26.024752475247524|
|3VkHkmOtom3jM2wCu...|   problem| 56.52486187845304|
|3VkHkmOtom3jM2wCu...|     video|36.567460317460316|
|3VkHkmOtom3jM2wCu...|  navigate|24.143356643356643|
|3VkHkmOtom3jM2wCu...|    access| 123.6390977443609|
|3VkHkmOtom3jM2wCu...|      wiki|3.6283783783783785|
|3VkHkmOtom3jM2wCu...|page_close|44.511538461538464|
|3cnZpv6ReApmCaZya...|    access| 80.97765363128492|
|3cnZpv6ReApmCaZya...|discussion| 19.092592592

In [None]:
columns_list = joined_result_no_null.columns
columns_list.remove("enrollment_id")

assembler = VectorAssembler(inputCols=columns_list, outputCol="feature")
assemble_result = assembler.transform(joined_result_no_null)

for column in columns_list:
    assemble_result = assemble.result.drop(column)
    
final_result = assemble_result.sort("enrollment_id", ascending=True)