## Create session

In [1]:
import os
import sys

import json
from datetime import datetime

# Tell PySpark which Python interpreter and which Spark installation to use.
os.environ.update({
    "PYSPARK_PYTHON": "/opt/anaconda/envs/bd9/bin/python",
    "SPARK_HOME": "/usr/hdp/current/spark2-client",
})

# Read the location back from the environment and fail loudly if it is missing.
spark_home = os.environ.get("SPARK_HOME")
if not spark_home:
    raise ValueError("SPARK_HOME environment variable is not set")

# Put Spark's bundled Python sources at the front of the import path:
# the py4j archive first, the pyspark package directory right after it.
sys.path[:0] = [
    os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip"),
    os.path.join(spark_home, "python"),
]

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import Row

# Spark configuration: application name plus two executor instances.
conf = (
    SparkConf()
    .setAppName("artem.spitsin_lab01")
    .set("spark.executor.instances", "2")
)

# getOrCreate() hands back the active session when one exists, otherwise it
# starts a new one with the configuration above.
ss = (
    SparkSession.builder
    .appName("artem.spitsin_lab01")
    .config(conf=conf)
    .getOrCreate()
)

# Low-level context used for the RDD API in the cells below.
sc = ss.sparkContext

# Last expression of the cell: display the session summary.
ss

## Supporting functions

In [2]:
def timestamp2date(timestamp:str, tz=None):
    """Convert a Unix timestamp into a ``'YYYY-MM-DD HH:MM:SS'`` string.

    Args:
        timestamp: Seconds since the Unix epoch, given as a string (or any
            value accepted by ``int()``).
        tz: Optional ``datetime.tzinfo`` in which to render the time. With the
            default ``None`` the local timezone of the machine executing the
            call is used, so the result can differ between hosts; pass
            ``datetime.timezone.utc`` (or another fixed zone) to get a
            host-independent result.

    Returns:
        The formatted date-time string.

    Raises:
        ValueError: If ``timestamp`` cannot be converted to an integer.
    """
    moment = datetime.fromtimestamp(int(timestamp), tz=tz)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def count_rating_films(data:pyspark.rdd.PipelinedRDD, value_rating:int):
    """Count the records in ``data`` whose ``rating`` equals ``value_rating``.

    Args:
        data: RDD whose elements expose a ``rating`` attribute.
        value_rating: Rating value to match.

    Returns:
        The number of matching records, as reported by the RDD's ``count()``.
    """
    def has_requested_rating(record):
        return record.rating == value_rating

    matching_records = data.filter(has_requested_rating)
    return matching_records.count()

## Data loading and preparation

In [3]:
# Read the ratings file, split every line on tabs, and convert the four
# fields into a typed Row (the timestamp becomes a formatted date string).
data = (
    sc.textFile("/labs/laba01/ml-100k/u.data")
    .map(lambda line: line.split("\t"))
    .map(
        lambda fields: Row(
            user_id=int(fields[0]),
            item_id=int(fields[1]),
            rating=int(fields[2]),
            time=timestamp2date(fields[3]),
        )
    )
)

# Peek at the first few parsed records.
data.take(5)

[Row(item_id=242, rating=3, time='1997-12-04 18:55:49', user_id=196),
 Row(item_id=302, rating=3, time='1998-04-04 23:22:22', user_id=186),
 Row(item_id=377, rating=1, time='1997-11-07 10:18:36', user_id=22),
 Row(item_id=51, rating=2, time='1997-11-27 08:02:03', user_id=244),
 Row(item_id=346, rating=1, time='1998-02-02 08:33:16', user_id=166)]

## Analysis

In [4]:
# Film whose rating histogram is compared against the whole dataset.
MY_FILM_ID = 96
# Rating values to tally (1 through 5), in the order used for the output lists.
RATING_VALUES = range(1, 6)

data_my_film = data.filter(lambda row: row.item_id == MY_FILM_ID)

# countByValue() tallies every rating in one Spark job per RDD, instead of
# launching a separate filter+count job for each rating value.
film_rating_counts = data_my_film.map(lambda row: row.rating).countByValue()
all_rating_counts = data.map(lambda row: row.rating).countByValue()

# A rating that never occurs is reported as 0.
result_analysis = {
    "hist_film": [film_rating_counts.get(rating, 0) for rating in RATING_VALUES],
    "hist_all": [all_rating_counts.get(rating, 0) for rating in RATING_VALUES],
}

result_analysis

{'hist_film': [6, 20, 43, 123, 103],
 'hist_all': [6110, 11370, 27145, 34174, 21201]}

In [5]:
# Write the histograms to lab01.json. The context manager closes (and thereby
# flushes) the file even if serialization raises.
with open("lab01.json", "w") as output_file:
    json.dump(result_analysis, output_file)

## Stopping session

In [6]:
# Drop cached tables, then shut the session down. The try/finally ensures the
# session is stopped even if clearing the cache raises.
try:
    ss.catalog.clearCache()
finally:
    ss.stop()