## Create session

In [1]:
import os
import sys

# Point PySpark at the worker interpreter and at the cluster's Spark 2 client install.
os.environ.update({
    "PYSPARK_PYTHON": "/opt/anaconda/envs/bd9/bin/python",
    "SPARK_HOME": "/usr/hdp/current/spark2-client",
})

spark_home = os.environ.get("SPARK_HOME")
if not spark_home:
    raise ValueError("SPARK_HOME environment variable is not set")

# Each entry is pushed to the front of sys.path, so the py4j archive (inserted last)
# ends up ahead of the pyspark sources.
for relative_path in ("python", "python/lib/py4j-0.10.7-src.zip"):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

import pyspark
from pyspark import keyword_only
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml import Transformer 
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.param import Param, Params, TypeConverters

import re
import json

from tqdm import tqdm

import pandas as pd # For using pd.notna

APP_NAME = "artem.spitsin_lab02"

# Spark configuration: application name plus the requested number of executors.
conf = (
    SparkConf()
    .setAppName(APP_NAME)
    .set("spark.executor.instances", "2")
)

# getOrCreate() returns the already running session if there is one.
ss = (
    SparkSession.builder
    .appName(APP_NAME)
    .config(conf=conf)
    .getOrCreate()
)
ss

## Tokenizer implementation

In [2]:
# Implementing a custom RegexTokenizer to add unicode support
class CustomRegexTokenizer(Transformer, HasInputCol, HasOutputCol):
    """
    Tokenizes a string column by lower-casing it and collecting every
    non-overlapping match of a regular expression (unicode-aware).

    Params:
        inputCol: name of the string column to tokenize.
        outputCol: name of the array<string> column that receives the tokens.
        pattern: regular expression whose matches become the tokens;
            defaults to runs of two or more word characters.
    """
    pattern = Param(
        Params._dummy(), "pattern",
        "a regular expression for allocating tokens",
        typeConverter=TypeConverters.toString
    )

    @keyword_only
    def __init__(self, inputCol:str=None, outputCol:str=None, pattern:str=r"[\w]{2,}"):
        super(CustomRegexTokenizer, self).__init__()
        # `keyword_only` only records the arguments the caller passed explicitly,
        # so the signature default never reaches the param map by itself.
        # Register it as the param default; otherwise `getOrDefault("pattern")`
        # fails whenever `pattern` is omitted.
        self._setDefault(pattern=r"[\w]{2,}")
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def _transform(self, data):
        """Return `data` with the token array stored in the output column."""
        # Resolve the pattern on the driver; the UDF closure then only carries a string.
        pattern = self.getOrDefault("pattern")

        def extract_tokens(text):
            # A null input has nothing to tokenize: return an empty token list
            # instead of raising on `None.lower()` inside the worker.
            if text is None:
                return []
            return re.findall(pattern, text.lower(), flags=re.U)

        tokenize = F.udf(f=extract_tokens, returnType=ArrayType(StringType()))

        return data.withColumn(
            self.getOutputCol(),
            tokenize(F.col(self.getInputCol()))
        )

## Loading data

In [3]:
# Courses to build recommendations for, as [course id, language code] pairs.
given_courses = [
    [23126, "en"],
    [21617, "en"],
    [16627, "es"],
    [11556, "es"],
    [16704, "ru"],
    [13702, "ru"],
]

In [4]:
# Course catalogue; presumably one JSON record per line, as the file name suggests.
courses_path = "/labs/slaba02/DO_record_per_line.json"
courses = ss.read.json(courses_path)
courses.show(n=3)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 3 rows



## Creating pipelines

In [5]:
# Course `lang` code -> language name passed to StopWordsRemover.loadDefaultStopWords.
map_languages = {
    "ru": "russian",
    "en": "english",
    "es": "spanish"
}

# Raw string: `\w` and `\d` are regex escapes, not Python string escapes, so a
# plain literal triggers an invalid-escape warning. The pattern value is unchanged.
tokenizer = CustomRegexTokenizer(inputCol="desc", outputCol="tokens", pattern=r"[\w\d]{2,}")

# One pipeline per language: tokenize -> drop stop words -> hashed term counts -> IDF weighting.
pipelines = {}
for key_lang, language in map_languages.items():

    removerStopWords = StopWordsRemover(
        inputCol=tokenizer.getOutputCol(),
        outputCol="clear_tokens",
        stopWords=StopWordsRemover.loadDefaultStopWords(language)
    )

    hasher = HashingTF(
        numFeatures=10_000,
        binary=False,
        inputCol=removerStopWords.getOutputCol(),
        outputCol="tf_embedding"
    )
    tfidf = IDF(inputCol=hasher.getOutputCol(), outputCol="embedding")

    # The tokenizer stage is the same object in every pipeline; only the
    # stop-word list differs between languages.
    pipelines[key_lang] = Pipeline(stages=[tokenizer, removerStopWords, hasher, tfidf])

## Data preparation

In [6]:
%%time
# For every supported language: embed its courses, then pair each course with
# every other course of the same language.
courses_with_candidates = {}

for lang in map_languages:
    courses_by_lang = courses.filter(f"lang = '{lang}'")
    fitted_pipeline = pipelines[lang].fit(courses_by_lang)
    preprocessed_courses = fitted_pipeline.transform(courses_by_lang)

    # Same rows with every column prefixed, so both sides of the self-join
    # can be addressed by name.
    candidates_courses = preprocessed_courses.toDF(
        *["candidate_" + name for name in preprocessed_courses.columns]
    )

    same_language = F.col("lang") == F.col("candidate_lang")
    different_course = F.col("id") != F.col("candidate_id")

    # NOTE(review): nothing here is cached, so later actions presumably re-run
    # the pipeline for each query - confirm whether caching is wanted.
    courses_with_candidates[lang] = preprocessed_courses.join(
        other=candidates_courses,
        on=same_language & different_course,
        how="left",
    )

CPU times: user 141 ms, sys: 54.9 ms, total: 195 ms
Wall time: 11.1 s


## Getting recommendations

In [7]:
@F.udf(returnType=FloatType())
def cosine_similarity(vector1, vector2):
    """
    Cosine similarity between two vectors exposing `dot` and `norm`.

    Returns:
        The similarity as a Python float; 0.0 when either vector has zero
        length or the result is NaN; None when either vector is missing.
    """
    # A missing vector has no similarity; propagate null instead of raising.
    if vector1 is None or vector2 is None:
        return None

    denominator = float(vector1.norm(2)) * float(vector2.norm(2))
    # Guard the zero-length case explicitly rather than relying on a NaN result.
    if denominator == 0.0:
        return 0.0

    similarity = float(vector1.dot(vector2)) / denominator
    # The fallback must be a float: a Python int returned from a FloatType UDF
    # does not match the declared type and comes back as null.
    return similarity if pd.notna(similarity) else 0.0

In [8]:
%%time

# Ranking: most similar first, ties broken by candidate name and then candidate id.
ranking_order = [
    F.col("similarity").desc(),
    F.col("candidate_name").asc(),
    F.col("candidate_id").asc(),
]

# Maps each requested course id to its (still lazy) ranked candidate frame.
result = {}
for course_id, lang_course in given_courses:
    course_pairs = courses_with_candidates[lang_course].where(f"id = {course_id}")
    scored_pairs = course_pairs.withColumn(
        "similarity",
        cosine_similarity("embedding", "candidate_embedding"),
    )
    result[course_id] = scored_pairs.sort(*ranking_order)

CPU times: user 14.7 ms, sys: 1.3 ms, total: 16 ms
Wall time: 105 ms


In [9]:
num_recommendations = 10

# Materialize the top candidates per requested course; keys are stringified course ids.
result_for_saving = {}
for course_id, recommendations in tqdm(result.items()):
    top_rows = (
        recommendations
        .limit(num_recommendations)
        .select("candidate_id")
        .collect()
    )
    result_for_saving[str(course_id)] = [row.candidate_id for row in top_rows]

100%|██████████| 6/6 [00:59<00:00,  9.87s/it]


In [10]:
# Persist the recommendations as a single JSON document.
with open("lab02.json", "w") as output_file:
    output_file.write(json.dumps(result_for_saving))

## Stopping session

In [11]:
# Clear the session's cache through the catalog, then stop the Spark session.
ss.catalog.clearCache()
ss.stop()