# 0. Importing PySpark

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running context/session, so this cell can be
# re-executed safely; calling SparkContext() a second time in the same kernel
# fails because only one context may be active at a time.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# 1. Loading the dataset

In [3]:
import csv 

In [4]:
# Later cells index columns 0..11, so a row with fewer fields cannot be used.
EXPECTED_COLUMNS = 12

# textFile yields raw lines; csv.reader consumes each partition's line iterator
# so that quoted fields containing commas stay in one column.
# The filter drops short rows and any row whose first field is the literal
# 'product_id' (presumably the header row).
# cache(): rdd1 feeds every action below; without it each action would re-read
# and re-parse the whole file.
rdd1 = (
    sc.textFile("digikala_comments.csv")
      .mapPartitions(lambda lines: csv.reader(lines, delimiter=',', quotechar='"'))
      .filter(lambda row: len(row) >= EXPECTED_COLUMNS and row[0] != 'product_id')
      .cache()
)

In [27]:
# Inspect one parsed record to see the column layout (a list of string fields).
rdd1.take(1)

[['3692',
  'ماوس بی\u200cسیم لاجیتک مدل M325',
  'IT',
  '989472',
  '0',
  '0',
  'verified',
  '\\N',
  '',
  'واقعا عالیه. من که ازش خیلی راضیم',
  '',
  '']]

# 2. Most popular item

In [44]:
# Per-row score is column 4 minus column 5 (presumably likes and dislikes —
# TODO confirm against the CSV header), summed per product id (column 0).
net_score_by_product = (
    rdd1.map(lambda row: (row[0], int(row[4]) - int(row[5])))
        .reduceByKey(lambda running, delta: running + delta)
)
# Display the (product_id, score) pair with the highest total score.
net_score_by_product.max(key=lambda pair: pair[1])

('700304', 368)

# 3. Percentage of unverified comments

In [5]:
# Number of parsed rows in rdd1; used as the denominator of the percentage.
total_count = rdd1.count()

In [6]:
# Count the rows whose column 6 is anything other than the literal 'verified'.
not_verified_count = (
    rdd1.map(lambda row: row[6])
        .filter(lambda status: status != 'verified')
        .count()
)

In [7]:
# Share of rows not marked 'verified', in percent. With an empty dataset the
# division would raise ZeroDivisionError, so report 0.0 in that case instead.
unverified_pct = (not_verified_count / total_count) * 100 if total_count else 0.0
print(unverified_pct)

1.7600964436407474


# 4. The longest word in the comments

In [28]:
# Split column 9 (the comment text in the sample record) on whitespace and keep
# the longest token together with its length.
# NOTE(review): tokens are split on whitespace only, so runs of punctuation and
# "_x000D_" artefacts count as part of a "word" — confirm this is intended.
def _longer(left, right):
    # On equal lengths the right-hand pair wins.
    return left if left[1] > right[1] else right

rdd1.flatMap(lambda row: row[9].split())\
    .map(lambda token: (token, len(token)))\
    .reduce(_longer)

('بذارید_x000D__x000D_---------------------------------------------------------------_x000D__x000D_یکی',
 100)

# 5. Top 10 words in advantages and disadvantages

In [31]:
import ast

In [21]:
# Letters treated as Persian by the cells below; every other character is
# considered noise. Built from one string: list(str) gives one entry per letter.
persian_chars = list("آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی")

In [109]:
def preprocess_words(x, allowed=None):
    """Blank out every character of ``x`` that is not an allowed letter.

    Args:
        x: Text to clean.
        allowed: Iterable of single characters to keep. Defaults to the
            module-level ``persian_chars``.

    Returns:
        A string of the same length as ``x`` in which each character that is
        not in ``allowed`` has been replaced by a single space.
    """
    if allowed is None:
        allowed = persian_chars
    keep = frozenset(allowed)
    # Single pass over the text. The previous version called str.replace once
    # per offending character, rescanning the whole string every time.
    return "".join(ch if ch in keep else " " for ch in x)

In [133]:
def get_top_10_words(column_num):
    """Return the ten most frequent words in one column of ``rdd1``.

    Args:
        column_num: Zero-based index of the column to analyse.

    Returns:
        A list of up to ten ``(word, count)`` tuples, highest count first.
        Empty cells are ignored and each cell is cleaned with
        ``preprocess_words`` before being split on whitespace.
    """
    non_empty_cells = rdd1.map(lambda row: row[column_num])\
                          .filter(lambda cell: cell != '')
    word_counts = non_empty_cells\
        .flatMap(lambda cell: preprocess_words(cell).split())\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda left, right: left + right)
    # Negated count as the key makes takeOrdered return the largest counts.
    return word_counts.takeOrdered(10, key=lambda pair: -pair[1])

Advantages column:

In [134]:
# Column 10 is the advantages column.
get_top_10_words(column_num=10)

[('و', 1402),
 ('کیفیت', 807),
 ('خوب', 798),
 ('عالی', 665),
 ('مناسب', 486),
 ('بالا', 402),
 ('زیبا', 394),
 ('قیمت', 359),
 ('به', 331),
 ('با', 330)]

Disadvantages column:

In [135]:
# Column 11 is the disadvantages column.
get_top_10_words(column_num=11)

[('و', 486),
 ('ندارد', 402),
 ('کیفیت', 282),
 ('به', 258),
 ('در', 254),
 ('از', 245),
 ('ضعیف', 215),
 ('کم', 212),
 ('نداره', 204),
 ('خیلی', 173)]

# 6. Most frequent character in product titles

In [136]:
# Explode column 1 (the product title in the sample record) into characters,
# keep only the letters listed in persian_chars, and count each one.
# NOTE(review): every row contributes its title, so a product that appears in
# many rows is counted once per row — confirm this weighting is intended.
title_char_counts = (
    rdd1.flatMap(lambda row: row[1])
        .filter(lambda ch: ch in persian_chars)
        .map(lambda ch: (ch, 1))
        .reduceByKey(lambda left, right: left + right)
)
# Negated count as the key returns the single most frequent character.
title_char_counts.takeOrdered(1, key=lambda pair: -pair[1])

[('ی', 24138)]