<a href="https://colab.research.google.com/github/anslava00/PySpark_sber/blob/main/PySpark_lesson1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re 
import string
from joblib import Parallel, delayed
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the tab-separated ratings file as raw bytes; the first line is the header.
with open('user_ratedmovies.dat', 'rb') as f:
    raw_lines = f.readlines()

# Decode each line and keep only its first three tab-separated fields.
parsed = [line.decode().split('\t')[:3] for line in raw_lines]
headers = parsed[0]
data = parsed[1:]

In [None]:
# Number of input rows handed to each map task.
batch_size = 1000
# Number of parallel workers; also used as the number of shuffle partitions.
n_nodes = 5
def my_map_rating(rows):
    """Map step: emit ``(row[1], 1)`` for every row whose rating exceeds 4.

    Args:
        rows: Iterable of indexable records. ``row[1]`` becomes the key and
            ``row[2]`` must be convertible to ``float``.

    Returns:
        List of ``(key, 1)`` tuples, in input order, for the rows where
        ``float(row[2])`` is strictly greater than 4.
    """
    emitted = []
    for row in rows:
        rating = float(row[2])
        if rating > 4:
            emitted.append((row[1], 1))
    return emitted


def my_create_chunks(shuffled_data):
    """Group ``(partition_idx, item)`` pairs into one list per partition.

    Args:
        shuffled_data: Iterable of ``(partition_idx, item)`` tuples.

    Returns:
        List of ``(partition_idx, [item, ...])`` tuples. Partitions appear in
        the order they are first seen; items keep their input order.
    """
    chunks = {}
    for idx, item in shuffled_data:
        chunks.setdefault(idx, []).append(item)
    return list(chunks.items())

def my_shuffle_rating(mapper_result, n_nodes=5):
    """Assign each mapped ``(key, value)`` pair to a partition and group them.

    The partition index is a CRC32 of the key's string form modulo
    ``n_nodes``. The built-in ``hash`` is deliberately not used: for ``str``
    keys it is salted per interpreter process, so separate worker processes
    could route the same key to different partitions.

    Args:
        mapper_result: Iterable of ``(key, value)`` pairs from the map step.
        n_nodes: Number of partitions to spread the keys over.

    Returns:
        List of ``(partition_idx, [(key, value), ...])`` tuples ordered by
        partition index, as produced by ``my_create_chunks``.
    """
    import zlib  # local import so the function carries its own dependency

    shuffled_data = [
        (zlib.crc32(str(key).encode('utf-8')) % n_nodes, (key, value))
        for key, value in mapper_result
    ]
    # Stable sort: pairs keep their input order inside each partition.
    shuffled_data.sort(key=lambda pair: pair[0])
    return my_create_chunks(shuffled_data)
    
def reduce_rating(map_row):
    """Reduce step: total the emitted values for each key.

    The emitted ``value`` is summed rather than ignored, so the result stays
    correct if a mapper or combiner ever emits a count other than 1. For
    pairs of the form ``(key, 1)`` this equals the number of occurrences.

    Args:
        map_row: Iterable of ``(key, value)`` pairs belonging to one partition.

    Returns:
        Dict mapping each key to the sum of its values.
    """
    totals = {}
    for key, value in map_row:
        totals[key] = totals.get(key, 0) + value
    return totals

def map_shuffle(row, n_nodes):
    """Run the map step on one batch, then partition its output.

    Args:
        row: Batch of input records, passed to ``my_map_rating``.
        n_nodes: Number of partitions, passed to ``my_shuffle_rating``.

    Returns:
        The value returned by ``my_shuffle_rating`` for the mapped batch.
    """
    return my_shuffle_rating(my_map_rating(row), n_nodes)

In [None]:
# Map + local shuffle: one task per batch of `batch_size` rows.
# A plain slice is enough for the last batch because slice bounds are clamped
# to the list length; ending it at `data_size - 1` would drop the final row.
with Parallel(n_jobs=n_nodes, verbose=10) as parallel:
    data_size = len(data)
    res = parallel(
        delayed(map_shuffle)(data[i:i + batch_size], n_nodes)
        for i in range(0, data_size, batch_size)
    )

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    1.1s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    1.1s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    1.2s
[Parallel(n_jobs=5)]: Batch computation too fast (0.1871s.) Setting batch_size=2.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    1.2s
[Parallel(n_jobs=5)]: Done  32 tasks      | elapsed:    1.3s
[Parallel(n_jobs=5)]: Batch computation too fast (0.0588s.) Setting batch_size=4.
[Parallel(n_jobs=5)]: Done  50 tasks      | elapsed:    1.3s
[Parallel(n_jobs=5)]: Batch computation too fast (0.0418s.) Setting batch_size=8.
[Parallel(n_jobs=5)]: Done  98 tasks      | elapsed:    1.4s
[Parallel(n_jobs=5)]: Batch computation too fast (0.1115s.) Setting batch_size=16.
[Parallel(n_jobs=5)]: Batch computation too fast (0.1412s.) Setting batch_size=32.
[Parallel(n_jobs=5)]: Done 202 tasks      | elapsed:    1.6s
[Parallel(n_jobs=5)]: Don

In [None]:
# Peek at the first map task's output, one partition per paragraph.
# The loop variable is not named `re`, which would overwrite the imported module.
for chunk in res[0]:
    print(chunk, end='\n\n')

(0, [('32', 1), ('996', 1), ('2571', 1), ('2959', 1), ('32', 1), ('912', 1), ('969', 1), ('994', 1), ('1077', 1), ('1089', 1), ('1188', 1), ('1197', 1), ('1208', 1), ('1211', 1), ('1243', 1), ('1253', 1), ('1265', 1), ('1288', 1), ('1301', 1), ('1391', 1), ('1629', 1), ('1913', 1), ('1916', 1), ('1921', 1), ('1931', 1), ('1965', 1), ('2064', 1), ('2174', 1), ('2290', 1), ('2291', 1), ('2359', 1), ('2455', 1), ('2571', 1), ('2692', 1), ('2919', 1), ('2973', 1), ('3104', 1), ('3201', 1), ('3271', 1), ('3470', 1), ('3676', 1), ('3814', 1), ('3925', 1), ('3967', 1), ('4338', 1), ('4407', 1), ('4467', 1), ('4641', 1), ('4918', 1), ('5198', 1), ('5288', 1), ('5747', 1), ('5881', 1), ('6678', 1), ('7020', 1), ('7123', 1), ('7361', 1), ('9005', 1), ('51662', 1), ('2959', 1), ('104', 1), ('307', 1), ('480', 1), ('593', 1), ('595', 1), ('903', 1), ('912', 1), ('1089', 1), ('1101', 1), ('1197', 1), ('1201', 1), ('1208', 1), ('1288', 1), ('1610', 1), ('1757', 1), ('1913', 1), ('1921', 1), ('2291',

In [None]:
# Global shuffle: concatenate every batch's pairs per partition.
# Sized by `n_nodes` (not a literal 5) so changing the worker count cannot
# produce a partition index that is missing from the dict.
shuffle_stage = {node: [] for node in range(n_nodes)}
for batch_chunks in res:
    for node, pairs in batch_chunks:
        shuffle_stage[node].extend(pairs)

In [None]:
# Report how many (key, value) pairs landed in each partition.
for key, pairs in shuffle_stage.items():
    print(f'{key}: number of words = {len(pairs)}')

0: number of words = 42624
1: number of words = 43393
2: number of words = 47261
3: number of words = 44621
4: number of words = 46101


In [None]:
# Reduce stage: one reduce_rating task per partition, in partition order.
with Parallel(n_jobs=n_nodes, verbose=10, batch_size=5) as parallel:
    reduce_tasks = (delayed(reduce_rating)(pairs) for pairs in shuffle_stage.values())
    res = parallel(reduce_tasks)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    1.0s remaining:    1.4s
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed:    1.1s remaining:    0.7s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.2s finished


In [None]:
# Number of result dicts returned by the reduce stage (one per partition).
len(res)

5

In [None]:
# Merge the per-partition dicts into a single key -> total mapping.
result = {}
for partition in res:
    for key, count in partition.items():
        result[key] = result.get(key, 0) + count

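Проверка (sanity check): the number of ratings above 4 for film 2571, counted directly with pandas, should match the MapReduce result for that film.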

In [None]:
# Cross-check with pandas: rows for film 2571 whose rating is above 4.
temp = pd.DataFrame(data, columns=('one', 'two', 'tree')).astype(float)
is_target_film = temp.two == 2571
is_high_rating = temp.tree > 4
temp[is_target_film & is_high_rating].shape

(1258, 3)

**("Id_film", "count_rating")**

In [None]:
# Rank the keys by their total, highest first.
sorted(
    result.items(),
    key=lambda key_count: key_count[1],
    reverse=True,
)

[('2571', 1258),
 ('318', 1239),
 ('296', 1208),
 ('2959', 1136),
 ('4993', 1059),
 ('7153', 998),
 ('5952', 974),
 ('858', 965),
 ('50', 946),
 ('2858', 939),
 ('4226', 890),
 ('593', 853),
 ('260', 847),
 ('1196', 827),
 ('4973', 818),
 ('1198', 816),
 ('356', 807),
 ('527', 782),
 ('7361', 767),
 ('1136', 755),
 ('47', 741),
 ('2762', 715),
 ('1193', 673),
 ('1221', 667),
 ('608', 640),
 ('541', 637),
 ('2028', 625),
 ('6874', 617),
 ('750', 617),
 ('32', 608),
 ('32587', 607),
 ('1089', 599),
 ('2329', 589),
 ('3578', 574),
 ('110', 573),
 ('8961', 570),
 ('1210', 563),
 ('4878', 561),
 ('589', 559),
 ('1206', 558),
 ('912', 544),
 ('1197', 543),
 ('1270', 543),
 ('1213', 543),
 ('1214', 540),
 ('6016', 531),
 ('7438', 529),
 ('4306', 529),
 ('1208', 527),
 ('1258', 523),
 ('6377', 521),
 ('6539', 515),
 ('4995', 512),
 ('3996', 511),
 ('1291', 505),
 ('293', 504),
 ('6711', 503),
 ('2997', 496),
 ('1265', 495),
 ('1704', 495),
 ('924', 493),
 ('33794', 490),
 ('1080', 475),
 ('402