# Most popular rating 

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark import sql

import pandas as pd

In [62]:
conf = SparkConf().setAppName("MostPopularMovieWithRatingAbove4RDDOnly")
sc = SparkContext(conf=conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by _create_shell_session at /home/blake/anaconda3/pkgs/pyspark-2.4.4-py_0/site-packages/pyspark/python/pyspark/shell.py:41 

## Using lambdas only

### Load Data

In [6]:
path_ratings = r"/home/blake/PycharmProjects/BigDataMovies/the-movies-dataset/ratings_small.csv"
path_movies_metadata = r"/home/blake/PycharmProjects/BigDataMovies/the-movies-dataset/movies_metadata.csv"

ratings = sc.textFile(path_ratings)
movies_metadata = sc.textFile(path_movies_metadata)

### Define useful functions

In [7]:
def header_dropper(rdd: pyspark.rdd.RDD):
    header = rdd.first()
    print(header)
    no_header_rdd = rdd.filter(lambda row: row != header)
    return no_header_rdd

### Drop Headers

In [65]:
ratings = header_dropper(ratings)

1,31,2.5,1260759144


In [66]:
movies_metadata = header_dropper(movies_metadata)

Row(adult='False', belongs_to_collection="{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}", budget='30000000', genres="[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]", homepage='http://toystory.disney.com/toy-story', id='862', imdb_id='tt0114709', original_language='en', original_title='Toy Story', overview="Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.", popularity='21.946943', poster_path='/rhIRbceoE9lR4veEXuwCC2wARtG.jpg', production_companies="[{'name': 'Pixar Animation Studios', 'id': 3}]", production_countries="[{'iso_3166_1': 'US', 'name': 'United States of America'}]", releas

### Analysis

#### Create an RDD with highest ratings with count of ratings above 50

In [67]:
ratings.take(5)

['1,1029,3.0,1260759179',
 '1,1061,3.0,1260759182',
 '1,1129,2.0,1260759185',
 '1,1172,4.0,1260759205',
 '1,1263,2.0,1260759151']

In [68]:
movie_id_and_rating = ratings.map(lambda line: line.split(',')[1:3])
movie_id_and_rating.take(5)

[['1029', '3.0'],
 ['1061', '3.0'],
 ['1129', '2.0'],
 ['1172', '4.0'],
 ['1263', '2.0']]

In [69]:
# Result: movie_id: int, (rating: float, count: int)
movie_id_and_rating_right_formats = movie_id_and_rating.map(
    lambda line: (int(line[0]), (float(line[1]), 1)))
movie_id_and_rating_right_formats.take(6)

[(1029, (3.0, 1)),
 (1061, (3.0, 1)),
 (1129, (2.0, 1)),
 (1172, (4.0, 1)),
 (1263, (2.0, 1)),
 (1287, (2.0, 1))]

In [70]:
movie_id_and_sum_rating_and_sum_count = \
    movie_id_and_rating_right_formats\
        .reduceByKey(lambda row_a, row_b: (row_a[0]+row_b[0], row_a[1]+row_b[1]))
movie_id_and_sum_rating_and_sum_count.collect();

In [71]:
movie_id_and_avg_rating_and_count = \
    movie_id_and_sum_rating_and_sum_count\
        .map(lambda row: (row[0], (row[1][0]/row[1][1], row[1][1])))
movie_id_and_avg_rating_and_count.take(5)

[(1172, (4.260869565217392, 46)),
 (2150, (3.513888888888889, 36)),
 (2294, (3.2735849056603774, 53)),
 (2968, (3.5697674418604652, 43)),
 (10, (3.4508196721311477, 122))]

In [72]:
movie_id_and_avg_rating_and_count_above_50 = \
    movie_id_and_avg_rating_and_count\
        .filter(lambda row: row[1][1] >= 50)
movie_id_and_avg_rating_and_count_above_50.take(5)

[(2294, (3.2735849056603774, 53)),
 (10, (3.4508196721311477, 122)),
 (50, (4.370646766169155, 201)),
 (52, (3.6372549019607843, 51)),
 (62, (3.689655172413793, 87))]

In [73]:
movie_id_and_avg_rating_and_count_above_50_ordered_by_rating = \
    movie_id_and_avg_rating_and_count_above_50\
        .sortBy(lambda row: row[1][0], False)
movie_id_and_avg_rating_and_count_above_50_ordered_by_rating.take(10)

[(858, (4.4875, 200)),
 (318, (4.487138263665595, 311)),
 (969, (4.42, 50)),
 (913, (4.387096774193548, 62)),
 (1221, (4.385185185185185, 135)),
 (50, (4.370646766169155, 201)),
 (1228, (4.35, 50)),
 (1252, (4.3355263157894735, 76)),
 (904, (4.315217391304348, 92)),
 (1203, (4.304054054054054, 74))]

#### Create an RDD movie_id and movie_title

In [74]:
# movie_medadata file is actually quite broken and it'd be hard 
#to read it and clean it in a form of RDD so I will use 
# DF for convenience and transform it to RDD.
movies_metadata = spark.read.csv(
    path=path_movies_metadata,
    header="true",
    inferSchema="true",
#     quote=""
)

movies_metadata = movies_metadata.rdd
type(movies_metadata)

pyspark.rdd.RDD

In [103]:
def get_id_and_title(line):
    movie_id = line['id']
    try:
        movie_id = int(movie_id)
    except (TypeError, ValueError):
        pass
    movie_titile = line['title']
    return movie_id, movie_titile

In [104]:
movide_id_and_title = movies_metadata.map(get_id_and_title)
movide_id_and_title.collect()

[(862, 'Toy Story'),
 (8844, 'Jumanji'),
 (15602, 'Grumpier Old Men'),
 (31357, "[{'iso_639_1': 'en', 'name': 'English'}]"),
 (11862, 'Father of the Bride Part II'),
 (949, 'Heat'),
 (11860, 'Sabrina'),
 (45325, 'Tom and Huck'),
 (9091, 'Sudden Death'),
 (710, 'GoldenEye'),
 (9087, 'The American President'),
 (12110, 'Dracula: Dead and Loving It'),
 (21032, 'Balto'),
 (10858, 'Nixon'),
 (1408, 'Cutthroat Island'),
 (524, 'Casino'),
 (4584, 'Sense and Sensibility'),
 (5, 'Four Rooms'),
 (9273, 'Ace Ventura: When Nature Calls'),
 (11517, 'Money Train'),
 (8012, '105.0'),
 (1710, 'Copycat'),
 (9691, 'Assassins'),
 (12665, 'Powder'),
 (451, 'Leaving Las Vegas'),
 (16420, 'Othello'),
 (9263, 'Now and Then'),
 (17015, 'Persuasion'),
 (902, " {'name': 'Victoires Productions'"),
 (37557, 'Shanghai Triad'),
 (9909, 'Dangerous Minds'),
 (63, 'Twelve Monkeys'),
 (78802, 'Wings of Courage'),
 (9598, 'Babe'),
 (47018, 'Carrington'),
 (687, 'Dead Man Walking'),
 (139405, 'Across the Sea of Time'),
 

#### Join the above RDDs and return the final result

In [105]:
movie_id_and_avg_rating_and_count_above_50_ordered_by_rating.first()

(858, (4.4875, 200))

In [106]:
movide_id_and_title.first()

(862, 'Toy Story')

In [112]:
results = movie_id_and_avg_rating_and_count_above_50_ordered_by_rating\
    .join(movide_id_and_title)

#### RESULTS

In [113]:
results.take(10)

[(858, ((4.4875, 200), 'Sleepless in Seattle')),
 (913, ((4.387096774193548, 62), 'The Thomas Crown Affair')),
 (2959,
  ((4.178217821782178, 202), 'First came love... then came Reverend Frank.')),
 (1089, ((4.162878787878788, 132), 'Point Break')),
 (5995, ((4.131147540983607, 61), 'Miffo')),
 (110, ((3.9451754385964914, 228), 'Three Colors: Red')),
 (924, ((3.886178861788618, 123), 'Dawn of the Dead')),
 (1265, ((3.8393939393939394, 165), 'Bridge to Terabithia')),
 (5060, ((3.7457627118644066, 59), 'Carry On Screaming')),
 (1584, ((3.7282608695652173, 92), 'School of Rock'))]

## More readable version using defined functions