# Analytical Queries:


In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring, col

In [3]:
home_dir = "/Users/jhansiboda/Desktop/MovieAnalytics/Movie-Analytics-project/MovieLens Dataset/"

In [4]:
print(home_dir)

/Users/jhansiboda/Desktop/MovieAnalytics/Movie-Analytics-project/MovieLens Dataset/


In [5]:
# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/20 21:37:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
movies = spark.read.option("header", "False").option("delimiter","::").option("inferSchema","true").csv(home_dir + "movies.csv")

In [7]:
print(movies)

DataFrame[_c0: int, _c1: string, _c2: string]


In [8]:
movies.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [9]:
movies.dtypes

[('_c0', 'int'), ('_c1', 'string'), ('_c2', 'string')]

In [10]:
movies.show(10)

+---+--------------------+--------------------+
|_c0|                 _c1|                 _c2|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Animation|Childre...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|        Comedy|Drama|
|  5|Father of the Bri...|              Comedy|
|  6|         Heat (1995)|Action|Crime|Thri...|
|  7|      Sabrina (1995)|      Comedy|Romance|
|  8| Tom and Huck (1995)|Adventure|Children's|
|  9| Sudden Death (1995)|              Action|
| 10|    GoldenEye (1995)|Action|Adventure|...|
+---+--------------------+--------------------+
only showing top 10 rows



In [11]:
movies.count()

3883

In [12]:
column_names = ["movie_id","title","genres"]
df_movie = movies.toDF(*column_names)

# Create an accumulator for counting
count_accumulator = spark.sparkContext.accumulator(0)

In [13]:

# Find number of movies released between year 1920 to 1930

df_movie_with_year = df_movie.withColumn("year", substring('title', -5, 4).cast("int"))

df_movie_with_year.filter(col("year")== 1920).show()
df_movie_with_year.foreach(lambda row: count_accumulator.add(1) if row.year == 1920 else None)

v = count_accumulator.value
print(v)

+--------+--------------------+------+----+
|movie_id|               title|genres|year|
+--------+--------------------+------+----+
|    3231| Saphead, The (1920)|Comedy|1920|
|    3309|Dog's Life, A (1920)|Comedy|1920|
+--------+--------------------+------+----+



[Stage 7:>                                                          (0 + 1) / 1]

2


                                                                                

# 1) What are the top 10 most viewed movies?

In [17]:
# 1) What are the top 10 most viewed movies?
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
# Using RDD:
rdd = sc.textFile(home_dir+"ratings.csv")
rdd2 = rdd.map(lambda x: x.split("::"))
rdd3 = rdd2.map(lambda x: x[1],1)
rdd3.take(1)
                                                                     
rdd3 = rdd2.map(lambda x: (x[1],1))
rdd3.take(1)
                                                                   

rdd4 = rdd3.reduceByKey(lambda x,y : x+y)
rdd4.take(1)
                                                               
rdd5 = rdd4.sortBy(lambda x: x[1], ascending = False)
rdd5.take(10)

                                                                                

[('2858', 3428),
 ('260', 2991),
 ('1196', 2990),
 ('1210', 2883),
 ('480', 2672),
 ('2028', 2653),
 ('589', 2649),
 ('2571', 2590),
 ('1270', 2583),
 ('593', 2578)]

# 2. What are the distinct list of genres available

In [18]:
# What are the distinct list of genres available

rdd2 = rdd.map(lambda line: line.split("::"))
rdd2.first()

['1', '1193', '5', '978300760']

In [19]:
genres = rdd2.flatMap(lambda line: line[2].split("|")).distinct()
genres.collect()


                                                                                

['4', '1', '5', '3', '2']

In [20]:
# Using DF:
genres = df_movie.select(explode(split("genres", "\\|")).alias("Movie_geners")).distinct()
genres.show()

+------------+
|Movie_geners|
+------------+
|       Crime|
|     Romance|
|    Thriller|
|   Adventure|
|  Children's|
|       Drama|
|         War|
| Documentary|
|     Fantasy|
|     Mystery|
|     Musical|
|   Animation|
|   Film-Noir|
|      Horror|
|     Western|
|      Comedy|
|      Action|
|      Sci-Fi|
+------------+



# 3. How many movies for each genre?

In [28]:
from operator import add
rdd = sc.textFile(home_dir+"movies.csv")
rdd2 = rdd.map(lambda line: line.split("::"))
genres = rdd2.flatMap(lambda line: line[2].split("|"))
rdd3 = genres.map(lambda row: (row,1))
rdd5 = rdd3.sortByKey()
rdd5.collect()
# [('Action', 503), ('Adventure', 283), ('Animation', 105), ("Children's", 251), ('Comedy', 1200), ('Crime', 211), ('Documentary', 127), ('Drama', 1603), ('Fantasy', 68), ('Film-Noir', 44), ('Horror', 343), ('Musical', 114), ('Mystery', 106), ('Romance', 471), ('Sci-Fi', 276), ('Thriller', 492), ('War', 143), ('Western', 68)]




# Using DF:

all_genres = df_movie.select(explode(split("genres", "\\|")).alias('genres'))

all_genres.groupBy('genres').agg(count('genres').alias("Total_movies")).show()

+-----------+------------+
|     genres|Total_movies|
+-----------+------------+
|      Crime|         211|
|    Romance|         471|
|   Thriller|         492|
|  Adventure|         283|
| Children's|         251|
|      Drama|        1603|
|        War|         143|
|Documentary|         127|
|    Fantasy|          68|
|    Mystery|         106|
|    Musical|         114|
|  Animation|         105|
|  Film-Noir|          44|
|     Horror|         343|
|    Western|          68|
|     Comedy|        1200|
|     Action|         503|
|     Sci-Fi|         276|
+-----------+------------+



# 4. How many movies are starting with numbers or letters (Example: Starting with 1/2/3../A/B/C..Z)?


In [25]:
rdd = sc.textFile(home_dir+"movies.csv")
rdd2 = rdd.map(lambda line: line.split("::"))
movie_name = rdd2.filter(lambda x: x[1][0].isdigit())
movie_name.first()

movie_count = rdd2.filter(lambda x: x[1][0].isdigit() or x[1][0].isalpha()).count()
print(movie_count)




# Using DF to find movies starting with number 1:

df_movie.filter(col('title').startswith('1')).distinct().show()


3878
+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|    2572|10 Things I Hate ...|      Comedy|Romance|
|    1367|101 Dalmatians (1...|   Children's|Comedy|
|     889|        1-900 (1994)|             Romance|
|    2085|101 Dalmatians (1...|Animation|Children's|
|    1609|          187 (1997)|               Drama|
|    2826|13th Warrior, The...|Action|Horror|Thr...|
|    1203| 12 Angry Men (1957)|               Drama|
+--------+--------------------+--------------------+



# 5. List the latest released movies

In [27]:
# Using RDD:

rdd = sc.textFile(home_dir+"movies.csv")
rdd2 = rdd.map(lambda line: line.split("::"))
rdd3_year =  rdd2.map(lambda x: (x[0],x[1],x[2], int(x[1][-5:-1])))
rdd3_year.first()

rdd4_sort = rdd3_year.sortBy(lambda x: x[3], ascending = False)
rdd4_sort.take(10)




# Using DF:

from pyspark.sql.functions import substring
df_movie_new = df_movie.withColumn('year', substring(df_movie['title'], -5,4).cast("int"))
df_movie_new.show()

max_year = df_movie_new.select(max(col("year"))).first()[0]
print(max_year)

df_movie_new.filter(col("year") == max_year).show()


df_movie_new.filter(col("year") == max_year).count()



+--------+--------------------+--------------------+----+
|movie_id|               title|              genres|year|
+--------+--------------------+--------------------+----+
|       1|    Toy Story (1995)|Animation|Childre...|1995|
|       2|      Jumanji (1995)|Adventure|Childre...|1995|
|       3|Grumpier Old Men ...|      Comedy|Romance|1995|
|       4|Waiting to Exhale...|        Comedy|Drama|1995|
|       5|Father of the Bri...|              Comedy|1995|
|       6|         Heat (1995)|Action|Crime|Thri...|1995|
|       7|      Sabrina (1995)|      Comedy|Romance|1995|
|       8| Tom and Huck (1995)|Adventure|Children's|1995|
|       9| Sudden Death (1995)|              Action|1995|
|      10|    GoldenEye (1995)|Action|Adventure|...|1995|
|      11|American Presiden...|Comedy|Drama|Romance|1995|
|      12|Dracula: Dead and...|       Comedy|Horror|1995|
|      13|        Balto (1995)|Animation|Children's|1995|
|      14|        Nixon (1995)|               Drama|1995|
|      15|Cutt

156