# All installs

In [1]:
# !pip freeze > requirements.txt

# All imports and setup

In [2]:
from pyspark.sql import SparkSession , functions as F

spark = SparkSession.builder.appName('Netflix')\
    .config('spark.driver.extraClassPath', '/usr/lib/jvm/java-17-openjdk-amd64/lib/postgresql-42.5.0.jar')\
    .getOrCreate()


22/10/19 21:12:09 WARN Utils: Your hostname, AMRIT resolves to a loopback address: 127.0.1.1; using 192.168.95.69 instead (on interface eth0)
22/10/19 21:12:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/19 21:12:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:

Netflix_Titles = spark.read.json('Data/netflix_titles.json')    


Netflix_Titles.show(5)

                                                                                

+---------------+--------------------+-------------+-----------------+--------------------+-----------------+---------+--------------------+------+------------+-------+-----+-------+
|_corrupt_record|                cast|      country|       date_added|         description|         director| duration|           listed_in|rating|release_year|show_id|title|   type|
+---------------+--------------------+-------------+-----------------+--------------------+-----------------+---------+--------------------+------+------------+-------+-----+-------+
|           null|João Miguel, Bian...|       Brazil|  August 14, 2020|In a future where...|                 |4 Seasons|International TV ...| TV-MA|        2020|     s1|   3%|TV Show|
|           null|Demián Bichir, Hé...|       Mexico|December 23, 2016|After a devastati...|Jorge Michel Grau|   93 min|Dramas, Internati...| TV-MA|        2016|     s2| 7:19|  Movie|
|           null|Tedd Chan, Stella...|    Singapore|December 20, 2018|When an army re

In [4]:

Netflix_Titles.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- description: string (nullable = true)
 |-- director: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- show_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)



# 1. How many PG-13 titles are there?

In [5]:
# rating column  contains Rating for films and one of them is PG-13
Netflix_Titles.filter(Netflix_Titles['rating'] == 'PG-13').count() # here filter is transformation and count is action

386

# 2. How many titles an actor or actress appeared in?


In [6]:

Only_cast= Netflix_Titles.select(Netflix_Titles['cast']).withColumn('cast', F.explode(F.split('cast', ','))) # new data frame after  split items in  cast column and then  expand/explode items in cast columns into rows  

cast_count= Only_cast.groupBy('cast').count().orderBy('count', ascending=False) # groupby 'cast' and count the number of times each actor appeared in the data frame

cast_count.show(5) # count empty values too 

cast_count_no_empty= cast_count.filter(cast_count['cast'] != '') #  empty values in cast column are removed 

cast_count_no_empty.show(5)  # empty values  not counted

# # oneliner
# Netflix_Titles.select(Netflix_Titles['cast']).withColumn('cast', F.explode(F.split('cast', ','))).groupBy('cast').count().orderBy('count', ascending=False).show(5)  # count empty values too 


 

+-----------------+-----+
|             cast|count|
+-----------------+-----+
|                 |  716|
|      Anupam Kher|   37|
| Takahiro Sakurai|   27|
|          Om Puri|   27|
|   Shah Rukh Khan|   27|
+-----------------+-----+
only showing top 5 rows

+-----------------+-----+
|             cast|count|
+-----------------+-----+
|      Anupam Kher|   37|
|          Om Puri|   27|
|   Shah Rukh Khan|   27|
| Takahiro Sakurai|   27|
|      Boman Irani|   25|
+-----------------+-----+
only showing top 5 rows



# 3. How many titles has a director has filmed?

In [7]:
#since multiple directors are separated by comma, we need to split them and then explode them into rows

Only_directors= Netflix_Titles.select(Netflix_Titles['director']).withColumn('director', F.explode(F.split('director', ','))) 

directors_count= Only_directors.groupBy('director').count().orderBy('count', ascending=False) 

# directors_count.show(5) 

directors_count_no_empty= directors_count.filter(directors_count['director'] != '') #  empty values in director column are removed

directors_count_no_empty.show(5)  # empty values  not counted



# # oneliner
# Netflix_Titles.select(Netflix_Titles['director']).withColumn('director', F.explode(F.split('director', ','))).groupBy('director').count().orderBy('count', ascending=False).show(5)  # count empty values too

+-------------------+-----+
|           director|count|
+-------------------+-----+
|        Raúl Campos|   18|
|          Jan Suter|   18|
|       Marcus Raboy|   16|
|          Jay Karas|   15|
|Cathy Garcia-Molina|   13|
+-------------------+-----+
only showing top 5 rows



In [8]:
# save result directors_count_no_empty to postgres database with driver

directors_count_no_empty.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1:5432/postgres', driver='org.postgresql.Driver', dbtable='directors_count_no_empty', user='amrit', password='4532').mode('overwrite').save()

# # https://jdbc.postgresql.org/documentation/head/connect.html
# directors_count_no_empty.write.jdbc(url='jdbc:postgresql://127.0.0.1:5432/postgres', table='directors_count', mode='overwrite', properties={'user': 'amrit', 'password': '4532'}


# 4. What content is available in different countries?


In [9]:
# similar to above split and explode  "listed_in" column 


# 5. How many movies were released in 2008?


# 6. List all the movies whose duration is greater than 100 mins ?


# 7. List movies played by “Kareena Kapoor” ?