### Alert !!!! dont forget to trim the `string` column before comparing (filtering) as whitespaces is present in both sides .

# All installs

In [78]:
# !pip freeze > requirements.txt

# All imports and setup

In [79]:
from pyspark.sql import SparkSession , functions as F

spark = SparkSession.builder.appName('Netflix')\
    .config('spark.driver.extraClassPath', '/usr/lib/jvm/java-17-openjdk-amd64/lib/postgresql-42.5.0.jar')\
    .getOrCreate()


In [80]:

# from pyspark.sql.types import StructType , StructField , StringType , IntegerType , DateType , DoubleType , FloatType


In [81]:
# # define cusatom schema for      ('Data/netflix_titles.json') date added =August 14, 2020 

# schema=StructType([StructField('show_id',StringType(),True),StructField('type',StringType(),True),StructField('title',StringType(),True),StructField('director',StringType(),True),StructField('cast',StringType(),True),StructField('country',StringType(),True),StructField('date_added',StringType(),True),StructField('release_year',StringType(),True),StructField('rating',StringType(),True),StructField('duration',StringType(),True),StructField('listed_in',StringType(),True),StructField('description',StringType(),True)])


In [82]:

# Netflix_Titles = spark.read.json('Data/netflix_titles.json') 
Netflix_Titles = spark.read.json('Data/netflix_titles.json')

Netflix_Titles.show(5)

+---------------+--------------------+-------------+-----------------+--------------------+-----------------+---------+--------------------+------+------------+-------+-----+-------+
|_corrupt_record|                cast|      country|       date_added|         description|         director| duration|           listed_in|rating|release_year|show_id|title|   type|
+---------------+--------------------+-------------+-----------------+--------------------+-----------------+---------+--------------------+------+------------+-------+-----+-------+
|           null|João Miguel, Bian...|       Brazil|  August 14, 2020|In a future where...|                 |4 Seasons|International TV ...| TV-MA|        2020|     s1|   3%|TV Show|
|           null|Demián Bichir, Hé...|       Mexico|December 23, 2016|After a devastati...|Jorge Michel Grau|   93 min|Dramas, Internati...| TV-MA|        2016|     s2| 7:19|  Movie|
|           null|Tedd Chan, Stella...|    Singapore|December 20, 2018|When an army re

In [83]:

Netflix_Titles.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- description: string (nullable = true)
 |-- director: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- show_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)



# 1. How many PG-13 titles are there?

In [84]:
# rating column  contains Rating for films and one of them is PG-13
PG_13= Netflix_Titles.select('title','rating').filter(Netflix_Titles.rating=='PG-13')

PG_13.show(5)

print('count of PG-13 rating',PG_13.count())

############ CSV ############
PG_13.coalesce(1).write.csv('Output/Netflix/PG_13',header=True,mode='overwrite')

# ############ Postgres ############
PG_13.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver',  dbtable='PG_13', user='amrit', password='1234').mode('overwrite').save()
# mode(overwrite) drop old table from database and creates a new table 'directors_count_no_empty'
#save() is action and it will execute the query

+-----------+------+
|      title|rating|
+-----------+------+
|          9| PG-13|
|         21| PG-13|
|   Æon Flux| PG-13|
|10,000 B.C.| PG-13|
|  16 Blocks| PG-13|
+-----------+------+
only showing top 5 rows

count of PG-13 rating 386


# 2. How many titles an actor or actress appeared in?


In [85]:

Only_cast= Netflix_Titles.select(Netflix_Titles['cast']).withColumn('cast', F.explode(F.split('cast', ','))) # new data frame after  split items in  cast column and then  expand/explode items in cast columns into rows  

Only_cast=Only_cast.withColumn('cast',F.trim(Only_cast['cast']))

cast_count= Only_cast.groupBy('cast').count().orderBy('count', ascending=False) # groupby 'cast' and count the number of times each actor appeared in the data frame

cast_count.show(5) # count empty values too 

cast_count_no_empty= cast_count.filter(cast_count['cast'] != '') #  empty values in cast column are removed 

cast_count_no_empty.show(5)  # empty values  not counted

# # oneliner
# Netflix_Titles.select(Netflix_Titles['cast']).withColumn('cast', F.explode(F.split('cast', ','))).groupBy('cast').count().orderBy('count', ascending=False).show(5)  # count empty values too 


############ CSV ############
cast_count_no_empty.coalesce(1).write.csv('Output/Netflix/Cast_count',header=True,mode='overwrite')

# # ############ Postgres ############
cast_count_no_empty.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver',  dbtable='Cast_count', user='amrit', password='1234').mode('overwrite').save()

+----------------+-----+
|            cast|count|
+----------------+-----+
|                |  716|
|     Anupam Kher|   41|
|  Shah Rukh Khan|   35|
|Naseeruddin Shah|   30|
|         Om Puri|   30|
+----------------+-----+
only showing top 5 rows

+----------------+-----+
|            cast|count|
+----------------+-----+
|     Anupam Kher|   41|
|  Shah Rukh Khan|   35|
|         Om Puri|   30|
|Naseeruddin Shah|   30|
|    Akshay Kumar|   29|
+----------------+-----+
only showing top 5 rows



# 3. How many titles has a director has filmed?

In [86]:
#since multiple directors are separated by comma, we need to split them and then explode them into rows
Only_directors= Netflix_Titles.select(Netflix_Titles['director']).withColumn('director', F.explode(F.split('director', ','))) 

Only_directors=Only_directors.withColumn('director',F.trim(Only_directors['director']))

directors_count= Only_directors.groupBy('director').count().orderBy('count', ascending=False) 

# directors_count.show(5) 

directors_count_no_empty= directors_count.filter(directors_count['director'] != '') #  empty values in director column are removed

directors_count_no_empty.show(5)  # empty values  not counted
directors_count_no_empty.count() 


# # oneliner
# Netflix_Titles.select(Netflix_Titles['director']).withColumn('director', F.explode(F.split('director', ','))).groupBy('director').count().orderBy('count', ascending=False).show(5)  # count empty values too


############ CSV ############
directors_count_no_empty.coalesce(1).write.csv('Output/Netflix/Directors_count',header=True,mode='overwrite')

# # ############ Postgres ############
directors_count_no_empty.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver',  dbtable='Directors_count', user='amrit', password='1234').mode('overwrite').save()

+-------------------+-----+
|           director|count|
+-------------------+-----+
|          Jan Suter|   21|
|        Raúl Campos|   19|
|       Marcus Raboy|   16|
|          Jay Karas|   15|
|Cathy Garcia-Molina|   13|
+-------------------+-----+
only showing top 5 rows



                                                                                

# 4. What content is available in different countries?


In [87]:
Only_genre_country= Netflix_Titles.select(Netflix_Titles['country'], Netflix_Titles['listed_in']).withColumn('country', F.explode(F.split('country', ','))).withColumn('listed_in', F.explode(F.split('listed_in', ',')))  # here  both country aand listed_in needed to be exploded 

Only_genre_country.show(5)


# trim both country and listed_in columns
Only_genre_country=Only_genre_country.withColumn('country',F.trim(Only_genre_country['country']))
Only_genre_country=Only_genre_country.withColumn('listed_in',F.trim(Only_genre_country['listed_in']))

# remove empty values in  listed_in  and country column
Only_genre_country= Only_genre_country.filter(Only_genre_country['listed_in'] != '')
Only_genre_country= Only_genre_country.filter(Only_genre_country['country'] != '')

country_genre_combined=Only_genre_country.groupBy('country').agg(F.concat_ws(',', F.collect_set('listed_in')).alias('Genre')).sort('country') # groupby country and then combine all the genres in a single column

country_genre_combined.show(5)

############ CSV ############
country_genre_combined.coalesce(1).write.csv('Output/Netflix/Country_genre_combined',header=True,mode='overwrite')

# # ############ Postgres ############
country_genre_combined.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver',  dbtable='Country_genre_combined', user='amrit', password='1234').mode('overwrite').save()


+-------+--------------------+
|country|           listed_in|
+-------+--------------------+
| Brazil|International TV ...|
| Brazil|           TV Dramas|
| Brazil| TV Sci-Fi & Fantasy|
| Mexico|              Dramas|
| Mexico| International Mo...|
+-------+--------------------+
only showing top 5 rows

+-----------+--------------------+
|    country|               Genre|
+-----------+--------------------+
|Afghanistan|Documentaries,Int...|
|    Albania|Dramas,Internatio...|
|    Algeria|Classic Movies,Dr...|
|     Angola|Action & Adventur...|
|  Argentina|Crime TV Shows,Do...|
+-----------+--------------------+
only showing top 5 rows



In [88]:
# access posstgres table using pyspark and verify the data
country_genre_combined1234=spark.read.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver', dbtable='country_genre_combined', user='amrit', password='1234').load()
country_genre_combined1234.show()


+--------------+--------------------+
|       country|               Genre|
+--------------+--------------------+
|   Afghanistan|Documentaries,Int...|
|       Albania|Dramas,Internatio...|
|       Algeria|Classic Movies,Dr...|
|        Angola|Action & Adventur...|
|     Argentina|Crime TV Shows,Do...|
|       Armenia|Documentaries,Int...|
|     Australia|Crime TV Shows,Do...|
|       Austria|Crime TV Shows,Do...|
|    Azerbaijan|International TV ...|
|       Bahamas|  Action & Adventure|
|    Bangladesh|Independent Movie...|
|       Belarus|British TV Shows,...|
|       Belgium|Crime TV Shows,Do...|
|       Bermuda|       Documentaries|
|      Botswana|       Documentaries|
|        Brazil|Faith & Spiritual...|
|      Bulgaria|Horror Movies,Chi...|
|      Cambodia|Children & Family...|
|        Canada|Crime TV Shows,Do...|
|Cayman Islands|Children & Family...|
+--------------+--------------------+
only showing top 20 rows



# 5. How many movies were released in 2008?


In [89]:
movies_2008=Netflix_Titles.select(Netflix_Titles['type'],Netflix_Titles['title'],Netflix_Titles['release_year']).filter((Netflix_Titles['release_year'] == '2008') &( Netflix_Titles['type'] == 'Movie'))

movies_2008.show(5)

print('count:  Movies released in 2008:  ', movies_2008.count())

############ CSV ############
movies_2008.coalesce(1).write.csv('Output/Netflix/Movies_2008',header=True,mode='overwrite')

# # ############ Postgres ############
movies_2008.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver',  dbtable='Movies_2008', user='amrit', password='1234').mode('overwrite').save()


+-----+-------------------+------------+
| type|              title|release_year|
+-----+-------------------+------------+
|Movie|                 21|        2008|
|Movie|               1920|        2008|
|Movie|        10,000 B.C.|        2008|
|Movie|   2 Alone in Paris|        2008|
|Movie|A Very Special Love|        2008|
+-----+-------------------+------------+
only showing top 5 rows

count:  Movies released in 2008:   102


# 6. List all the movies whose duration is greater than 100 mins ?


In [90]:
#  remove min from duration column and convert(cast) to integer
Netflix_Titles=Netflix_Titles.withColumn('duration', F.regexp_replace('duration', 'min', '')).withColumn('duration', F.col('duration').cast('int'))


##### Below code will result in error  because there  duration column  contain  min and season as well so.. as we  only replace min with empty string, season will remain as it is and during conversion  it will result in error #############

# def remove_min(duration):
#     return int(duration.replace('min', ''))
# remove_min_udf= F.udf(remove_min, IntegerType())

# Netflix_Titles=Netflix_Titles.withColumn('duration', remove_min_udf('duration'))

movies_100=Netflix_Titles.select(Netflix_Titles['type'],Netflix_Titles['title'],Netflix_Titles['duration']).filter((Netflix_Titles['duration'] > 100) &( Netflix_Titles['type'] == 'Movie')) .sort('duration', ascending=True)

movies_100.show(5)

print('count: movies with duration more than 100 :' ,movies_100.count())


############ CSV ############
movies_100.coalesce(1).write.csv('Output/Netflix/Movies_100',header=True,mode='overwrite')

# # ############ Postgres ############
movies_100.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver',  dbtable='Movies_100', user='amrit', password='1234').mode('overwrite').save()



+-----+--------------------+--------+
| type|               title|duration|
+-----+--------------------+--------+
|Movie|     Asphalt Burning|     101|
|Movie|Zack and Miri Mak...|     101|
|Movie|              A Choo|     101|
|Movie|XXX: State of the...|     101|
|Movie|           Woodshock|     101|
+-----+--------------------+--------+
only showing top 5 rows

count: movies with duration more than 100 : 2430


# 7. List movies played by “Kareena Kapoor” ?

In [91]:
title_cast=Netflix_Titles.select(Netflix_Titles['title'],Netflix_Titles['cast']).withColumn('cast', F.explode(F.split('cast', ',')))

# remove empty values in cast column
title_cast=title_cast.filter(title_cast['cast'] != '')

# trim white spaces in cast column  as some values have white spaces and shows inaccuracy in results 
title_cast=title_cast.withColumn('cast', F.trim(title_cast['cast']))

Kareena_kapoor=title_cast.filter(title_cast['cast'] == 'Kareena Kapoor')

# Kareena_kapoor.show(5)

print(' Count :: Movies played by Kareena Kapoor are: ',Kareena_kapoor.count())

####### CSV file  ############
Kareena_kapoor.coalesce(1).write.csv('Output/Netflix/Kareena_kapoor', header=True, mode='overwrite')

############ Postgres ############
Kareena_kapoor.write.format('jdbc').options( url='jdbc:postgresql://127.0.0.1/Netflix_titles', driver='org.postgresql.Driver', dbtable='Kareena_kapoor', user='amrit', password='1234').mode('overwrite').save()


 Count :: Movies played by Kareena Kapoor are:  25
