In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
spark = (SparkSession
    .builder
    .appName("Netflix Data Analysis")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
Netflix_Data = spark.read.options(header='True',inferSchema='True') \
  .csv("netflix-titles.csv")

In [4]:
Netflix_Data.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                null|       United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                null|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglan

In [4]:
# transfering the data frame to have a temporary table
Netflix_Data.createOrReplaceTempView("netflix_table")

In [6]:
## sorting the netflix data alphabitcally 
sorted_netflix = spark.sql("SELECT * FROM netflix_table ORDER BY title")


In [28]:
sorted_netflix_pandas = sorted_netflix.toPandas() ## transforing the sorted netflix data into pandas

In [29]:
sorted_netflix_pandas ## show the sorted netflix data 

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,"and probably will.""",,,,,,,,,,,
1,"Flying Fortress""",William Wyler,,United States,"March 31, 2017",1944,TV-PG,40 min,"Classic Movies, Documentaries",This documentary centers on the crew of the B-...,,
2,s6274,Movie,"""Behind """"The Cove"""": The Quiet Japanese Speak...",Keiko Yagi,,"Japan, United States","August 25, 2017",2015,TV-14,105 min,"Documentaries, International Movies",After a documentary about the Japanese whaling...
3,s6705,Movie,"""Escape from the """"Liberty"""" Cinema""",Wojciech Marczewski,"Janusz Gajos, Zbigniew Zamachowski, Teresa Mar...",Poland,"October 1, 2019",1990,TV-MA,88 min,"Comedies, Dramas, Independent Movies",Artistic rebellion ignites at the movies when ...
4,s4154,Movie,"""Gabriel """"Fluffy"""" Iglesias: One Show Fits All""",Manny Rodriguez,Gabriel Iglesias,,"January 29, 2019",2019,TV-14,91 min,Stand-Up Comedy,"""Gabriel """"Fluffy"""" Iglesias discusses his tee..."
...,...,...,...,...,...,...,...,...,...,...,...,...
8804,s6178,TV Show,忍者ハットリくん,,,Japan,"December 23, 2018",2012,TV-Y7,2 Seasons,"Anime Series, Kids' TV","Hailing from the mountains of Iga, Kanzo Hatto..."
8805,s4915,TV Show,海的儿子,,"Li Nanxing, Christopher Lee, Jesseca Liu, Appl...",,"April 27, 2018",2016,TV-14,1 Season,"International TV Shows, TV Dramas","Two brothers start a new life in Singapore, wh..."
8806,s7102,TV Show,마녀사냥,,"Si-kyung Sung, Se-yoon Yoo, Dong-yup Shin, Ji-...",South Korea,"February 19, 2018",2015,TV-MA,1 Season,"International TV Shows, Korean TV Shows, Stand...",Four Korean celebrity men and guest stars of b...
8807,s5023,Movie,반드시 잡는다,Hong-seon Kim,Baek Yoon-sik,South Korea,"February 28, 2018",2017,TV-MA,110 min,"Dramas, International Movies, Thrillers",After people in his town start turning up dead...


In [7]:
## since each show id has a distinct value, will count distinct by the show id released in 2018 

distinct_movies_2018 = spark.sql( """
    SELECT COUNT(DISTINCT show_id) AS distinct_movies_or_shows_in_2018
    FROM netflix_table
    WHERE release_year = 2018
""")


In [32]:
pandas_distinct_movies = distinct_movies_2018.toPandas() ## transfering to pandas 

In [26]:
pandas_distinct_movies

Unnamed: 0,distinct_movies_or_shows_in_2018
0,1145


In [16]:
## country that has produced most movies 

countries_produced_most_movies = spark.sql("""
    SELECT country, COUNT(*) AS movies_count_by_country
    FROM netflix_table
    GROUP BY country
    ORDER BY movies_count_by_country DESC
    LIMIT 1
""")


In [17]:
countries_produced_most_movies.show() ## here the united states produced most of the data 
## since eah row has a unique ID, no need to count by the distinct values 

+-------------+-----------------------+
|      country|movies_count_by_country|
+-------------+-----------------------+
|United States|                   2805|
+-------------+-----------------------+



In [33]:
### showing the number of movies that lasted 1 season 
TVshows_lasted_for_season=spark.sql("""SELECT COUNT (*) FROM netflix_table
                                    WHERE duration='1 Season' """)



In [34]:
TVshows_lasted_for_season.show() ## showing the number of movies lasted for 1 season 

+--------+
|count(1)|
+--------+
|    1791|
+--------+



In [35]:
## which year had the least number of tv shows produced 
year_which_have_lowest_producton = spark.sql("""
SELECT release_year
FROM netflix_table
WHERE type = 'TV Show'
GROUP BY release_year
ORDER BY COUNT(*) ASC
LIMIT 1
""")


In [12]:
year_which_have_lowest_producton.show()

+------------+
|release_year|
+------------+
|        1972|
+------------+



In [39]:
## when was the earliest release date for a movie in the dataset.

earliest_movie_released_year = spark.sql("""
    SELECT MIN(release_year) AS earliest_release_year
    FROM netflix_table
    WHERE type = 'Movie'
""")


In [40]:
earliest_movie_released_year.show()

+---------------------+
|earliest_release_year|
+---------------------+
|       Charles Rocket|
+---------------------+

