## Importing all libraries

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, desc, avg, max, min, split, length, collect_list
import os

## Creating spark session

In [12]:
def create_spark_session():
    return SparkSession.builder \
        .appName("Netflix Dataset EDA") \
        .getOrCreate()

## Loading Dataset

In [13]:
def load_netflix_dataset(spark, file_path):
    return spark.read.csv(file_path, header=True, inferSchema=True)

## Exploring Dataset

In [14]:
def explore_data(df):
    print("1. Dataset Overview:")
    print("Columns:", df.columns)
    print("Record Count:", df.count())
    df.describe().show()

    print("\n2. Top 10 Directors by Number of Titles:")
    directors = df.select("director") \
        .groupBy("director") \
        .count() \
        .orderBy(F.desc("count")) \
        .limit(10)
    directors.show()

    print("\n3. Average Release Year by Content Type:")
    release_year_stats = df.groupBy("type") \
        .agg(
            F.mean("release_year").alias("average_release_year")
        ).orderBy("type")
    release_year_stats.show()

    print("\n4. Duration Analysis by Content Type:")
    durations = df.withColumn("numeric_duration", 
                              F.split(F.col("duration"), " ")[0].cast("int")) \
        .groupBy("type") \
        .agg(
            F.mean("numeric_duration").alias("average_duration"),
            F.max("numeric_duration").alias("max_duration"),
            F.min("numeric_duration").alias("min_duration")
        )
    durations.show()

    print("\n5. Countries with Most Genres:")
    genre_counts = df.groupBy("country") \
        .agg(F.countDistinct("listed_in").alias("unique_genres")) \
        .orderBy(F.desc("unique_genres")) \
        .limit(10)
    genre_counts.show()

    print("\n6. Titles with the Most Characters:")
    longest_titles = df.select("title", F.length("title").alias("title_length")) \
        .orderBy(F.desc("title_length")) \
        .limit(10)
    longest_titles.show()

    print("\n7. Ratings Distribution:")
    ratings_distribution = df.groupBy("rating") \
        .count() \
        .orderBy(F.desc("count"))
    ratings_distribution.show()

In [15]:
spark = create_spark_session()

## Displaying Datasets

In [16]:

dataset_path = 'netflix_titles.csv' 

netflix_df = load_netflix_dataset(spark, dataset_path)

perform_eda(netflix_df)

1. Basic Dataset Information:
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)


Total number of records: 8809

2. Top Directors with the Most Titles:
+--------------------+-----------+
|            director|title_count|
+--------------------+-----------+
|                NULL|       2636|
|       Rajiv Chilaka|         19|
|Raúl Campos, Jan ...|         18|
|        Marcus Raboy|         16|
|         Suhas Kadav|         16|
|           Jay Karas|         14|
| Cathy Garcia-Molina|         13|
|     Youssef Chahine|         12|
|     Martin Scorsese|         12

In [17]:
spark.stop()