In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType, DoubleType
import spacy
from numpy import dot
from numpy.linalg import norm

# Initialize Spark Session
spark = SparkSession.builder.appName("NetflixMovieSimilarity").getOrCreate()

# Load spaCy model; its document vectors are used for the similarity below
nlp = spacy.load("en_core_web_md")

# Load dataset
data = spark.read.csv("file:///home/ymusinguzi/Desktop/netflix_titles.csv", header=True, inferSchema=True)

# Get reference sentence for "Blood & Water"
# first() fetches at most one row and returns None when nothing matches, so a
# missing title (or a null/empty description) is reported with a clear message
# instead of an IndexError from indexing an empty collect() result.
ref_row = data.filter(col("title") == "Blood & Water").select("description").first()
if ref_row is None or not ref_row[0]:
    raise ValueError('No description found for title "Blood & Water" in the dataset')
ref_sent = ref_row[0]
ref_sent_vec = nlp(ref_sent).vector.tolist()

# UDF to vectorize text
def text_to_vector(text):
    """Embed ``text`` with the loaded spaCy pipeline.

    Returns the document vector as a plain Python list, or ``None`` when
    ``text`` is ``None`` or an empty string.
    """
    # Guard clause: nothing to embed for missing/empty input.
    if not text:
        return None
    doc = nlp(text)
    return doc.vector.tolist()

# Wrap the vectorizer as a Spark UDF returning an array of floats, then attach
# the embedding of each row's description as a new "vector" column.
vector_udf = udf(text_to_vector, returnType=ArrayType(FloatType()))
description_col = col("description")
data = data.withColumn("vector", vector_udf(description_col))

# UDF for cosine similarity
def cosine_similarity(vec, ref_vec=None):
    """Return the cosine similarity between ``vec`` and a reference vector.

    Args:
        vec: Sequence of numbers to compare, or ``None``/empty when the row
            has no vector.
        ref_vec: Optional reference vector. When omitted, the module-level
            ``ref_sent_vec`` is used, so existing one-argument calls (and the
            Spark UDF built from this function) behave as before.

    Returns:
        The cosine similarity as a Python ``float``. Returns ``0.0`` when
        ``vec`` is missing/empty or when either vector has zero length.
    """
    if not vec:
        return 0.0
    reference = ref_sent_vec if ref_vec is None else ref_vec
    denominator = norm(reference) * norm(vec)
    # A zero-length vector would make the ratio 0/0 = NaN; NaN would then be
    # ranked ahead of every real score by a descending sort, so treat it as
    # "no similarity" instead.
    if denominator == 0:
        return 0.0
    return float(dot(reference, vec) / denominator)

cosine_sim_udf = udf(cosine_similarity, DoubleType())
data = data.withColumn("similarity", cosine_sim_udf(col("vector")))

# Get top 5 similar movies (excluding "Blood & Water" itself)
# The result is used by two actions below (show and collect). Without cache()
# each action would re-run the whole lineage, including both Python UDFs over
# every row; caching keeps the 5-row result after the first action.
top_5 = (
    data.filter(col("title") != "Blood & Water")
    .orderBy(col("similarity").desc())
    .limit(5)
    .cache()
)

# Show results
top_5.select("title", "description", "similarity").show(truncate=False)

# Print formatted output
for row in top_5.collect():
    print(f'Title: {row["title"]}\nDescription: {row["description"]}\nSimilarity Score: {row["similarity"]:.2f}\n')

25/04/01 01:25:41 WARN Utils: Your hostname, Ubuntu-Linux-YvonneMusinguzi resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/04/01 01:25:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/01 01:25:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/01 01:25:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|title             |description                                                                                                                                           |similarity        |
+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|Virgin River      |Searching for a fresh start, a nurse practitioner moves from LA to a remote northern California town and is surprised by what – and who – she finds.  |0.9436762847312361|
|A Perfect Ending  |After confessing an unusual secret, a repressed wife – prompted by her friends – decides to explore her sexuality with a high-priced call girl.       |0.9423923543182853|
|A Champion Heart  |When a grieving teen must

                                                                                

Title: Virgin River
Description: Searching for a fresh start, a nurse practitioner moves from LA to a remote northern California town and is surprised by what – and who – she finds.
Similarity Score: 0.94

Title: A Perfect Ending
Description: After confessing an unusual secret, a repressed wife – prompted by her friends – decides to explore her sexuality with a high-priced call girl.
Similarity Score: 0.94

Title: A Champion Heart
Description: When a grieving teen must work off her debt to a ranch, she cares for a wounded horse that teaches her more about healing than she expected.
Similarity Score: 0.94

Title: Kuch Kuch Hota Hai
Description: Per her mother’s last wish, a girl sets out to reunite her father with the college best friend who loved him – only to discover the woman is engaged.
Similarity Score: 0.94

Title: The Kite
Description: In an occupied village, a teen girl is set to wed a stranger. But when she crosses over to meet her betrothed, her heart gets entangled at the bo

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string

# Download NLTK resources (if you haven't already)
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load the Netflix titles CSV
df = pd.read_csv('C:/Users/user/OneDrive - Coventry University/Desktop/Big Data/netflix_titles.csv')

# Check the first few rows of your data
print(df.head())

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Tokenize the descriptions column and remove stop words & punctuation.
# fillna('') turns a missing description into an empty token list instead of
# failing on x.lower() for a NaN value.
df['words'] = df['description'].fillna('').apply(lambda x: word_tokenize(x.lower()))
df['words'] = df['words'].apply(lambda x: [word for word in x if word.isalpha() and word not in stop_words])

# Flatten the list of words and count frequency
words = [word for sublist in df['words'] for word in sublist]
word_counts = Counter(words)

# Create a new DataFrame from the word counts (rows are in first-seen order)
word_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency'])

# Save the new CSV file with word frequencies
word_df.to_csv('word_frequencies.csv', index=False)

# Display the top 10 most frequent words.
# head(10) alone would only show the first ten words encountered, so select
# the ten rows with the largest Frequency instead.
print(word_df.nlargest(10, 'Frequency'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [7]:
import pandas as pd
from collections import Counter
import re

# Load your dataset
df = pd.read_csv("C:/Users/user/OneDrive - Coventry University/Desktop/Big Data/netflix_titles.csv")

# Check for missing values in the 'cast' column
print("Missing values in 'cast' column:", df['cast'].isnull().sum())

# Remove rows with missing or null values in the 'cast' column
df = df.dropna(subset=['cast'])

# Also remove rows where 'cast' is an empty/whitespace-only string.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the previous frame.
df = df[df['cast'].str.strip().ne('')].copy()

# Check the first few rows after cleaning
print(df.head())

# Clean the 'cast' column: drop every character that is not a word character,
# whitespace, or a comma (commas are kept because they separate the names),
# then split each entry into a list of names on the commas.
df['cast'] = df['cast'].str.replace(r'[^\w\s,]', '', regex=True)
df['cast'] = df['cast'].str.split(',')

# Flatten the list of actors into a single list, trimming surrounding spaces
# and skipping empty names left behind by stray or trailing commas
actors = [
    name
    for sublist in df['cast']
    for name in (actor.strip() for actor in sublist)
    if name
]

# Count the frequency of each actor
actor_counts = Counter(actors)

# Get the top 10 actors by frequency
top_actors = actor_counts.most_common(10)

# Convert the top actors into a DataFrame
top_actors_df = pd.DataFrame(top_actors, columns=['Actor', 'Frequency'])

# Show the top actors DataFrame
print(top_actors_df)

# Optionally, save the result to a CSV
top_actors_df.to_csv("top_actors.csv", index=False)


Missing values in 'cast' column: 825
  show_id     type                             title  \
1      s2  TV Show                     Blood & Water   
2      s3  TV Show                         Ganglands   
4      s5  TV Show                      Kota Factory   
5      s6  TV Show                     Midnight Mass   
6      s7    Movie  My Little Pony: A New Generation   

                        director  \
1                            NaN   
2                Julien Leclercq   
4                            NaN   
5                  Mike Flanagan   
6  Robert Cullen, José Luis Ucha   

                                                cast       country  \
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...  South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...           NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...         India   
5  Kate Siegel, Zach Gilford, Hamish Linklater, H...           NaN   
6  Vanessa Hudgens, Kimiko Glenn, James Marsden, ...       