In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import re

In [0]:
# Fetch the service principal's client secret from the Databricks secret scope
# so that it never appears in the notebook source.
service_credential = dbutils.secrets.get(scope="<scope>", key="<service-credential-key>")

# OAuth client-credentials settings for ADLS Gen2 (ABFS), applied in order.
# NOTE(review): the "<storage-account>", "<application-id>" and "<directory-id>"
# placeholders must be replaced with real values; the storage account presumably
# has to match the one used in the abfss:// paths further down - confirm.
adls_oauth_settings = {
    "fs.azure.account.auth.type.<storage-account>.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.<storage-account>.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.<storage-account>.dfs.core.windows.net": "<application-id>",
    "fs.azure.account.oauth2.client.secret.<storage-account>.dfs.core.windows.net": service_credential,
    "fs.azure.account.oauth2.client.endpoint.<storage-account>.dfs.core.windows.net": "https://login.microsoftonline.com/<directory-id>/oauth2/token",
}

for setting_name, setting_value in adls_oauth_settings.items():
    spark.conf.set(setting_name, setting_value)

In [0]:
# Load the enriched movies CSV from the silver container into a DataFrame.
# The first line is treated as the header and column types are inferred.
#
# multiLine is enabled because `overview` is free text stored in quoted fields;
# if such a field contains a line break, a line-by-line parse splits the record
# into several bogus rows that have a value in the first column and nulls in
# most of the others.
# NOTE(review): if the silver file escapes embedded quotes by doubling them
# (RFC 4180 style) rather than with a backslash, also add
# .option("escape", "\"") - confirm against whatever wrote the file.
enriched_movies_df = spark.read.format('csv')\
        .option("header", True )\
        .option("inferSchema", True )\
        .option("multiLine", True )\
        .load('abfss://silver@movierecommendation2025.dfs.core.windows.net/enriched_movies.csv')

In [0]:
# Render the movies DataFrame with the Databricks notebook table viewer.
display(enriched_movies_df)

movieId,title,genres,year,overview,popularity,release_date,vote_average,vote_count
109,Headless Body in Topless Bar,Comedy|Drama|Thriller,1995,An ex-con holds a group of people hostage in a topless bar.,0.09335267857142858,1995-05-20,2.7,3
381,When a Man Loves a Woman,Drama|Romance,1994,"An airline pilot and his wife are forced to face the consequences of her alcoholism when her addictions threaten her life and their daughter's safety. While the woman enters detox, her husband must face the truth of his enabling behavior.",1.3476651785714286,1994-04-29,6.551,352
681,Coup de torchon,Crime,1981,"A pathetic police chief, humiliated by everyone around him, suddenly wants a clean slate in life, and resorts to drastic means to achieve it.",0.5157785714285714,1981-11-04,7.086,197
745,Wallace & Gromit: A Close Shave,Animation|Children|Comedy,1995,"Wallace's whirlwind romance with the proprietor of the local wool shop puts his head in a spin, and Gromit is framed for sheep-rustling in a fiendish criminal plot.",1.143230357142857,1996-03-07,7.6,849
1161,The Tin Drum,Drama|War,1979,"Oskar Matzerath is a very unusual boy. Refusing to leave the womb until promised a tin drum by his mother, Agnes, Oskar is reluctant to enter a world he sees as filled with hypocrisy and injustice, and vows on his third birthday to never grow up. Miraculously, he gets his wish. As the Nazis rise to power in Danzig, Oskar wills himself to remain a child, beating his tin drum incessantly and screaming in protest at the chaos surrounding him.",1.3796232142857143,1979-05-02,6.9,475
1381,Grease 2,Comedy|Musical|Romance,1982,"It's 1961, two years after the original Grease gang graduated, and there's a new crop of seniors and new members of the coolest cliques on campus, the Pink Ladies and T-Birds. Michael Carrington is the new kid in school - but he's been branded a brainiac. Can he fix up an old motorcycle, don a leather jacket, avoid a rumble with the leader of the T-Birds, and win the heart of Pink Lady Stephanie?",1.8334767857142857,1982-06-11,5.2,691
1410,The Evening Star,Comedy|Drama,1996,"Continuing the story of Aurora Greenway in her latter years. After the death of her daughter, Aurora struggled to keep her family together, but has one grandson in jail, a rebellious granddaughter, and another grandson living just above the poverty line.",0.38218035714285714,1996-12-25,5.9,68
1678,The Joy Luck Club,Drama|Romance,1993,"Through a series of flashbacks, four Chinese women born in America and their respective mothers born in feudal China explore their pasts.",0.9268732142857142,1993-09-08,7.126,147
2230,Always Tell Your Wife,Comedy,1914,A comedic short filmed directed by an uncredited Alfred Hitchcock about an affair.,0.12017678571428571,1923-02-10,5.7,3
2344,Runaway Train,Action|Adventure|Drama|Thriller,1985,A hardened convict and a younger prisoner escape from a brutal prison in the middle of winter only to find themselves on an out-of-control train with a female railway worker while being pursued by the vengeful head of security.,1.1610919642857145,1985-11-15,6.9,591


In [0]:
# Count missing values in each column of the freshly loaded movies DataFrame.
#
# A value is "missing" when it is NULL, or - for float/double columns only -
# when it is NaN. isnan() is defined for floating-point input; applying it to
# string or date columns relies on implicit casts, which Spark rejects for some
# types and which can raise at runtime when ANSI mode is enabled.
float_columns = {
    name for name, dtype in enriched_movies_df.dtypes if dtype in ("float", "double")
}

missing_values = enriched_movies_df.select(
    [
        count(
            when(
                (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in enriched_movies_df.columns
    ]
)

# Display the count of missing values (one row, one column per source column)
missing_values.show()

+-------+-----+------+----+--------+----------+------------+------------+----------+
|movieId|title|genres|year|overview|popularity|release_date|vote_average|vote_count|
+-------+-----+------+----+--------+----------+------------+------------+----------+
|      0|   32|    36|  40|    5678|       155|        5577|         191|       200|
+-------+-----+------+----+--------+----------+------------+------------+----------+



In [0]:
# Discard every row in which either `title` or `genres` is missing; the other
# columns are allowed to stay null at this stage.
enriched_movies_df = enriched_movies_df.dropna(how="any", subset=["title", "genres"])

In [0]:
# Re-count missing values per column after rows without title/genres were dropped.
#
# A value is "missing" when it is NULL, or - for float/double columns only -
# when it is NaN. isnan() is defined for floating-point input; applying it to
# string or date columns relies on implicit casts, which Spark rejects for some
# types and which can raise at runtime when ANSI mode is enabled.
float_columns = {
    name for name, dtype in enriched_movies_df.dtypes if dtype in ("float", "double")
}

missing_values = enriched_movies_df.select(
    [
        count(
            when(
                (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in enriched_movies_df.columns
    ]
)

# Display the count of missing values (one row, one column per source column)
missing_values.show()

+-------+-----+------+----+--------+----------+------------+------------+----------+
|movieId|title|genres|year|overview|popularity|release_date|vote_average|vote_count|
+-------+-----+------+----+--------+----------+------------+------------+----------+
|      0|    0|     0|   6|    5644|       119|        5541|         155|       164|
+-------+-----+------+----+--------+----------+------------+------------+----------+



In [0]:
# Replace missing values with per-column defaults: placeholder text for the
# overview and zeros for the numeric popularity / voting columns.
# `year` and `release_date` are not listed here and therefore keep their nulls.
enriched_movies_df = enriched_movies_df.na.fill(
    {
        "overview": "No overview available",
        "popularity": 0.0,
        "vote_average": 0.0,
        "vote_count": 0,
    }
)


In [0]:
# Re-count missing values per column after the default values were filled in.
#
# A value is "missing" when it is NULL, or - for float/double columns only -
# when it is NaN. isnan() is defined for floating-point input; applying it to
# string or date columns relies on implicit casts, which Spark rejects for some
# types and which can raise at runtime when ANSI mode is enabled.
float_columns = {
    name for name, dtype in enriched_movies_df.dtypes if dtype in ("float", "double")
}

missing_values = enriched_movies_df.select(
    [
        count(
            when(
                (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in enriched_movies_df.columns
    ]
)

# Display the count of missing values (one row, one column per source column)
missing_values.show()

+-------+-----+------+----+--------+----------+------------+------------+----------+
|movieId|title|genres|year|overview|popularity|release_date|vote_average|vote_count|
+-------+-----+------+----+--------+----------+------------+------------+----------+
|      0|    0|     0|   6|       0|         0|        5541|           0|         0|
+-------+-----+------+----+--------+----------+------------+------------+----------+



In [0]:
# Write the cleaned movies to the gold container as CSV, replacing any previous
# output. repartition(1) funnels all rows into one partition so a single part
# file is produced. The double quote is used both as the quote character and as
# the escape character for quotes embedded in a value.
(
    enriched_movies_df
    .repartition(1)
    .write
    .format('csv')
    .mode('overwrite')
    .options(header="true", quote="\"", escape="\"")
    .save("abfss://gold@movierecommendation2025.dfs.core.windows.net/movies")
)

In [0]:
# Read the ratings CSV data from the silver container; the first line of each
# file is the header and column types are inferred from the data.
ratings_df = spark.read.csv(
    'abfss://silver@movierecommendation2025.dfs.core.windows.net/ratings/ratings',
    header=True,
    inferSchema=True,
)

In [0]:
# Copy the ratings unchanged to the gold container as CSV with a header row,
# replacing any previous output. repartition(1) funnels all rows into one
# partition so a single part file is produced.
(
    ratings_df
    .repartition(1)
    .write
    .format('csv')
    .mode('overwrite')
    .options(header="true")
    .save("abfss://gold@movierecommendation2025.dfs.core.windows.net/ratings")
)