<a href="https://colab.research.google.com/github/anshmehta26/lumaa-spring-2025-ai-ml/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored in Drive can be opened through ordinary file paths under that folder.

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Read the movie CSV stored on the mounted Drive into a pandas DataFrame (`df`)
# and report how many rows it contains.
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/500_netflix_movies.csv')

# len(df) is the number of rows in the DataFrame.
print("The database has " + str(len(df))+ " movies")

The database has 500 movies


In [14]:
import math
import string

# Basic Text Preprocessing

# Translation table that deletes every character in string.punctuation (ASCII
# punctuation only). Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """
    Lowercase the text, remove ASCII punctuation, and split on whitespace.

    Punctuation characters are deleted rather than replaced by a space, so
    "sci-fi" becomes the single token "scifi".

    Parameters:
        text (str): The text to preprocess. Missing values (None or a float
            NaN, which is what pandas uses for empty cells) are treated as
            empty text; any other non-string value is converted with str().

    Returns:
        list: A list of lowercase tokens (empty for empty or missing input).
    """
    # Missing values carry no tokens; previously these raised AttributeError.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []
    # Accept numbers and other scalars (e.g. a numeric title) instead of crashing.
    if not isinstance(text, str):
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE).split()

In [15]:
# Basic TF-IDF Vectorizer Implementation

class BasicTfidfVectorizer:
    """
    Minimal dense TF-IDF vectorizer built on preprocess_text().

    After fit():
        vocabulary_ maps each token seen in the corpus to a column index,
        idf_ maps each token to its smoothed inverse document frequency,
        N is the number of documents the model was fitted on.
    """

    def __init__(self):
        self.vocabulary_ = {}  # Maps token to index
        self.idf_ = {}         # Maps token to its IDF value
        self.N = 0             # Number of documents

    def fit(self, documents):
        """
        Build vocabulary and compute IDF values from the documents.

        Column indices are assigned in order of first appearance in the
        corpus, so the same corpus always produces the same vocabulary.

        Parameters:
            documents (iterable of str): The corpus of documents. Any
                iterable is accepted; it is materialised into a list once.
        """
        # Materialise so len() works and one-shot iterables are supported.
        documents = list(documents)
        self.N = len(documents)
        doc_freq = {}

        # Count document frequency for each token (unique tokens per document).
        # dict.fromkeys() de-duplicates while keeping first-seen order; iterating
        # a set of strings here would make the index assignment depend on
        # per-process string hashing and therefore differ between runs.
        for doc in documents:
            for token in dict.fromkeys(preprocess_text(doc)):
                doc_freq[token] = doc_freq.get(token, 0) + 1

        # Build vocabulary and compute IDF for each token
        self.vocabulary_ = {}
        self.idf_ = {}
        for index, (token, freq) in enumerate(doc_freq.items()):
            self.vocabulary_[token] = index
            # Using smoothing: idf = log((N + 1) / (df + 1)) + 1
            self.idf_[token] = math.log((self.N + 1) / (freq + 1)) + 1

    def transform(self, documents):
        """
        Transform documents into TF-IDF vectors.

        Tokens that are not in the fitted vocabulary are ignored. Term
        frequency is normalised by the count of the most frequent in-vocabulary
        token of the same document, then multiplied by the token's IDF.

        Parameters:
            documents (iterable of str): Documents to transform.

        Returns:
            list of list: One dense vector per document, each of length
            len(self.vocabulary_). A document with no known tokens maps to
            an all-zero vector.
        """
        width = len(self.vocabulary_)
        vectors = []
        for doc in documents:
            # Compute term frequency for each in-vocabulary token of the document
            tf = {}
            for token in preprocess_text(doc):
                if token in self.vocabulary_:
                    tf[token] = tf.get(token, 0) + 1
            # Normalize by the maximum frequency; 1 is a harmless placeholder
            # when the document contributed no known tokens.
            max_freq = max(tf.values()) if tf else 1
            vec = [0.0] * width
            for token, count in tf.items():
                vec[self.vocabulary_[token]] = (count / max_freq) * self.idf_[token]
            vectors.append(vec)
        return vectors

    def fit_transform(self, documents):
        """
        Fit to the documents and transform them into TF-IDF vectors.

        Parameters:
            documents (iterable of str): The corpus of documents.

        Returns:
            list of list: TF-IDF vectors for each document.
        """
        # The corpus is traversed twice (fit, then transform); materialise it so
        # a generator is not exhausted after the first pass.
        documents = list(documents)
        self.fit(documents)
        return self.transform(documents)


In [16]:
import numpy as np

#Cosine similarity function
def cosine_similarity_vec(vec1, vec2):
    """
    Return the cosine of the angle between two vectors.

    Parameters:
        vec1 (list or np.array): First vector.
        vec2 (list or np.array): Second vector.

    Returns:
        float: Dot product divided by the product of the two Euclidean
        norms, or 0.0 when either vector has zero norm.
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)

    # Numerator first, so incompatible shapes are reported by np.dot
    # before anything else is computed.
    inner = np.dot(a, b)

    # Euclidean (L2) lengths of both vectors
    len_a = np.linalg.norm(a)
    len_b = np.linalg.norm(b)

    # Only divide when both lengths are non-zero
    if len_a != 0 and len_b != 0:
        return inner / (len_a * len_b)

    return 0.0


In [17]:
# Recommendation Function

def recommend_items(query, vectorizer, tfidf_matrix, df, top_n=5):
    """
    Rank movies by cosine similarity between a free-text query and each
    movie's TF-IDF vector, and return the best matches.

    Parameters:
        query (str): User's text describing their preferences.
        vectorizer (BasicTfidfVectorizer): A fitted basic TF-IDF vectorizer.
        tfidf_matrix (list of list): TF-IDF vectors for all movie descriptions;
            position i is looked up as row i of df.
        df (pd.DataFrame): DataFrame containing the movies with a 'title' column.
        top_n (int): Number of top recommendations to return.

    Returns:
        list: A list of tuples (movie title, similarity score), highest
        score first.
    """
    # Embed the query in the same vector space as the descriptions
    query_vec = vectorizer.transform([query])[0]

    # Pair every row position with its similarity to the query
    scored = [
        (row, cosine_similarity_vec(query_vec, doc_vec))
        for row, doc_vec in enumerate(tfidf_matrix)
    ]

    # Highest similarity first; the sort is stable, so equal scores keep row order
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # Translate the leading row positions into titles
    best = ranked[:top_n]
    return [(df.iloc[row]['title'], similarity) for row, similarity in best]


In [18]:
# Example usage: load the dataset, fit the TF-IDF model on the movie
# descriptions, and print the top matches for a sample query.

# Read the movie CSV from the mounted Drive into a pandas DataFrame and print its row count
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/500_netflix_movies.csv')
print(f"Number of movies in dataset: {len(df)}")

# Fit the vectorizer on the 'description' column and vectorize every description.
# Missing descriptions are replaced with an empty string first.
descriptions = df['description'].fillna("").tolist()
vectorizer = BasicTfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(descriptions)

# Example free-text user query
query = "I love thrilling action movies set in space, with a comedic twist."

# Rank the movies against the query and keep the five best matches
recommendations = recommend_items(query, vectorizer, tfidf_matrix, df, top_n=5)

# Print each recommended title with its similarity score rounded to two decimals
print("Top Recommendations:")
for title, score in recommendations:
    print(f"{title} (Similarity Score: {score:.2f})")

Number of movies in dataset: 500
Top Recommendations:
A StoryBots Space Adventure (Similarity Score: 0.23)
2 Hearts (Similarity Score: 0.22)
Chhota Bheem: Bheem vs Aliens (Similarity Score: 0.13)
Boyka: Undisputed (Similarity Score: 0.13)
SAS: Rise of the Black Swan (Similarity Score: 0.13)


Salary Expectation: at least **40 dollars/hour**, which translates to **6900 (approx.)/month**. This can be negotiated if appropriate housing/transportation is provided. I am a student at Cornell University, so relocating from Ithaca will be a challenge; as long as there is appropriate compensation to help me move and start my job, the hourly wage is negotiable.