In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys
import random

In [9]:
def sample_csv(input_file, output_file, sample_size=500, random_state=None, seed=42):
    """Draw a reproducible random sample of movies and save it as a two-column CSV.

    Reads ``input_file``, samples ``sample_size`` rows from its
    ``Series_Title``, ``Overview`` and ``Genre`` columns, and writes
    ``output_file`` with two columns: ``title`` (the series title) and
    ``overview`` (the overview text followed by ``' Genre: '`` and the genre).

    Parameters
    ----------
    input_file : str or path-like
        Source CSV; must contain ``Series_Title``, ``Overview`` and ``Genre``.
    output_file : str or path-like
        Destination CSV, written without the index column.
    sample_size : int, default 500
        Number of rows to draw (without replacement).
    random_state : optional
        Forwarded to ``DataFrame.sample`` when given; takes precedence over
        ``seed``.
    seed : int, default 42
        Used as the sampling seed when ``random_state`` is None, so the
        default call produces the same sample on every run.

    Raises
    ------
    KeyError
        If one of the three required columns is missing from ``input_file``.
    ValueError
        Raised by ``DataFrame.sample`` when ``sample_size`` exceeds the
        number of rows in ``input_file``.
    """
    # DataFrame.sample draws from NumPy's random machinery, not from the
    # stdlib ``random`` module, so calling ``random.seed`` never influenced
    # the sample. Route ``seed`` into ``random_state`` instead.
    effective_state = seed if random_state is None else random_state

    df = pd.read_csv(input_file)

    sampled_df = (
        df[['Series_Title', 'Overview', 'Genre']]
        .sample(n=sample_size, random_state=effective_state)
        .rename(columns={'Series_Title': 'title'})
    )
    # String concatenation propagates NaN: a row missing either field ends up
    # with a missing ``overview``.
    sampled_df['overview'] = sampled_df['Overview'] + ' Genre: ' + sampled_df['Genre']

    sampled_df[['title', 'overview']].to_csv(output_file, index=False)

    print(f"Sampled {sample_size} rows and saved to {output_file}")

In [10]:
# Build the sampled working set that load_data() reads by default.
sample_csv(input_file="data/imdb_top_1000.csv", output_file="data/movie_500.csv")

Sampled 500 rows and saved to data/movie_500.csv


In [21]:
def load_data(file_path='data/movie_500.csv'):
    """Read a CSV into a DataFrame and discard incomplete rows.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file's rows with every row containing a missing value removed.
        The surviving rows keep their original index labels.
    """
    movies = pd.read_csv(file_path)
    complete_rows = movies.dropna()
    return complete_rows

In [17]:
def vectorize_text(data):
    """Fit a TF-IDF model on ``data`` and return it with the encoded documents.

    Parameters
    ----------
    data : iterable of str
        The documents to fit on and transform.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf_matrix)``: the fitted ``TfidfVectorizer``
        (configured with English stop words) and the matrix produced by
        ``fit_transform`` on ``data``.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    document_matrix = tfidf.fit_transform(data)
    return tfidf, document_matrix

In [18]:
def recommend(user_input, df, vectorizer, tfidf_matrix, top_n=5):
    """Rank the rows of ``df`` by cosine similarity to a free-text query.

    Parameters
    ----------
    user_input : str
        The query text; it is encoded with ``vectorizer.transform``.
    df : pandas.DataFrame
        Rows to select from. They are addressed positionally (``iloc``), so
        row ``i`` of ``df`` must correspond to row ``i`` of ``tfidf_matrix``.
    vectorizer : object with a ``transform`` method
        Already-fitted vectorizer used to encode ``user_input``.
    tfidf_matrix : array-like or sparse matrix
        Encoded documents the query is compared against.
    top_n : int, default 5
        Maximum number of results. Values of zero or below yield an empty
        result; values above the number of documents return all of them.

    Returns
    -------
    tuple
        ``(rows, scores)``: the selected rows of ``df`` ordered from most to
        least similar, and the matching similarity scores.
    """
    user_vec = vectorizer.transform([user_input])
    similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Slicing with ``[-top_n:]`` turns ``top_n == 0`` into ``[-0:]``, i.e. the
    # whole array, and a negative ``top_n`` into an arbitrary tail slice.
    # Clamp first, then sort descending and truncate from the front.
    limit = max(top_n, 0)
    top_indices = similarities.argsort()[::-1][:limit]
    return df.iloc[top_indices], similarities[top_indices]

In [23]:
def main(user_input):
    """Print a numbered list of movie titles recommended for ``user_input``.

    Loads the movie table from ``load_data``'s default path, fits a TF-IDF
    model on its ``overview`` column, ranks the movies against the query with
    ``recommend`` (using its default result count), and prints the titles.

    Parameters
    ----------
    user_input : str
        Free-text description of what the user wants to watch. It is passed
        in directly; the function does not read command-line arguments.
    """
    movies = load_data()
    fitted_vectorizer, overview_matrix = vectorize_text(movies['overview'])

    # The similarity scores are returned alongside the rows but not displayed.
    picks, _scores = recommend(user_input, movies, fitted_vectorizer, overview_matrix)

    print("Top Recommendations:")
    for rank, movie_title in enumerate(picks['title'], start=1):
        print(f"{rank}. {movie_title}")

In [24]:
# Example query: print the recommended titles for a free-text request.
main(user_input="I like action movies set in space")

Top Recommendations:
1. Aliens
2. The Man Who Would Be King
3. Ghostbusters
4. Clerks
5. Blade Runner
