In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
from fuzzywuzzy import process      # fuzzy matching to tolerate misspelled movie titles


# Function to clean movie titles
def clean_title(title):
    """Return ``title`` with all characters removed except ASCII letters, digits and spaces."""
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)


# Function to extract year from movie title
def extract_year(title):
    """Return the release year embedded in a movie title, or ``None``.

    A year is a four-digit number wrapped in parentheses, e.g. ``"Toy Story
    (1995)"``. When a title contains several such groups the last one is
    used, because the year is appended after the name. Parenthesised numbers
    that are not four digits long (``"(500) Days of Summer (2009)"``) are
    ignored instead of being mistaken for the year.

    Args:
        title: Raw movie title string.

    Returns:
        The year as an ``int``, or ``None`` if no parenthesised four-digit
        number is present.
    """
    years = re.findall(r"\(([0-9]{4})\)", title)
    return int(years[-1]) if years else None


# Function to search for movies
def search(title):
    """Return up to five movies whose cleaned titles best match ``title``.

    The query is normalised with ``clean_title``, vectorised with the
    module-level ``vectorizer`` and compared by cosine similarity against the
    module-level ``tfidf`` matrix.

    Args:
        title: Free-text movie title to look up.

    Returns:
        Rows of the module-level ``movies`` frame, ordered from most to least
        similar, so the first row is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never request more hits than there are movies: argpartition raises
    # when the partition index is out of bounds.
    top_k = min(5, similarity.size)
    if top_k == 0:
        return movies.iloc[[]]

    # argpartition only separates the top_k entries from the rest without
    # ordering them, so rank that small slice explicitly (best first).
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]


# Function to find similar movies based on a given movie ID
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who rated ``movie_id`` above 4.

    Reads the module-level ``ratings`` and ``movies`` frames. Steps:

    1. Collect the users who gave ``movie_id`` a rating greater than 4.
    2. For every movie those users also rated above 4, compute the share of
       those users who did so, keeping movies above a 10% share.
    3. Compute the same share among all users who rated any of the kept
       movies above 4.
    4. Score each movie as the ratio of the two shares and return the ten
       highest-scoring movies.

    Args:
        movie_id: Value of the ``movieId`` column identifying the seed movie.

    Returns:
        A DataFrame with columns ``score``, ``title`` and ``genres``; empty
        when no user qualifies or no movie clears the 10% threshold.
    """
    output_columns = ["score", "title", "genres"]

    # Only ratings strictly above 4 count as a "like" in every step below.
    liked = ratings[ratings["rating"] > 4]

    fans = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(fans) == 0:
        return pd.DataFrame(columns=output_columns)

    # Share of the seed movie's fans who liked each other movie.
    fan_likes = liked.loc[liked["userId"].isin(fans), "movieId"]
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .10]
    if fan_share.empty:
        return pd.DataFrame(columns=output_columns)

    # Share of everyone (who liked any candidate) that liked each candidate.
    crowd = liked[liked["movieId"].isin(fan_share.index)]
    crowd_share = crowd["movieId"].value_counts() / len(crowd["userId"].unique())

    table = pd.concat([fan_share, crowd_share], axis=1, keys=["similar", "all"])
    table["score"] = table["similar"] / table["all"]
    top = table.sort_values("score", ascending=False).head(10)

    return top.merge(movies, left_index=True, right_on="movieId")[output_columns]



# Load movies and ratings data.
# Everything below depends on both frames, so after printing the hint the
# error is re-raised; otherwise execution would continue and fail on the
# next statement with an unrelated NameError for `movies`.
try:
    movies = pd.read_csv("movies.csv")
    ratings = pd.read_csv("ratings.csv")
except FileNotFoundError:
    print("Error: Data files not found. Please make sure 'movies.csv' and 'ratings.csv' are present.")
    raise

    
    
# Preprocess movie titles: store a copy of each title passed through
# clean_title(); this column is what the TF-IDF model below is fitted on.
movies["clean_title"] = movies["title"].apply(clean_title)

# Extract year from movie titles via extract_year() (None when no match).
movies["year"] = movies["title"].apply(extract_year)

# Initialize TF-IDF vectorizer over unigrams and bigrams and fit it on the
# cleaned titles. `vectorizer` and `tfidf` are read as globals by search().
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

# Create widgets: a text box for the movie name and an output area that
# handle_movie_input_change() renders the recommendations into.
movie_input = widgets.Text(value='', description='Movie Name:', disabled=False)
recommendation_output = widgets.Output()



# Function to handle movie input change
def handle_movie_input_change(change):
    """Refresh the recommendation panel when the text box value changes.

    The typed text is fuzzy-matched against the known movie titles, the best
    matching title is looked up with ``search``, and the recommendations for
    the resulting movie are rendered into ``recommendation_output``. Nothing
    is displayed for blank input or when no movie can be resolved.

    Args:
        change: Change notification delivered by the widget observer; only
            its ``new`` attribute (the current text) is read.
    """
    with recommendation_output:
        clear_output(wait=True)
        title = change.new.strip()
        if not title:
            return

        # Use fuzzy string matching to tolerate misspelled titles.
        title_match = process.extractOne(title, movies["title"])
        if title_match is None:
            # extractOne can return None (e.g. no candidates to compare).
            return
        best_title = title_match[0]

        results = search(best_title)
        if results.empty:
            return

        # Prefer the row whose title is exactly the fuzzy match rather than
        # relying on the order of the search results; fall back to the
        # first row if that title is not among them.
        exact = results[results["title"] == best_title]
        chosen = exact.iloc[0] if not exact.empty else results.iloc[0]
        display(find_similar_movies(chosen["movieId"]))

                
                
# Listen for changes in the movie input field; names='value' limits the
# callback to changes of the text box's `value` trait.
movie_input.observe(handle_movie_input_change, names='value')

# Display widgets: the text box stacked above the recommendation output.
display(widgets.VBox([movie_input, recommendation_output]))
