In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import joblib  # Import joblib for model persistence

# Read the title-by-user ratings table from disk
df = pd.read_csv("movies_dataset.csv")

# Give every movie title an integer code that can serve as a matrix label
label_encoder = LabelEncoder()
df['Title_encoded'] = label_encoder.fit_transform(df['title'])

# Feature matrix: one row per movie, labelled by its integer code.
# NOTE(review): every column other than 'title' is used as a feature here —
# confirm the CSV does not also carry a saved index column.
user_movie_matrix = df.drop(columns=['title']).set_index('Title_encoded')

# Pairwise cosine similarity between all rows (Y defaults to X when omitted)
cosine_sim = cosine_similarity(user_movie_matrix)

# Label both axes of the similarity matrix with the movie codes
cosine_sim_df = pd.DataFrame(
    cosine_sim,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index,
)

# Persist the labelled similarity table for later reuse
joblib.dump(cosine_sim_df, 'cosine_similarity_model.joblib')

# Function to get movie recommendations based on user's input
def get_movie_recommendations(movie_title, cosine_sim_df, df, top_n=3):
    """Return the titles most similar to ``movie_title``, best match first.

    Parameters
    ----------
    movie_title : str
        Title to look up; compared for exact equality against ``df['title']``.
    cosine_sim_df : pandas.DataFrame
        Square similarity table whose index and columns are the values found
        in ``df['Title_encoded']``.
    df : pandas.DataFrame
        Frame with at least the columns ``'title'`` and ``'Title_encoded'``.
    top_n : int, default 3
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered by decreasing similarity. The query
        movie itself is never included.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``df['title']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    matches = df.loc[df['title'] == movie_title, 'Title_encoded']
    if matches.empty:
        # Same exception type the bare .iloc[0] lookup used to raise,
        # but with a message that names the missing title.
        raise IndexError(f"Movie title not found in dataset: {movie_title!r}")
    movie_index = matches.iloc[0]

    # Remove the query movie by label rather than assuming it sorts first:
    # another movie with similarity 1.0 could otherwise push it out of slot 0.
    scores = cosine_sim_df[movie_index].drop(labels=movie_index)

    # A stable sort keeps tied scores in their original (label) order.
    ranked_codes = scores.sort_values(ascending=False, kind='mergesort').index[:top_n]

    # Map codes back to titles through a lookup so the ranking order survives;
    # filtering df with isin() would return rows in df order instead.
    code_to_title = df.drop_duplicates('Title_encoded').set_index('Title_encoded')['title']
    return code_to_title.reindex(ranked_codes).dropna().tolist()

# Demo: look up recommendations for a single title and print them
input_movie = "12 Angry Men (1957)"
recommended_movies = get_movie_recommendations(
    movie_title=input_movie,
    cosine_sim_df=cosine_sim_df,
    df=df,
)
print(f"\nRecommended movies for '{input_movie}': {recommended_movies}")

# Read the persisted similarity table back from disk.
# joblib.load unpickles its input, so only point it at files you trust.
loaded_model = joblib.load('cosine_similarity_model.joblib')



Recommended movies for '12 Angry Men (1957)': ['Casablanca (1942)', "It's a Wonderful Life (1946)", "One Flew Over the Cuckoo's Nest (1975)"]


In [2]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,title,1,2,3,4,5,6,7,8,9,...,602,603,604,605,606,607,608,609,610,Title_encoded
0,10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0
1,12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5,2
3,28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0,3
4,300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0,4


In [3]:
import pandas as pd

# Start again from the raw CSV so the titles still contain their year
df = pd.read_csv("movies_dataset.csv")

# Pull the first parenthesised four-digit number out of each title into its
# own column (kept as text, exactly as captured by the regex)
df['Released Year'] = df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Strip every " (dddd)" group from the title text
df['title'] = df['title'].replace(regex=True, to_replace=r'\s\(\d{4}\)', value='')

# Write the cleaned table next to the original, without the row index
df.to_csv("preprocessed_movies_dataset.csv", index=False)


In [6]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,title,1,2,3,4,5,6,7,8,9,...,602,603,604,605,606,607,608,609,610,Released Year
0,10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1999
1,12 Angry Men,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1957
2,2001: A Space Odyssey,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5,1968
3,28 Days Later,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0,2002
4,300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0,2007
