In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

In [2]:
genome_score = pd.read_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/genome-scores.csv")
genome_tags = pd.read_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/genome-tags.csv")
links = pd.read_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/links.csv")
movies = pd.read_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/movies.csv")
ratings = pd.read_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/ratings.csv")
tags = pd.read_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/tags.csv")


In [3]:
genome_score.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [4]:
genome_tags.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [5]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [8]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [9]:
# Sample a subset of the data (e.g., 100,000 rows)
ratings_sample = ratings.sample(n=100000, random_state=42)

# Create the user-item matrix from the sample
user_item_matrix = ratings_sample.pivot(index='userId', columns='movieId', values='rating').fillna(0)
# we create this to she the userid, movies id and rating for each value given by the user

In [10]:
# Merge genome scores with genome tags to get tag names
genome_scores_with_tags = pd.merge(genome_score, genome_tags, on='tagId')
#we merge it to get movieid,tagid,revelence and tag

# Aggregate relevance scores by movieId and tag
movie_genome_scores = genome_scores_with_tags.groupby(['movieId', 'tag']).agg({'relevance': 'mean'}).reset_index()
#here we get movieid, tag and relevance. relevance and tags is shown for each movieid

# Pivot the table to create a movie x tag matrix
movie_tag_matrix = movie_genome_scores.pivot(index='movieId', columns='tag', values='relevance').fillna(0)
#this will show how much movie id has relevance for the tags

In [11]:
# Normalize the tag relevance scores
scaler = StandardScaler()
movie_tag_matrix_scaled = pd.DataFrame(scaler.fit_transform(movie_tag_matrix), index=movie_tag_matrix.index, columns=movie_tag_matrix.columns)
#we normalize here so as to get proper distribuation for each component 

In [12]:
movie_tag_matrix_scaled.head()

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.299067,-0.336026,-0.107075,0.119677,0.062468,1.281507,0.065383,0.797288,1.072656,-0.385918,...,-0.411134,-0.340451,-0.066096,-0.421464,-0.981798,-0.575122,-0.306067,-0.064619,-0.318061,-0.088731
2,-0.088351,-0.029456,-0.290811,-0.414958,-0.184093,-0.18998,-0.159678,-0.386521,-0.088187,-0.507606,...,-0.393369,-0.324225,-0.256248,-0.529755,-0.977855,-0.795576,-0.357857,-0.221314,-0.199162,-0.141126
3,-0.03478,0.150398,-0.380382,-0.096919,-0.564067,-0.230713,-0.063223,0.195712,-0.44482,-0.441471,...,-0.395343,-0.22687,-0.186192,-0.402354,-0.741292,-0.630236,-0.337141,-0.186695,-0.134957,-0.144208
4,-0.124066,-0.098945,-0.299998,-0.601395,-0.467807,-0.515845,-0.44612,-0.562545,-0.430555,-0.404436,...,-0.269011,-0.136006,-0.219552,-0.332283,-0.506701,-0.369171,-0.364763,-0.21767,-0.444093,-0.212013
5,-0.056208,0.117697,-0.093295,-0.538335,-0.420521,0.347189,-0.38474,-0.57802,-0.585691,-0.420308,...,-0.387447,-0.207399,-0.227892,-0.50746,-0.841831,-0.815881,-0.344046,-0.206737,-0.035083,-0.14729


In [13]:
# Matrix Factorization using SVD
svd = TruncatedSVD(n_components=50, random_state=52)
latent_factors = svd.fit_transform(user_item_matrix)
Vt = svd.components_
#we are creating SVD model here. 

In [14]:
# Step 4: Reconstruct the User-Item Matrix (Predicted Ratings)

# Reconstruct the matrix using the latent factors
predicted_ratings = np.dot(latent_factors, Vt)

# Convert it back to a DataFrame with the same shape as the original user-item matrix
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)


In [15]:
# Compute cosine similarity between movies based on the tag features
content_based_similarity = cosine_similarity(movie_tag_matrix_scaled, movie_tag_matrix_scaled)
content_based_similarity_df = pd.DataFrame(content_based_similarity, index=movie_tag_matrix.index, columns=movie_tag_matrix.index)


In [16]:
# Hybrid Recommendation Function
def hybrid_recommendation(user_id, top_n=10, alpha=0.5):
    # Get collaborative filtering predictions
    user_predictions = predicted_ratings_df.loc[user_id]
    
    # Sort movies by collaborative filtering score
    top_movies_cf = user_predictions.sort_values(ascending=False).index.tolist()
    
    # Get content-based scores for these movies
    movie_ids = [movie_id for movie_id in top_movies_cf if movie_id in content_based_similarity_df.index]
    movie_scores = {movie_id: np.mean(content_based_similarity_df[movie_id].loc[movie_ids]) for movie_id in movie_ids}
# Combine collaborative filtering and content-based scores
    combined_scores = {}
    for movie_id in movie_ids:
        cf_score = user_predictions[movie_id]
        content_score = movie_scores[movie_id]
        combined_scores[movie_id] = alpha * cf_score + (1 - alpha) * content_score
# Sort movies by combined score and return top N
    recommended_movie_ids = sorted(combined_scores, key=combined_scores.get, reverse=True)[:top_n]
    
    # Return movie titles
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)][['movieId', 'title']]
    
    return recommended_movies        

In [17]:
# Example: Get top 10 recommendations for user with ID 1
user_id = 1
top_n_recommendations = hybrid_recommendation(user_id, top_n=10, alpha=0.5)

print(f"Top 10 Movie Recommendations for User {user_id}:")
for idx, movie in enumerate(top_n_recommendations.itertuples(), 1):
    print(f"{idx}. {movie.title}")

Top 10 Movie Recommendations for User 1:
1. Pretty Woman (1990)
2. Mission: Impossible (1996)
3. Rock, The (1996)
4. Aliens (1986)
5. Goodfellas (1990)
6. Godfather: Part II, The (1974)
7. Terminator, The (1984)
8. Good Will Hunting (1997)
9. Shrek (2001)
10. Lord of the Rings: The Return of the King, The (2003)


In [18]:
genome_score.to_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/genome-scores.csv")
genome_tags.to_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/genome-tags.csv")
links.to_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/links.csv")
movies.to_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/movies.csv")
ratings.to_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/ratings.csv")
tags.to_csv("C:/Users/User/Desktop/PROJECTS/Recommedation/ml-20m/tags.csv")
