## To perform data tranformations from the given dataset suitable to train the model

In [43]:
import numpy as np
import pandas as pd
import pickle as pickle
from collections import defaultdict
import re

### Displaying the datasets that's relevant for the data prep

In [44]:
df_movies = pd.read_csv("data/movies.csv")
print(df_movies.head(3))

   movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  


In [45]:
df_ratings = pd.read_csv("data/ratings.csv")
print(df_ratings.head(5))

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


### Merging the two datasets movies.csv and ratings.csv to perform cleaning,analysis and transformations

In [46]:
movieId_ratings_userId_merged =  df_ratings.merge(df_movies, on = "movieId")

print(movieId_ratings_userId_merged.head(5))

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


### Check the number of ratings on the dataaset for each genre, so that  certain startegies can be employed for under represented genre.

In [47]:
genre_rating_counts = defaultdict(int)  # dict to store counts

for _, row in movieId_ratings_userId_merged.iterrows():
    genres = row["genres"].split("|")
    for genre in genres:
        genre_rating_counts[genre] += 1  

# Sort by highest count
genre_rating_counts = dict(sorted(genre_rating_counts.items(), key=lambda x: x[1], reverse=True))

print(genre_rating_counts)

{'Drama': 41928, 'Comedy': 39053, 'Action': 30635, 'Thriller': 26452, 'Adventure': 24161, 'Romance': 18124, 'Sci-Fi': 17243, 'Crime': 16681, 'Fantasy': 11834, 'Children': 9208, 'Mystery': 7674, 'Horror': 7291, 'Animation': 6988, 'War': 4859, 'IMAX': 4145, 'Musical': 4138, 'Western': 1930, 'Documentary': 1219, 'Film-Noir': 870, '(no genres listed)': 47}


#### Coming up with the cleaning ideas
    If the genre has very low total ratings (like 870 for film-noir 0r 1930 for western),  Neural nets may  get very little signal for it.
    We can considewr tw0 strategies : 
    Either we can drop them to make the model simple i.e no risk of noise from the under representd geners or we can keep it but predictions for that genre will be less accurate unless we use oversampling or genre embeddings.

    Since we’re building a neural net recommender, I’d lean toward dropping genres that fall below a certain ratings threshold, for ex: < 2000 ratings, because:
    It avoids imbalance.
    It simplifies the output dimension for this genre representation.
    It prevents overfitting to tiny datasets.
    Also genere IMAX isn't a genre , its a technology how film is presnetd. So, this genre can be removed from the further transformations
    Also Documentaries are more popular from the last decade, lets keep that genere.(This conclusiion is derived from the movies that are released after 2000)


In [48]:
skip_genres = {"(no genres listed)", "IMAX", "Western", "Film-Noir"}
user_genre_ratings = defaultdict(lambda: defaultdict(list))  

for _, row in movieId_ratings_userId_merged.iterrows():
    userId = row["userId"]
    ratings = row["rating"]
    genres = row["genres"].split("|")

    for genre in genres:
        if genre not in skip_genres:  # skipping "no genres listed" 
            user_genre_ratings[userId][genre].append(ratings)


       

### Preaparing a tarining set for user network which holds user prefernce for each genre
    First we will create user_vectors where each dimension reprent a genre prefernce from ratings 0 to 5.
    Then from this user_vectors we can create user_training_set for the user-network

In [49]:
# avg of ratings for each genre
user_to_genre = {}  
for userId, genre_dict in user_genre_ratings.items():
    user_to_genre[userId] = {}
    for genre, ratings in genre_dict.items():
        user_to_genre[userId][genre] = round(sum(ratings) / len(ratings), 2) 
        

In [50]:
first_three = dict(list(user_to_genre.items())[:3])
print(first_three)

{1: {'Adventure': 4.39, 'Animation': 4.69, 'Children': 4.55, 'Comedy': 4.28, 'Fantasy': 4.3, 'Romance': 4.31, 'Action': 4.32, 'Crime': 4.36, 'Thriller': 4.15, 'Mystery': 4.17, 'Horror': 3.47, 'Drama': 4.53, 'War': 4.5, 'Sci-Fi': 4.22, 'Musical': 4.68}, 2: {'Crime': 3.8, 'Drama': 3.88, 'Comedy': 4.0, 'Romance': 4.5, 'Action': 3.95, 'Adventure': 4.17, 'Thriller': 3.7, 'War': 4.5, 'Horror': 3.0, 'Mystery': 4.0, 'Documentary': 4.33, 'Sci-Fi': 3.88}, 3: {'Drama': 0.75, 'War': 0.5, 'Action': 3.57, 'Crime': 0.5, 'Adventure': 2.73, 'Comedy': 1.0, 'Animation': 0.5, 'Sci-Fi': 4.2, 'Thriller': 4.14, 'Musical': 0.5, 'Romance': 0.5, 'Fantasy': 3.38, 'Children': 0.5, 'Horror': 4.69, 'Mystery': 5.0}}


In [51]:
# total number of ratings given by each user 
total_ratings_per_user = {
    user_id: (sum(len(ratings) for ratings in genre_dict.values())) / 1.0
    for user_id, genre_dict in user_genre_ratings.items()
}

In [52]:
first_three_users = dict(list(total_ratings_per_user.items())[:3])
print(first_three_users)

{1: 689.0, 2: 70.0, 3: 107.0}


In [53]:
avg_rating_per_user = {
    user_id: round(
        sum(sum(ratings) for ratings in genre_dict.values()) /
        sum(len(ratings) for ratings in genre_dict.values())
    ,2)
    for user_id, genre_dict in user_genre_ratings.items()
}

In [54]:
first_three_users_avg_rating = dict(list(avg_rating_per_user.items())[:5])
print(first_three_users_avg_rating)

{1: 4.35, 2: 3.91, 3: 2.43, 4: 3.53, 5: 3.65}


In [55]:
# adding the two newly calculated cols/inner keys to the "user_to_genre dictionary
for user_id, genre_data in user_to_genre.items():
    if user_id in total_ratings_per_user:
        genre_data['ratings_count'] = total_ratings_per_user[user_id]
    if user_id in avg_rating_per_user:
        genre_data['ratings_ave'] = avg_rating_per_user[user_id]


In [56]:
first_three = dict(list(user_to_genre.items())[:3])
print(first_three)

{1: {'Adventure': 4.39, 'Animation': 4.69, 'Children': 4.55, 'Comedy': 4.28, 'Fantasy': 4.3, 'Romance': 4.31, 'Action': 4.32, 'Crime': 4.36, 'Thriller': 4.15, 'Mystery': 4.17, 'Horror': 3.47, 'Drama': 4.53, 'War': 4.5, 'Sci-Fi': 4.22, 'Musical': 4.68, 'ratings_count': 689.0, 'ratings_ave': 4.35}, 2: {'Crime': 3.8, 'Drama': 3.88, 'Comedy': 4.0, 'Romance': 4.5, 'Action': 3.95, 'Adventure': 4.17, 'Thriller': 3.7, 'War': 4.5, 'Horror': 3.0, 'Mystery': 4.0, 'Documentary': 4.33, 'Sci-Fi': 3.88, 'ratings_count': 70.0, 'ratings_ave': 3.91}, 3: {'Drama': 0.75, 'War': 0.5, 'Action': 3.57, 'Crime': 0.5, 'Adventure': 2.73, 'Comedy': 1.0, 'Animation': 0.5, 'Sci-Fi': 4.2, 'Thriller': 4.14, 'Musical': 0.5, 'Romance': 0.5, 'Fantasy': 3.38, 'Children': 0.5, 'Horror': 4.69, 'Mystery': 5.0, 'ratings_count': 107.0, 'ratings_ave': 2.43}}


In [57]:
# Moving the ratings_count and ratings_avg to the beginning. Helful to decide from which coloum is the staring point for a vector in the dataset 
new_user_to_genre = {}

for user_id, genres in user_to_genre.items():
    new_genres_order = {}

    if 'ratings_count' in genres:
        new_genres_order['ratings_count'] = genres['ratings_count']
    if 'ratings_ave' in genres:
        new_genres_order['ratings_ave'] = genres['ratings_ave']
        
    for key, value in genres.items():
        if key not in ['ratings_count', 'ratings_ave']:
            new_genres_order[key] = value
    
    new_user_to_genre[user_id] = new_genres_order
user_to_genre = new_user_to_genre

In [58]:
user_to_genre_expanded = user_to_genre
for _, row in movieId_ratings_userId_merged.iterrows():
    movieId = row['movieId']
    userId = row['userId']
    rating = row['rating']

    if 'movies' not in user_to_genre_expanded[userId]:
        user_to_genre[userId]['movies'] = {}  # or defaultdict(int) if you prefer
    
    # Now you can add the movie ID and rating to the 'movies' dictionary
    user_to_genre_expanded[userId]['movies'][movieId] = rating 
    
    
    

In [60]:

first_three = dict(list(user_to_genre_expanded.items())[:3])
print(first_three)

{1: {'ratings_count': 689.0, 'ratings_ave': 4.35, 'Adventure': 4.39, 'Animation': 4.69, 'Children': 4.55, 'Comedy': 4.28, 'Fantasy': 4.3, 'Romance': 4.31, 'Action': 4.32, 'Crime': 4.36, 'Thriller': 4.15, 'Mystery': 4.17, 'Horror': 3.47, 'Drama': 4.53, 'War': 4.5, 'Sci-Fi': 4.22, 'Musical': 4.68, 'movies': {1: 4.0, 3: 4.0, 6: 4.0, 47: 5.0, 50: 5.0, 70: 3.0, 101: 5.0, 110: 4.0, 151: 5.0, 157: 5.0, 163: 5.0, 216: 5.0, 223: 3.0, 231: 5.0, 235: 4.0, 260: 5.0, 296: 3.0, 316: 3.0, 333: 5.0, 349: 4.0, 356: 4.0, 362: 5.0, 367: 4.0, 423: 3.0, 441: 4.0, 457: 5.0, 480: 4.0, 500: 3.0, 527: 5.0, 543: 4.0, 552: 4.0, 553: 5.0, 590: 4.0, 592: 4.0, 593: 4.0, 596: 5.0, 608: 5.0, 648: 3.0, 661: 5.0, 673: 3.0, 733: 4.0, 736: 3.0, 780: 3.0, 804: 4.0, 919: 5.0, 923: 5.0, 940: 5.0, 943: 4.0, 954: 5.0, 1009: 3.0, 1023: 5.0, 1024: 5.0, 1025: 5.0, 1029: 5.0, 1030: 3.0, 1031: 5.0, 1032: 5.0, 1042: 4.0, 1049: 5.0, 1060: 4.0, 1073: 5.0, 1080: 5.0, 1089: 5.0, 1090: 4.0, 1092: 5.0, 1097: 5.0, 1127: 4.0, 1136: 5.0, 11

In [59]:
first_three = dict(list(user_to_genre.items())[:3])
print(first_three)

{1: {'ratings_count': 689.0, 'ratings_ave': 4.35, 'Adventure': 4.39, 'Animation': 4.69, 'Children': 4.55, 'Comedy': 4.28, 'Fantasy': 4.3, 'Romance': 4.31, 'Action': 4.32, 'Crime': 4.36, 'Thriller': 4.15, 'Mystery': 4.17, 'Horror': 3.47, 'Drama': 4.53, 'War': 4.5, 'Sci-Fi': 4.22, 'Musical': 4.68, 'movies': {1: 4.0, 3: 4.0, 6: 4.0, 47: 5.0, 50: 5.0, 70: 3.0, 101: 5.0, 110: 4.0, 151: 5.0, 157: 5.0, 163: 5.0, 216: 5.0, 223: 3.0, 231: 5.0, 235: 4.0, 260: 5.0, 296: 3.0, 316: 3.0, 333: 5.0, 349: 4.0, 356: 4.0, 362: 5.0, 367: 4.0, 423: 3.0, 441: 4.0, 457: 5.0, 480: 4.0, 500: 3.0, 527: 5.0, 543: 4.0, 552: 4.0, 553: 5.0, 590: 4.0, 592: 4.0, 593: 4.0, 596: 5.0, 608: 5.0, 648: 3.0, 661: 5.0, 673: 3.0, 733: 4.0, 736: 3.0, 780: 3.0, 804: 4.0, 919: 5.0, 923: 5.0, 940: 5.0, 943: 4.0, 954: 5.0, 1009: 3.0, 1023: 5.0, 1024: 5.0, 1025: 5.0, 1029: 5.0, 1030: 3.0, 1031: 5.0, 1032: 5.0, 1042: 4.0, 1049: 5.0, 1060: 4.0, 1073: 5.0, 1080: 5.0, 1089: 5.0, 1090: 4.0, 1092: 5.0, 1097: 5.0, 1127: 4.0, 1136: 5.0, 11

In [61]:
with open("data/user_to_genre_file.pickle", "wb") as f:
    pickle.dump(user_to_genre_expanded, f)

print("Pickle file created: user_to_genre_file.pickle") # to avoid computing again when we want to reuse 

Pickle file created: user_to_genre_file.pickle


In [262]:
print(len(user_to_genre)) 

610


In [272]:
# converting the user vectors(user_to_genre) to data frame
user_to_genre_df = pd.DataFrame.from_dict(user_to_genre, orient='index')
user_to_genre_df = user_to_genre_df.reset_index(names=['userId'])
user_to_genre_df['userId'] = user_to_genre_df['userId'].astype(float)
user_to_genre_df = user_to_genre_df.fillna(0.0)
user_to_genre_df


Unnamed: 0,userId,ratings_count,ratings_ave,Adventure,Animation,Children,Comedy,Fantasy,Romance,Action,Crime,Thriller,Mystery,Horror,Drama,War,Sci-Fi,Musical,Documentary
0,1.0,689.0,4.35,4.39,4.69,4.55,4.28,4.30,4.31,4.32,4.36,4.15,4.17,3.47,4.53,4.50,4.22,4.68,0.00
1,2.0,70.0,3.91,4.17,0.00,0.00,4.00,0.00,4.50,3.95,3.80,3.70,4.00,3.00,3.88,4.50,3.88,0.00,4.33
2,3.0,107.0,2.43,2.73,0.50,0.50,1.00,3.38,0.50,3.57,0.50,4.14,5.00,4.69,0.75,0.50,4.20,0.50,0.00
3,4.0,500.0,3.53,3.66,4.00,3.80,3.51,3.68,3.38,3.32,3.81,3.55,3.48,4.25,3.48,3.57,2.83,4.00,4.00
4,5.0,123.0,3.65,3.25,4.33,4.11,3.47,4.14,3.09,3.11,3.83,3.56,4.00,3.00,3.80,3.33,2.50,4.40,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606.0,2628.0,3.64,3.50,3.71,3.45,3.57,3.60,3.74,3.18,3.65,3.53,3.79,3.35,3.79,3.79,3.56,3.73,3.80
606,607.0,516.0,3.75,3.47,3.33,3.42,3.33,3.57,3.52,3.72,3.81,4.11,4.65,4.11,4.01,4.17,3.25,3.60,0.00
607,608.0,2249.0,3.20,3.22,3.12,2.46,2.74,3.00,2.89,3.33,3.61,3.54,3.55,3.32,3.44,3.58,3.30,2.76,3.00
608,609.0,89.0,3.26,3.20,3.00,3.00,3.29,3.00,3.20,3.09,3.50,3.29,0.00,3.50,3.37,3.50,3.00,0.00,3.00


In [264]:
movies_info = defaultdict(list)

for _, row in movieId_ratings_userId_merged.iterrows():
    movie_id = row["movieId"]
    rating = row["rating"]
    movies_info[movie_id].append(rating)

In [266]:
avg_ratings_for_a_movie = {}

for movieid, ratings_list in movies_info.items():
    avg_ratings_for_a_movie[movieid] = sum(ratings_list)/len(ratings_list)

df_movies_info_only = movieId_ratings_userId_merged.drop_duplicates(subset=['movieId'])
df_movies_info_only = df_movies_info_only.drop(columns=['userId', 'timestamp', 'rating'])

In [267]:
genres_info = defaultdict(lambda : defaultdict(int))
year_info = {}
                          
genres_list = ["Adventure",	"Animation","Children", "Comedy","Fantasy","Romance","Action","Crime","Thriller","Mystery"
               ,"Horror",	"Drama","War","Sci-Fi",	"Musical",	"Documentary"]

for _, row in df_movies_info_only.iterrows():
    movie_id = row["movieId"] 
    genres = row["genres"].split('|')
    movie_title = row["title"]
    year_match = re.search(r'\((\d{4})\)', movie_title)
    
    if year_match: 
        movie_year = int(year_match.group(1)) 
    else:
        movie_year = None 

    year_info[movie_id] = movie_year
    for genre in genres:
        if genre in genres_list:
            genres_info[movie_id][genre] = 1
    
         

In [268]:
df_genresinfo = (
    pd.DataFrame.from_dict(genres_info, orient = 'index')
    .reset_index()
    .rename(columns={'index': 'movieId'})
    .fillna(0)
) 

df_yearinfo = (
    pd.DataFrame.from_dict(year_info, orient='index', columns=['Year']) 
    .reset_index()
    .rename(columns={'index': 'movieId'})
)

df_avg_ratings = (
    pd.DataFrame.from_dict(avg_ratings_for_a_movie, orient = 'index')
    .reset_index()
    .rename(columns={'index': 'movieId', 0 : 'Avg_rating'})
)



In [269]:
df_year_and_avg = df_yearinfo.merge(df_avg_ratings , on = 'movieId')
df_movie_and_genres_info = df_year_and_avg.merge(df_genresinfo , on = 'movieId')
df_movie_and_genres_info['Year'] = df_movie_and_genres_info['Year'].fillna(0).astype(int) 
df_movie_and_genres_info[genres_list] = df_movie_and_genres_info[genres_list].astype(int)
df_movie_and_genres_info


Unnamed: 0,movieId,Year,Avg_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Action,Crime,Thriller,Mystery,Horror,Drama,War,Sci-Fi,Musical,Documentary
0,1,1995,3.920930,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
1,3,1995,3.259615,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
2,6,1995,3.946078,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
3,47,1995,3.975369,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
4,50,1995,4.237745,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9659,160341,1997,2.500000,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
9660,160527,1971,4.500000,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0
9661,160836,2005,3.000000,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0
9662,163937,2016,3.500000,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [271]:
training_set1 = df_ratings.merge(df_movie_and_genres_info, on='movieId', how='right')
training_set1.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,Year,Avg_rating,Adventure,Animation,Children,Comedy,...,Action,Crime,Thriller,Mystery,Horror,Drama,War,Sci-Fi,Musical,Documentary
0,1,1,4.0,964982703,1995,3.92093,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,847434962,1995,3.92093,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,1106635946,1995,3.92093,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,1510577970,1995,3.92093,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,1305696483,1995,3.92093,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [292]:
training_set2 =  user_to_genre_df.merge(training_set1, on='userId', how='right')
training_set2 = training_set2.drop(columns = 'timestamp', axis = 1)

print(list(training_set2))

['userId', 'ratings_count', 'ratings_ave', 'Adventure_x', 'Animation_x', 'Children_x', 'Comedy_x', 'Fantasy_x', 'Romance_x', 'Action_x', 'Crime_x', 'Thriller_x', 'Mystery_x', 'Horror_x', 'Drama_x', 'War_x', 'Sci-Fi_x', 'Musical_x', 'Documentary_x', 'movieId', 'rating', 'Year', 'Avg_rating', 'Adventure_y', 'Animation_y', 'Children_y', 'Comedy_y', 'Fantasy_y', 'Romance_y', 'Action_y', 'Crime_y', 'Thriller_y', 'Mystery_y', 'Horror_y', 'Drama_y', 'War_y', 'Sci-Fi_y', 'Musical_y', 'Documentary_y']


In [294]:
print(training_set2[['userId', 'movieId','rating']].head(20))

    userId  movieId  rating
0      1.0        1     4.0
1      5.0        1     4.0
2      7.0        1     4.5
3     15.0        1     2.5
4     17.0        1     4.5
5     18.0        1     3.5
6     19.0        1     4.0
7     21.0        1     3.5
8     27.0        1     3.0
9     31.0        1     5.0
10    32.0        1     3.0
11    33.0        1     3.0
12    40.0        1     5.0
13    43.0        1     5.0
14    44.0        1     3.0
15    45.0        1     4.0
16    46.0        1     5.0
17    50.0        1     3.0
18    54.0        1     3.0
19    57.0        1     5.0


In [295]:
print(len(training_set2))

100631


In [318]:
user_training_set = training_set2.iloc[:, :19]  # First two columns
item_training_set = training_set2.iloc[:, 19:]
y_training_set = item_training_set['rating']
item_training_set = item_training_set.drop('rating', axis = 1)

In [319]:
user_training_set.head()
user_training_set.to_csv('data/user_training_set.csv', index = False, header=False)

In [320]:
y_training_set.head()
y_training_set.to_csv('data/y_training_set.csv', index = False, header=False)

In [321]:
item_training_set.head()
item_training_set.to_csv('data/item_training_set.csv', index = False, header=False)

In [322]:
item_training_set.head()

Unnamed: 0,movieId,Year,Avg_rating,Adventure_y,Animation_y,Children_y,Comedy_y,Fantasy_y,Romance_y,Action_y,Crime_y,Thriller_y,Mystery_y,Horror_y,Drama_y,War_y,Sci-Fi_y,Musical_y,Documentary_y
0,1,1995,3.92093,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1995,3.92093,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1995,3.92093,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
3,1,1995,3.92093,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
4,1,1995,3.92093,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
