In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix

In [None]:
# Read the four input tables from the working directory, one CSV per frame.
df_movies, df_ratings, df_links, df_tags = (
    pd.read_csv(csv_name)
    for csv_name in ('movies.csv', 'ratings.csv', 'links.csv', 'tags.csv')
)

In [None]:
# Preview the movies table (head() returns the first five rows by default).
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the ratings table (head() returns the first five rows by default).
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [None]:
# Preview the links table (head() returns the first five rows by default).
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
# Preview the tags table (head() returns the first five rows by default).
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


In [None]:
# Report the number of rows in each input table, in a fixed order.
row_counts = {
    "Length of tags": len(df_tags),
    "Length of movies": len(df_movies),
    "Length of ratings": len(df_ratings),
    "Length of links": len(df_links),
}
for caption, n_rows in row_counts.items():
    print(caption, n_rows)

Length of tags 2000072
Length of movies 87585
Length of ratings 32000204
Length of links 87585


**Removing duplicates**

In [None]:
# Flag rows that exactly repeat an earlier row, keep the rest, and report
# how many were flagged.
movies_dup_mask = df_movies.duplicated()
df_movies_no_duplicates = df_movies[~movies_dup_mask]
num_duplicates = int(movies_dup_mask.sum())
print(f"Number of duplicates: {num_duplicates}")

Number of duplicates: 0


In [None]:
# Flag rows that exactly repeat an earlier row, keep the rest, and report
# how many were flagged.
ratings_dup_mask = df_ratings.duplicated()
df_ratings_no_duplicates = df_ratings[~ratings_dup_mask]
num_duplicates = int(ratings_dup_mask.sum())
print(f"Number of duplicates: {num_duplicates}")

Number of duplicates: 0


In [None]:
# Flag rows that exactly repeat an earlier row, keep the rest, and report
# how many were flagged.
links_dup_mask = df_links.duplicated()
df_links_no_duplicates = df_links[~links_dup_mask]
num_duplicates = int(links_dup_mask.sum())
print(f"Number of duplicates: {num_duplicates}")

Number of duplicates: 0


In [None]:
# Flag rows that exactly repeat an earlier row, keep the rest, and report
# how many were flagged.
tags_dup_mask = df_tags.duplicated()
df_tags_no_duplicates = df_tags[~tags_dup_mask]
num_duplicates = int(tags_dup_mask.sum())
print(f"Number of duplicates: {num_duplicates}")

Number of duplicates: 0


In [None]:
# Attach the external ids (imdbId, tmdbId) to each movie; the inner join
# keeps only movieIds present in both tables.
merged_link_movie_df = pd.merge(
    left=df_movies,
    right=df_links,
    how='inner',
    on='movieId',
)
merged_link_movie_df.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [None]:
# Row count after the movies/links join.
print(merged_link_movie_df.shape[0])

87585


In [None]:
# Add one row per (movie, user tag); the inner join keeps only movieIds
# that appear in the tags table.
merged_link_movie_tags_df = pd.merge(
    left=merged_link_movie_df,
    right=df_tags,
    how='inner',
    on='movieId',
)
merged_link_movie_tags_df.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,109,children,1257988285
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,109,Disney,1257988287
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,909,animation,1248249498
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,909,children,1248249511
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,909,Disney,1248249494


In [None]:
# Row count after joining in the tags.
print(merged_link_movie_tags_df.shape[0])

2000072


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line after
# each report.
merged_link_movie_tags_df.info()
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000072 entries, 0 to 2000071
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   imdbId     int64  
 4   tmdbId     float64
 5   userId     int64  
 6   tag        object 
 7   timestamp  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 122.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 976.6 MB
None


In [None]:
# Keep only the join keys and the rating. Both frames carry a `timestamp`
# column, and merging it in would yield timestamp_x / timestamp_y columns.
df_ratings = df_ratings[['movieId', 'userId', 'rating']]

# Attach each user's rating to the tags that same user gave the same movie.
# The inner join keeps only (movieId, userId) pairs present in both frames.
merged_df = pd.merge(
    merged_link_movie_tags_df,
    df_ratings,
    on=['movieId', 'userId'],
    how='inner',
)
print(merged_df.head(5))
print(len(merged_df))

In [None]:
# '|' is the regex alternation operator, so request a literal match
# explicitly; the default of `regex` differs between pandas versions.
merged_df['genres'] = merged_df['genres'].str.replace('|', ', ', regex=False)

# Combine the genres and the user tag into one comma-separated text field.
# NOTE(review): string concatenation propagates NaN, so a row with a missing
# tag would get a NaN genre_tag — confirm whether tags can be missing here.
merged_df['genre_tag'] = merged_df['genres'] + ', ' + merged_df['tag']
print(merged_df.head(5))

In [None]:
# Persist the merged table without writing the row index as a column.
merged_df.to_csv(path_or_buf="merged_movie_df.csv", index=False)