In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('/workspaces/LLM-Recommender-System/data/ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv('/workspaces/LLM-Recommender-System/data/ml-latest-small/ratings.csv')
ratings.userId.nunique()

610

In [4]:
tags = pd.read_csv('/workspaces/LLM-Recommender-System/data/ml-latest-small/tags.csv')
tags.sort_values(by=['userId', 'movieId'], inplace=True)

## Group by 'userId' and 'movieId' and concatenate the tags
tags_grouped = tags.groupby(['userId', 'movieId']).agg({'tag': ', '.join}).reset_index()

# Display the resulting DataFrame
tags_grouped.head()

Unnamed: 0,userId,movieId,tag
0,2,60756,"funny, Highly quotable, will ferrell"
1,2,89774,"Boxing story, MMA, Tom Hardy"
2,2,106782,"drugs, Leonardo DiCaprio, Martin Scorsese"
3,7,48516,way too long
4,18,431,"Al Pacino, gangster, mafia"


In [5]:
# merge movies ratings and tags
movie_ratings = pd.merge(movies, ratings, on='movieId')
movie_ratings_tags = pd.merge(movie_ratings, tags_grouped, on=['userId', 'movieId'], how='left')
movie_ratings_tags.sort_values(by=['userId', 'movieId'], inplace=True)
movie_ratings_tags.reset_index(drop=True, inplace=True)
# change the order of columns
movie_ratings_tags = movie_ratings_tags[['userId', 'movieId', 'title', 'genres', 'rating', 'timestamp', 'tag']]
# how many tags are missing in percentage
tags_missing = movie_ratings_tags.tag.isnull().sum() / len(movie_ratings_tags) * 100
print(f'Tags missing: {tags_missing:.2f}%')
# replace the | separator with , in genres
movie_ratings_tags.genres = movie_ratings_tags.genres.str.replace('|', ', ')
movie_ratings_tags.head()


Tags missing: 98.38%


Unnamed: 0,userId,movieId,title,genres,rating,timestamp,tag
0,1,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy",4.0,964982703,
1,1,3,Grumpier Old Men (1995),"Comedy, Romance",4.0,964981247,
2,1,6,Heat (1995),"Action, Crime, Thriller",4.0,964982224,
3,1,47,Seven (a.k.a. Se7en) (1995),"Mystery, Thriller",5.0,964983815,
4,1,50,"Usual Suspects, The (1995)","Crime, Mystery, Thriller",5.0,964982931,


In [6]:
movie_ratings_tags.to_csv('/workspaces/LLM-Recommender-System/data/ml-latest-small/processed_movie_latest_small.csv', index=False)

## add wiki

In [8]:
import wikipedia

In [11]:
print(movies.movieId.nunique())
movies.head()

9742


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
movie_data = []

for title in np.unique(movies['title']):
    try:
        # Formatting the title for Wikipedia
        title_formatted = title.replace("'", "").rstrip()
        if "(" in title_formatted and ")" in title_formatted:
            title_formatted = title_formatted.replace(")", " film)")  # Add 'film' before the closing parenthesis
        else:
            title_formatted += " (film)"

        # Fetching the summary
        summary = wikipedia.summary(title_formatted, sentences=1)
        movie_data.append({'movie_title': title, 'wiki_summary': summary})
    except wikipedia.DisambiguationError as e:
        # Handle disambiguation by looking for a title containing 'film'
        relevant_page = next((option for option in e.options if "film" in option and title.split(" (")[0] in option), None)
        if relevant_page:
            try:
                summary = wikipedia.summary(relevant_page, sentences=1)
                movie_data.append({'movie_title': title, 'wiki_summary': summary})
            except Exception as ex:
                print(f"Failed to fetch summary for {title}: {ex}")
        else:
            print(f"No relevant Wikipedia page found for {title}")
    except Exception as ex:
        print(f"Failed to fetch summary for {title}: {ex}")

# Creating the DataFrame after collecting all data
movie_wiki = pd.DataFrame(movie_data)

Failed to fetch summary for 'Round Midnight (1986): Page id "round midnight 1989 film" does not match any pages. Try another id!
Failed to fetch summary for 'Salem's Lot (2004): Page id "salem's lot 2004 film" does not match any pages. Try another id!
Failed to fetch summary for 'burbs, The (1989): Page id "burns the 1984 film" does not match any pages. Try another id!
Failed to fetch summary for 101 Dalmatians II: Patch's London Adventure (2003): Page id "101 dalmatians ii patch's london adventure 2008 film" does not match any pages. Try another id!
Failed to fetch summary for 11'09"01 - September 11 (2002): Page id "1109"01 - September 11 (2002 film)" does not match any pages. Try another id!
Failed to fetch summary for 12 Chairs (1971): Page id "12 chairs 1976 film" does not match any pages. Try another id!
Failed to fetch summary for 2046 (2004): Page id "2006 2007 film" does not match any pages. Try another id!
Failed to fetch summary for 3 Extremes (Three... Extremes) (Saam gaang



  lis = BeautifulSoup(html).find_all('li')


No relevant Wikipedia page found for 61* (2001)
Failed to fetch summary for A Midsummer Night's Dream (2016): Page id "a midsummer night's dream 2018 film" does not match any pages. Try another id!
Failed to fetch summary for Absentia (2011): Page id "absent 2011 film" does not match any pages. Try another id!
Failed to fetch summary for Absolute Power (1997): Page id "absolute poker 1999 film" does not match any pages. Try another id!
Failed to fetch summary for Accidental Spy, The (Dak miu mai shing) (2001): Page id "Accidental Spy, The (Dak miu mai shing film) (2001 film)" does not match any pages. Try another id!
Failed to fetch summary for Adam's Apples (Adams æbler) (2005): Page id "adam's apples adams æbler film 2009 film" does not match any pages. Try another id!
Failed to fetch summary for Adam's Rib (1949): Page id "adam's rib 1989 film" does not match any pages. Try another id!
Failed to fetch summary for Adventures in Plymptoons! (2011): Page id "adventures in plympton 2011

In [13]:
movie_wiki.to_csv('/workspaces/LLM-Recommender-System/data/ml-latest-small/movie_wiki.csv', index=False)