## Loading Dependencies

In [22]:
import pandas as pd


In [23]:
# path to the folder containing movielens data
Path = "D:/Canada/Danial/UoW/Dataset/MovieLens/100K/ml-100k"

## Loading users information and creating user_info column

In [24]:
# Loading the user data
user_df = pd.read_csv(f'{Path}/u.user', 
                      sep='|', 
                      names=["user_id", "age", "gender", "occupation", "zip_code"], 
                      encoding='latin-1')


# loading Movies
user_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [25]:

# Function to format the user info
def format_user_info(row):
    # Map gender from 'M'/'F' to 'Male'/'Female'
    gender = 'Male' if row['gender'] == 'M' else 'Female'
    # Create the formatted string
    user_info = f"""
    Age: {row['age']}
    Gender: {gender}
    Occupation: {row['occupation']}
    """
    return user_info

# Apply the function to create the 'user_info' column
user_df['user_info'] = user_df.apply(format_user_info, axis=1)

user_df.head()


Unnamed: 0,user_id,age,gender,occupation,zip_code,user_info
0,1,24,M,technician,85711,\n Age: 24\n Gender: Male\n Occupatio...
1,2,53,F,other,94043,\n Age: 53\n Gender: Female\n Occupat...
2,3,23,M,writer,32067,\n Age: 23\n Gender: Male\n Occupatio...
3,4,24,M,technician,43537,\n Age: 24\n Gender: Male\n Occupatio...
4,5,33,F,other,15213,\n Age: 33\n Gender: Female\n Occupat...


## Loading movies information and creating movie_info column: 

In [27]:
# Load the item data
item_df = pd.read_csv(f'{Path}/u.item', sep='|', names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', "Children", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], encoding='latin-1')

item_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Using TMDB API to extract more information about the movies

In [28]:
import requests
import pandas as pd
import time
import re
from tqdm import tqdm


API_KEY = "82e3fe594a0a2129e23d5f89daac9091"

def get_movie_details(title, api_key):
    """Fetch movie details from TMDb API given a movie title, potentially including a year."""
    # Extract the year and clean title if present in the format "Title (Year)"
    match = re.match(r"^(.*?)\s*\((\d{4})\)$", title)
    if match:
        clean_title, year = match.groups()
    else:
        clean_title, year = title, None
    
    try:
        # Modify the URL to include the year in the search if it's available
        url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={clean_title}"
        if year:
            url += f"&year={year}"
        response = requests.get(url)
        data = response.json()
        if data['results']:
            movie_id = data['results'][0]['id']  # Assuming the first result is the correct movie
            details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
            details_response = requests.get(details_url)
            return details_response.json()  # Returns a dictionary of details
        else:
            return {}
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        time.sleep(10)  # Wait for 10 seconds before retrying
        return get_movie_details(title, api_key)


# # Using tqdm for a progress bar
item_df['details'] = [get_movie_details(title, API_KEY) for title in tqdm(item_df['title'], desc="Fetching movie details")]

# Expand 'details' dict into separate columns, excluding the duplicate 'title'
details_df = item_df['details'].apply(pd.Series)

# Drop the "title" column from details_df if it exists
details_df.drop(columns=['title'], errors='ignore', inplace=True)

# Concatenate the original DataFrame and the expanded details DataFrame
item_df = pd.concat([item_df.drop(['details'], axis=1), details_df], axis=1)


Fetching movie details: 100%|██████████| 1682/1682 [08:03<00:00,  3.48it/s]


In [29]:
item_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,production_countries,release_date.1,revenue,runtime,spoken_languages,status,tagline,video,vote_average,vote_count
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,394436586.0,81.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Hang on for the comedy that goes to infinity a...,False,7.973,17828.0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",1995-11-16,352200000.0,130.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,No limits. No fears. No substitutes.,False,6.892,3704.0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,4257354.0,98.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Twelve outrageous guests. Four scandalous requ...,False,5.824,2556.0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-20,115101622.0,105.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Attitude plays a part.,False,6.463,985.0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-27,32000000.0,124.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,One man is copying the most notorious killers ...,False,6.501,902.0


In [30]:
# Filter out rows where the 'overview' column is NaN
cleaned_df = item_df.dropna(subset=['overview'])

# Saving the cleaned DataFrame
cleaned_df.to_csv('movies_enriched_dataset.csv', index=False)

print("Cleaned dataset saved. Number of movies with overviews:", cleaned_df.shape[0])

Cleaned dataset saved. Number of movies with overviews: 1591


In [31]:
cleaned_df.to_pickle('movies_enriched_dataset.pkl')

## Loading enriched dataset and creating movie_info col

In [32]:

# Define genre columns
genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Function to extract and format the genres
def format_movie_info(row):
    genres = [genre for genre in genre_columns if row[genre] == 1]
    genre_str = ', '.join(genres)
    movie_info = f"""
    Movie title: {row['title']} 
    Genre: {genre_str}
    Overview: {row['overview']} 
    """
    return movie_info

# Apply the function to create the 'movie_info' column
cleaned_df['movie_info'] = cleaned_df.apply(format_movie_info, axis=1)
cleaned_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['movie_info'] = cleaned_df.apply(format_movie_info, axis=1)


Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,release_date.1,revenue,runtime,spoken_languages,status,tagline,video,vote_average,vote_count,movie_info
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,1995-10-30,394436586.0,81.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Hang on for the comedy that goes to infinity a...,False,7.973,17828.0,\n Movie title: Toy Story (1995) \n Genr...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,1995-11-16,352200000.0,130.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,No limits. No fears. No substitutes.,False,6.892,3704.0,\n Movie title: GoldenEye (1995) \n Genr...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,1995-12-09,4257354.0,98.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Twelve outrageous guests. Four scandalous requ...,False,5.824,2556.0,\n Movie title: Four Rooms (1995) \n Gen...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,1995-10-20,115101622.0,105.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Attitude plays a part.,False,6.463,985.0,\n Movie title: Get Shorty (1995) \n Gen...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1995-10-27,32000000.0,124.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,One man is copying the most notorious killers ...,False,6.501,902.0,\n Movie title: Copycat (1995) \n Genre:...


In [33]:

# Saving the cleaned DataFrame

cleaned_df.to_pickle('movies_enriched_dataset.pkl')
# user_df.to_pickle('user_dataset.pkl')



In [53]:
movies_without_overview = item_df[item_df['overview'].isna()]

movies_without_overview.head(100)

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,release_date.1,revenue,runtime,spoken_languages,status,tagline,title.1,video,vote_average,vote_count
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,,,,,,,,,,
43,44,Dolores Claiborne (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Dolores%20Cla...,0,0,0,0,0,...,,,,,,,,,,
74,75,Brother Minister: The Assassination of Malcolm...,01-Jan-1994,,http://us.imdb.com/M/title-exact?Brother%20Min...,0,0,0,0,0,...,,,,,,,,,,
162,163,"Return of the Pink Panther, The (1974)",01-Jan-1974,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,0,0,0,0,...,,,,,,,,,,
167,168,Monty Python and the Holy Grail (1974),01-Jan-1974,,http://us.imdb.com/M/title-exact?Monty%20Pytho...,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,1639,Bitter Sugar (Azucar Amargo) (1996),22-Nov-1996,,http://us.imdb.com/M/title-exact?Bitter%20Suga...,0,0,0,0,0,...,,,,,,,,,,
1655,1656,Little City (1998),20-Feb-1998,,http://us.imdb.com/M/title-exact?Little+City+(...,0,0,0,0,0,...,,,,,,,,,,
1659,1660,Small Faces (1995),09-Aug-1996,,http://us.imdb.com/M/title-exact?Small%20Faces...,0,0,0,0,0,...,,,,,,,,,,
1666,1667,"Next Step, The (1995)",13-Jun-1997,,http://us.imdb.com/M/title-exact?Next%20Step%2...,0,0,0,0,0,...,,,,,,,,,,


## Loading ratings information

In [39]:
# Load the ratings data
rating_df = pd.read_csv(f'{Path}/u1.base', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'], encoding='latin-1')
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   user_id    80000 non-null  int64
 1   movie_id   80000 non-null  int64
 2   rating     80000 non-null  int64
 3   timestamp  80000 non-null  int64
dtypes: int64(4)
memory usage: 2.4 MB


In [42]:
rating_df.groupby('user_id').count().sort_index(ascending=True)

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,135,135,135
2,40,40,40
3,28,28,28
4,14,14,14
5,91,91,91
...,...,...,...
939,49,49,49
940,107,107,107
941,22,22,22
942,79,79,79


In [46]:
rating_df[rating_df['user_id'] == 1].head(100)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
95,1,179,3,875072370
96,1,181,5,874965739
97,1,182,4,875072520
98,1,187,4,874965678


In [43]:
rating_test_df = pd.read_csv(f'{Path}/u1.test', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'], encoding='latin-1')
rating_test_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [44]:
rating_test_df.groupby('user_id').count()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,137,137,137
2,22,22,22
3,26,26,26
4,10,10,10
5,84,84,84
...,...,...,...
457,8,8,8
458,3,3,3
459,1,1,1
460,1,1,1


In [47]:
rating_test_df[rating_df['user_id'] == 1].head(100)

  rating_test_df[rating_df['user_id'] == 1].head(100)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198
...,...,...,...,...
95,1,202,5,875072442
96,1,206,4,876893205
97,1,208,5,878542960
98,1,209,4,888732908
