In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
movies = pd.read_csv('./data/movies.dat', sep='::', header=None, names=['movie_id', 'title', 'genres'], encoding='ISO-8859-1')
ratings = pd.read_csv('./data/ratings.dat', sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], encoding='ISO-8859-1')
users = pd.read_csv('./data/users.dat', sep='::', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], encoding='ISO-8859-1')

In [None]:
print("Ratings dataset shape:", ratings.shape)
print("Users dataset shape:", users.shape)
print("Movies dataset shape:", movies.shape)

print("Ratings dataset info:")
print(ratings.info())

print("Users dataset info:")
print(users.info())

print("Movies dataset info:")
print(movies.info())

In [None]:
movies.head()

In [None]:
ratings.head()

In [None]:
users.head()

In [3]:
genres = movies['genres'].str.get_dummies('|')
df = pd.concat([movies, genres],axis=1).merge(ratings,how='left',on='movie_id').drop(columns={'genres'})
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.head()

Unnamed: 0,movie_id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_id,rating,timestamp
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,5.0,2001-01-06 23:37:48
1,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,6.0,4.0,2000-12-31 04:30:08
2,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,4.0,2000-12-31 03:31:36
3,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9.0,5.0,2000-12-31 01:25:52
4,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,10.0,5.0,2000-12-31 01:34:34


In [4]:
df['timestamp'].describe()

count                 1000209
unique                 458455
top       2000-11-29 20:06:42
freq                       30
first     2000-04-25 23:05:32
last      2003-02-28 17:49:50
Name: timestamp, dtype: object

In [5]:
df = df[(~df['rating'].isna()) & (~df['user_id'].isna()) & (~df['timestamp'].isna())]
df.isnull().sum()

movie_id       0
title          0
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
user_id        0
rating         0
timestamp      0
dtype: int64

In [6]:
df['user_id'] = df['user_id'].astype('str')
df['rating'] = df['rating'].astype('int64')
df['rating'].describe()

count    1.000209e+06
mean     3.581564e+00
std      1.117102e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000385
Data columns (total 23 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   movie_id     1000209 non-null  int64         
 1   title        1000209 non-null  object        
 2   Action       1000209 non-null  int64         
 3   Adventure    1000209 non-null  int64         
 4   Animation    1000209 non-null  int64         
 5   Children's   1000209 non-null  int64         
 6   Comedy       1000209 non-null  int64         
 7   Crime        1000209 non-null  int64         
 8   Documentary  1000209 non-null  int64         
 9   Drama        1000209 non-null  int64         
 10  Fantasy      1000209 non-null  int64         
 11  Film-Noir    1000209 non-null  int64         
 12  Horror       1000209 non-null  int64         
 13  Musical      1000209 non-null  int64         
 14  Mystery      1000209 non-null  int64         
 15  Romance      10

In [None]:
df.groupby(['movie_id','title']).agg({'rating':'count'}).reset_index().sort_values(by='rating', ascending=False)

In [None]:
sns.histplot(df['rating'], bins=5)
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

In [8]:
movie_features = df[['movie_id', 'Action', 'Adventure', 'Animation', "Children's",
                     'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']]

movie_features.head()

Unnamed: 0,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
users = users.drop(["zip_code"], axis = 1)

# Create age groups
bins = [0, 20, 30, 40, 50, 60, np.inf]
ages = ['<20', '20-29', '30-39','40-49', '51-60', '60+']
users['age_group'] = pd.cut(users['age'], bins, labels=ages)
users = users.drop(["age"], axis = 1)

occupation_map = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer"
}

# Map the occupation IDs to occupation names in the users dataframe
users['occupation'] = users['occupation'].map(occupation_map)

users.head()

Unnamed: 0,user_id,gender,occupation,age_group
0,1,F,K-12 student,<20
1,2,M,self-employed,51-60
2,3,M,scientist,20-29
3,4,M,executive/managerial,40-49
4,5,M,writer,20-29


In [10]:
# Generate user features as a sparse matrix
columnsToEncode = ['age_group','gender','occupation']
myEncoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
myEncoder.fit(users[columnsToEncode])

user_features = pd.concat([users.drop(columnsToEncode, 1), 
                           pd.DataFrame(myEncoder.transform(users[columnsToEncode]), 
                                        columns = myEncoder.get_feature_names_out(columnsToEncode))], axis=1).reindex()

user_features.head()

Unnamed: 0,user_id,age_group_20-29,age_group_30-39,age_group_40-49,age_group_51-60,age_group_<20,gender_F,gender_M,occupation_K-12 student,occupation_academic/educator,occupation_artist,occupation_clerical/admin,occupation_college/grad student,occupation_customer service,occupation_doctor/health care,occupation_executive/managerial,occupation_farmer,occupation_homemaker,occupation_lawyer,occupation_other or not specified,occupation_programmer,occupation_retired,occupation_sales/marketing,occupation_scientist,occupation_self-employed,occupation_technician/engineer,occupation_tradesman/craftsman,occupation_unemployed,occupation_writer
0,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
