In [1]:
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()
print("Done. Dataset contains:")
print(zip_ref.read('ml-100k/u.info'))

Downloading movielens data...
Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


In [3]:
import pandas as pd

# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
users["user_id"] = users["user_id"].apply(lambda x: str(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: str(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: str(x-1))
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

In [10]:
print(ratings)

      user_id movie_id  rating  unix_timestamp
0         195      241     3.0       881250949
1         185      301     3.0       891717742
2          21      376     1.0       878887116
3         243       50     2.0       880606923
4         165      345     1.0       886397596
...       ...      ...     ...             ...
99995     879      475     3.0       880175444
99996     715      203     5.0       879795543
99997     275     1089     1.0       874795795
99998      12      224     2.0       882399156
99999      11      202     3.0       879959583

[100000 rows x 4 columns]


In [12]:
# Sort ratings by user ID

ratings.sort_values(by=['user_id'], ascending=True, na_position='first')

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
12948,0,249,4.0,874965706
4280,0,81,5.0,878542589
31991,0,104,2.0,875240739
83307,0,2,4.0,878542960
15270,0,53,3.0,878543308
...,...,...,...,...
270,99,343,4.0,891374868
15300,99,322,3.0,891375359
42972,99,1232,3.0,891375112
47293,99,899,4.0,891374832


### Split

### Model