# Analysis of the MovieLens 1M dataset


In [None]:
# Download the data from: https://grouplens.org/datasets/movielens/1m/

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import anndata
import scipy.sparse as sp

In [None]:
# checks that the user ids and movie ids are contiguous
# if they are not, we need to reindex them
def check_contiguous(df):
    """
    Report whether the ids in the first two columns form gap-free runs.

    Column 0 is treated as the user ids and column 1 as the movie ids.
    A column counts as contiguous when its sorted distinct values always
    increase by exactly 1.

    Parameters
    ----------
    df : frame-like
        Object for which ``df[:, j].to_numpy()`` yields a 2-D array whose
        first column holds the ids of column ``j``.

    Returns
    -------
    tuple
        ``(users_contiguous, movies_contiguous)`` as numpy booleans.

    The results and the number of distinct ids are also printed.
    """
    def _distinct_sorted(col):
        # distinct ids of one column, in ascending order
        return np.sort(np.unique(df[:, col].to_numpy()[:, 0]))

    def _unit_steps(ids):
        # True when every consecutive pair of ids differs by exactly 1
        return np.all(np.diff(ids) == 1)

    user_ids = _distinct_sorted(0)
    movie_ids = _distinct_sorted(1)
    users_contiguous = _unit_steps(user_ids)
    movies_contiguous = _unit_steps(movie_ids)

    print('Users are contiguous: ', users_contiguous)
    print('Movies are contiguous: ', movies_contiguous)
    # report how many distinct ids each column holds
    print('Number of users: ', len(user_ids))
    print('Number of movies: ', len(movie_ids))
    return users_contiguous, movies_contiguous



def check_valid_index(df):
    """Return a truthy value when both id columns are contiguous and 0-based.

    Column 0 (users) and column 1 (movies) are first checked for
    contiguity via ``check_contiguous``; then the minimum of each column
    is compared against 0. The intermediate results are printed.
    """
    contiguity = check_contiguous(df)
    # smallest id in the user column and in the movie column, in that order
    smallest_ids = [np.min(df[:, col].to_numpy()[:, 0]) for col in (0, 1)]
    users_zero_based, movies_zero_based = (low == 0 for low in smallest_ids)
    print('Users start at 0: ', users_zero_based)
    print('Movies start at 0: ', movies_zero_based)
    users_ok, movies_ok = contiguity
    return users_ok and movies_ok and users_zero_based and movies_zero_based

# reindex the movies and save the mapping
def reindex_movies(df):
    """Replace the movie ids in column 1 with contiguous 0-based indices.

    The distinct movie ids are sorted in ascending order and the i-th
    smallest id is mapped to ``i``.

    Parameters
    ----------
    df : frame-like
        Object whose column 1 holds the movie ids; it must support
        ``df[:, 1].to_numpy()`` and column assignment ``df[:, 1] = array``.
        It is modified in place.

    Returns
    -------
    tuple
        ``(df, movie_map)`` where ``df`` is the same object that was passed
        in and ``movie_map`` maps each original movie id to its new index,
        with keys in ascending id order.
    """
    old_ids = df[:, 1].to_numpy()[:, 0]
    # np.unique returns the sorted distinct ids and, for every row, the
    # position of its id in that sorted array -- which is exactly the new
    # contiguous index, so no per-row Python loop is needed.
    movies, new_index = np.unique(old_ids, return_inverse=True)
    print('Number of movies: ', len(movies))
    # create a mapping from old to new ids
    movie_map = {old_id: new_id for new_id, old_id in enumerate(movies)}
    print('Number of movies after reindexing: ', len(movie_map))
    print('Reindexing movies...')
    # keep the index one-dimensional and of the platform integer type
    new_index = new_index.reshape(-1).astype(int, copy=False)
    print('Number of reindexed movies: ', len(np.unique(new_index)))
    df[:, 1] = new_index
    return df, movie_map

def check_movie_map(movie_map):
    """Check that ``movie_map`` assigns the new ids 0, 1, 2, ... in key order.

    The i-th key (in the dict's insertion order) must map to ``i``. This is
    the property needed when the keys are used positionally, e.g. as labels
    for columns indexed by the mapped values.

    Parameters
    ----------
    movie_map : dict
        Mapping from original movie id to new index.

    Raises
    ------
    AssertionError
        If the values are not exactly ``0 .. len(movie_map) - 1`` in
        insertion order. An empty mapping passes.
    """
    # The dict view must be materialised first: np.array(dict_values) builds
    # a 0-d object array, on which any element-wise check is vacuous.
    new_ids = np.asarray(list(movie_map.values()))
    expected = np.arange(len(movie_map))
    # Raise explicitly so the check is not stripped when Python runs with -O.
    if not np.array_equal(new_ids, expected):
        raise AssertionError(
            'movie_map values must be 0..n-1 in the order of its keys'
        )


In [None]:
# Read the ratings table using datatable.
# Logical columns: user id | item id | rating | timestamp.
# NOTE(review): the file is split on a single ':' and only every other
# column is kept below -- presumably the fields are separated by '::', which
# leaves empty columns in between; confirm against the data file.
df = dt.fread('../../data/movie_lens/ml-1m/ratings.dat', sep=':')
# Drop the timestamp column (and the columns in between):
# only keep C0 (user id), C2 (item id) and C4 (rating).
df = df[:, [0, 2, 4]]

# Shift the user ids down by one so that they start from 0.
df[:, 0] = df[:, 0].to_numpy() - 1
# NOTE(review): this is not the last expression of the cell, so a notebook
# would not display its result.
df.head()

# Get the number of unique users and items as plain scalars:
# the [0][0] indexing pulls the single value out of the 2-D array.
n_users = df[:, 0].nunique().to_numpy()[0][0]
n_items = df[:, 1].nunique().to_numpy()[0][0]
print(n_users, n_items)
# Original note: 6,040 users by 3,900 movies; the ratings are said to
# contain about 200 fewer distinct movies than that.

In [None]:
# Report whether the user and movie ids are contiguous and start at 0
# (the result is printed by the helper; nothing is enforced here).
check_valid_index(df)

In [None]:
# Replace the movie ids with contiguous indices and keep the mapping from
# original movie id to new index.
df, movie_map = reindex_movies(df)

In [None]:
# After reindexing, both id columns must be contiguous and start at 0;
# stop here if they are not.
assert check_valid_index(df), 'Indexing is not valid!'

In [None]:
# Show (min user id, max user id, min movie id, max movie id).
tuple(
    extreme(df[:, col].to_numpy()[:, 0])
    for col in (0, 1)
    for extreme in (np.min, np.max)
)

In [None]:
# Build the users x items rating matrix in CSR form: column 2 supplies the
# values, columns 0 and 1 supply the row and column indices.
smat = sp.csr_matrix(
    (
        df[:, 2].to_numpy().flatten(),
        tuple(df[:, index_col].to_numpy().flatten() for index_col in (0, 1)),
    ),
    shape=(n_users, n_items),
)

In [None]:
# Sanity check: the matrix must have one row per user and one column per
# item.
assert smat.shape[0] == n_users, 'Number of users does not match!'
assert smat.shape[1] == n_items, 'Number of items does not match!'

In [None]:
# Create an AnnData object with users as observations and movies as
# variables.
adata = anndata.AnnData(X=smat)
# Keep a copy of the matrix in a separate 'counts' layer.
adata.layers['counts'] = adata.X.copy()
# Label the rows 1..n_users.
adata.obs_names = np.arange(n_users) + 1
# Validate the id mapping before its keys are used as column labels.
check_movie_map(movie_map)
# Label the columns with the original movie ids, in the key order of the map.
adata.var_names = np.array(list(movie_map.keys()))
# NOTE(review): not the last expression of the cell, so a notebook would not
# display it.
adata
# Randomly assign each user one of the labels 'A', 'B' or 'C'.
# NOTE(review): no random seed is set in the visible code, so the labels
# differ from run to run.
adata.obs['labels'] = np.random.choice(['A', 'B', 'C'], size=n_users)

In [None]:
# Save the AnnData object to disk in h5ad format.
adata.write('../../data/movie_lens/ml-1m.h5ad')

In [None]:
# load and add the movie metadata

In [None]:
# Reload the AnnData object written earlier.
adata = anndata.read_h5ad('../../data/movie_lens/ml-1m.h5ad')
# Read the movie metadata table.
# NOTE(review): this path starts with 'path/to/data' while every other path
# in this notebook starts with '../../data' -- looks like a placeholder;
# confirm before running.
# NOTE(review): sep='[' differs from the ':' used for the ratings file;
# verify that it yields the C0 (movie id) and C1 (movie name) columns that
# the next cell reads.
mdf = dt.fread('path/to/data/movie_lens/ml-1m/movies.dat', sep='[')
mdf.head()

In [None]:
# Create a dictionary from the first two columns
# Collect the first two metadata columns as flat arrays, keyed by the
# column names used in the pandas frame.
some_dict = {
    target_name: mdf[source_name].to_numpy().flatten()
    for target_name, source_name in (('movie_id', 'C0'), ('movie_name', 'C1'))
}
# Build the pandas frame and index it by movie id.
mdf_pd = pd.DataFrame(some_dict).set_index('movie_id')
mdf_pd

In [None]:
# convert to string
# Convert the movie ids to strings so that they can be matched against the
# index of adata.var.
mdf_pd.index = mdf_pd.index.astype(str)
# Attach the movie metadata to adata.var (index-aligned join, computed once;
# rows without metadata are kept).
adata.var = adata.var.join(mdf_pd)
# Overwrite the saved file with the annotated object.
adata.write('../../data/movie_lens/ml-1m.h5ad')

In [None]:
# create the train/validation split

#!./driver.py setup_data -i ../data/movie_lens/ml-1m.h5ad -o ../data/movie_lens/processed -p .2 -f True -l 1