In [1]:
import polars as pl
import os
import glob
import numpy as np
import seaborn as sns


## Read in Data

In [3]:
def get_data(url, file_name):
    os.system(f'curl -kLs {url} -o {file_name}')

get_data('https://files.grouplens.org/datasets/movielens/ml-25m-README.html', 'instructions.md')
get_data('https://files.grouplens.org/datasets/movielens/ml-25m.zip', 'dataset.zip')

In [2]:
def read_data(file_path):
    pl.read_csv(file_path)

In [3]:
def load_datasets(folder_path):
    files = glob.glob(f'{folder_path}/*.csv')
    list_of_files = [file.split('/')[-1].split('.')[0] for file in files]
    dict_of_dfs = {name: pl.read_csv(file) for name, file in zip(list_of_files, files)}
    return dict_of_dfs


In [4]:
dfs = load_datasets('ml-25m')

In [5]:
dfs.keys()

dict_keys(['genome-scores', 'genome-tags', 'links', 'movies', 'ratings', 'tags'])

In [6]:
[print(key, value) for key, value in dfs.items()]

genome-scores shape: (15_584_448, 3)
┌─────────┬───────┬───────────┐
│ movieId ┆ tagId ┆ relevance │
│ ---     ┆ ---   ┆ ---       │
│ i64     ┆ i64   ┆ f64       │
╞═════════╪═══════╪═══════════╡
│ 1       ┆ 1     ┆ 0.02875   │
│ 1       ┆ 2     ┆ 0.02375   │
│ 1       ┆ 3     ┆ 0.0625    │
│ 1       ┆ 4     ┆ 0.07575   │
│ …       ┆ …     ┆ …         │
│ 206499  ┆ 1125  ┆ 0.0485    │
│ 206499  ┆ 1126  ┆ 0.01325   │
│ 206499  ┆ 1127  ┆ 0.14025   │
│ 206499  ┆ 1128  ┆ 0.0335    │
└─────────┴───────┴───────────┘
genome-tags shape: (1_128, 2)
┌───────┬──────────────┐
│ tagId ┆ tag          │
│ ---   ┆ ---          │
│ i64   ┆ str          │
╞═══════╪══════════════╡
│ 1     ┆ 007          │
│ 2     ┆ 007 (series) │
│ 3     ┆ 18th century │
│ 4     ┆ 1920s        │
│ …     ┆ …            │
│ 1125  ┆ wuxia        │
│ 1126  ┆ wwii         │
│ 1127  ┆ zombie       │
│ 1128  ┆ zombies      │
└───────┴──────────────┘
links shape: (62_423, 3)
┌─────────┬─────────┬────────┐
│ movieId ┆ imdbId  ┆ 

[None, None, None, None, None, None]

## Understand the Data

In [7]:
[print(f'\n{key} \n {value.to_pandas().info()}\n') for key, value in dfs.items()]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15584448 entries, 0 to 15584447
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 356.7 MB

genome-scores 
 None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB

genome-tags 
 None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  62423 non-null  int64  
 1   imdbId   62423 non-null  int64  
 2   tmdbId   62316 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB

links 
 None

<class

[None, None, None, None, None, None]

In [8]:
def get_unique(df):
    for col in df.columns:
        print(col + ':', '\n')
        print(f'{(np.count_nonzero(df[col].unique())/df[col].shape[0])*100:.2f}% unique values.', '\n')

In [9]:
for table, df in dfs.items():
    print('*'*100)
    print(table)
    print('*'*100)
    get_unique(df)

****************************************************************************************************
genome-scores
****************************************************************************************************
movieId: 

0.09% unique values. 

tagId: 

0.01% unique values. 

relevance: 

0.03% unique values. 

****************************************************************************************************
genome-tags
****************************************************************************************************
tagId: 

100.00% unique values. 

tag: 

100.00% unique values. 

****************************************************************************************************
links
****************************************************************************************************
movieId: 

100.00% unique values. 

imdbId: 

100.00% unique values. 

tmdbId: 

99.77% unique values. 

*****************************************************************************************

Movies Table

In [12]:
movies = dfs['movies']
movies

movieId,title,genres
i64,str,str
1,"""Toy Story (199…","""Adventure|Anim…"
2,"""Jumanji (1995)…","""Adventure|Chil…"
3,"""Grumpier Old M…","""Comedy|Romance…"
4,"""Waiting to Exh…","""Comedy|Drama|R…"
5,"""Father of the …","""Comedy"""
6,"""Heat (1995)""","""Action|Crime|T…"
7,"""Sabrina (1995)…","""Comedy|Romance…"
8,"""Tom and Huck (…","""Adventure|Chil…"
9,"""Sudden Death (…","""Action"""
10,"""GoldenEye (199…","""Action|Adventu…"


In [None]:
# get a unique list of all genres
