In [1]:
import sys
import os

# Manually set the project root relative to this notebook
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

import os
os.chdir('..')


In [2]:
import pandas as pd
from dataset_experiment.movielens100k import MovieLens100K
from dataset_experiment.lastfm import LastFM 

### MovieLens 100K dataset

In [3]:
ml_small = pd.read_csv("./datasets/ml-latest-small/ratings.csv")
ml_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print('''--- Dataset Raw Stats ---''')
print(f'''Number of users:\t {ml_small["userId"].nunique()}''')
print(f'''Number of items:\t {ml_small["movieId"].nunique()}''')
print(f'''Number of ratings:\t {ml_small.shape[0]}''')

--- Dataset Raw Stats ---
Number of users:	 610
Number of items:	 9724
Number of ratings:	 100836


In [5]:
p_ml_small = pd.read_csv("./datasets/ml-latest-small/ratings_processed.csv")
p_ml_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
print('''--- Processed Raw Stats ---''')
print(f'''Number of users:\t {p_ml_small["userId"].nunique()}''')
print(f'''Number of items:\t {p_ml_small["movieId"].nunique()}''')
print(f'''Number of ratings:\t {p_ml_small.shape[0]}''')

--- Processed Raw Stats ---
Number of users:	 610
Number of items:	 9517
Number of ratings:	 100521


In [7]:
print(f''' Percentage of Data Remained:\t {p_ml_small.shape[0] / ml_small.shape[0] * 100}''') 

 Percentage of Data Remained:	 99.6876115672974


In [8]:
kg_movies = pd.read_csv("./knowledge-graphs/props_wikidata_movielens_small.csv")
kg_movies

Unnamed: 0,movieId,title,prop,obj,imdbId
0,199,The Umbrellas of Cherbourg,director,Jacques Demy,tt0058450
1,199,The Umbrellas of Cherbourg,screenwriter,Jacques Demy,tt0058450
2,199,The Umbrellas of Cherbourg,composer,Michel Legrand,tt0058450
3,199,The Umbrellas of Cherbourg,genre,drama,tt0058450
4,199,The Umbrellas of Cherbourg,genre,musical film,tt0058450
...,...,...,...,...,...
295782,179053,2048: Nowhere to Run,cast member,Orion Ben,tt7387408
295783,179053,2048: Nowhere to Run,director of photography,Pierre Gill,tt7387408
295784,179053,2048: Nowhere to Run,country of origin,United States of America,tt7387408
295785,179053,2048: Nowhere to Run,narrative location,Los Angeles,tt7387408


In [9]:
print('''--- KG Stats ---''')
print(f'''Number of items:\t {kg_movies["movieId"].nunique()}''')
print(f'''Number of prop:\t\t {kg_movies["prop"].nunique()}''')
print(f'''Number of obj:\t\t {kg_movies["obj"].nunique()}''')
print(f'''N triplet:\t\t {kg_movies.shape[0]}''')

--- KG Stats ---
Number of items:	 9535
Number of prop:		 23
Number of obj:		 69487
N triplet:		 295787


In [10]:
print(f'''Percentage of items on KG:\t {kg_movies["movieId"].nunique()/ml_small["movieId"].nunique()}''')

Percentage of items on KG:	 0.9805635540929658


In [11]:
print(f'''Percentage of items on KG:\t {p_ml_small["movieId"].nunique()/kg_movies["movieId"].nunique()}''')

Percentage of items on KG:	 0.9981122181436812


In [12]:
print(f'''Percentage of items on KG:\t {p_ml_small["movieId"].nunique()} - {kg_movies["movieId"].nunique()}''')

Percentage of items on KG:	 9517 - 9535


In [13]:
ds = MovieLens100K(gen_dataset=False)
ds.load_fold(-1)

(<cornac.data.dataset.Dataset at 0x23b584c6fc0>,
 None,
 <cornac.data.dataset.Dataset at 0x23b591a69c0>)

In [14]:
ds.fold_percentage()

#### Fold -1 statistics ####
--- Training Raw Stats ---
Number of users:	 610
Number of items:	 8820
Number of ratings:	 80418
Average of rating:	 1.0

--- Test Raw Stats ---
Number of users:	 610
Number of items:	 5096
Number of ratings:	 20103
Average of rating:	 1.0

--- Dataset Percentage Stats ---
Training percentage:	 0.8000119378040409
Test percentage:		 0.19998806219595905
Full dataset used:		 1.0

--- Dataset Raw Stats ---
Total fold size:	 100521
Full Dataset size:	 100521



### LastFM Dataset

In [15]:
lastfm = pd.read_csv("./datasets/hetrec2011-lastfm-2k/user_artists.dat", sep='\t')
lastfm.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [16]:
print('''--- Dataset Raw Stats ---''')
print(f'''Number of users:\t {lastfm["userID"].nunique()}''')
print(f'''Number of items:\t {lastfm["artistID"].nunique()}''')
print(f'''Number of ratings:\t {lastfm.shape[0]}''')

--- Dataset Raw Stats ---
Number of users:	 1892
Number of items:	 17632
Number of ratings:	 92834


In [17]:
p_lastfm = pd.read_csv("./datasets/hetrec2011-lastfm-2k/ratings_processed.csv")
p_lastfm.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [18]:
print('''--- Processed Raw Stats ---''')
print(f'''Number of users:\t {p_lastfm["userID"].nunique()}''')
print(f'''Number of items:\t {p_lastfm["artistID"].nunique()}''')
print(f'''Number of ratings:\t {p_lastfm.shape[0]}''')

--- Processed Raw Stats ---
Number of users:	 1884
Number of items:	 11646
Number of ratings:	 83038


In [19]:
print(f''' Percentage of Data Remained:\t {p_lastfm.shape[0] / lastfm.shape[0] * 100}''') 

 Percentage of Data Remained:	 89.44783161341749


In [20]:
kg_artist = pd.read_csv("./knowledge-graphs/props_artists_id.csv")
kg_artist

Unnamed: 0,id,artist,prop,obj,wiki_id,name
0,1,Malice Mizer,work period (start),1992-01-01T00:00:00Z,Q853545,MALICE MIZER
1,1,Malice Mizer,has part,Mana,Q853545,MALICE MIZER
2,1,Malice Mizer,country of origin,Japan,Q853545,MALICE MIZER
3,1,Malice Mizer,record label,Nippon Columbia,Q853545,MALICE MIZER
4,1,Malice Mizer,record label,Columbia Records,Q853545,MALICE MIZER
...,...,...,...,...,...,...
134192,18517,V Factory,has part,Jared Murillo,Q3305177,V Factory
134193,18517,V Factory,country of origin,United States of America,Q3305177,V Factory
134194,18517,V Factory,record label,Warner Bros. Records,Q3305177,V Factory
134195,18517,V Factory,genre,rhythm and blues,Q3305177,V Factory


In [21]:
print('''--- KG Stats ---''')
print(f'''Number of items:\t {kg_artist["id"].nunique()}''')
print(f'''Number of prop:\t\t {kg_artist["prop"].nunique()}''')
print(f'''Number of obj:\t\t {kg_artist["obj"].nunique()}''')
print(f'''N triplet:\t\t {kg_artist.shape[0]}''')

--- KG Stats ---
Number of items:	 11646
Number of prop:		 33
Number of obj:		 23001
N triplet:		 134197


In [22]:
print(f'''Percentage of items on KG:\t {kg_artist["id"].nunique()/lastfm["artistID"].nunique()}''')

Percentage of items on KG:	 0.6605036297640653


In [23]:
ds = LastFM(gen_dataset=False)
ds.load_fold(-1)

(<cornac.data.dataset.Dataset at 0x23b556d7170>,
 None,
 <cornac.data.dataset.Dataset at 0x23b583bf350>)

In [24]:
ds.fold_percentage()

#### Fold -1 statistics ####
--- Training Raw Stats ---
Number of users:	 1884
Number of items:	 10395
Number of ratings:	 66357
Average of rating:	 1.0

--- Test Raw Stats ---
Number of users:	 1880
Number of items:	 4899
Number of ratings:	 16681
Average of rating:	 1.0

--- Dataset Percentage Stats ---
Training percentage:	 0.7991160673426624
Test percentage:		 0.20088393265733762
Full dataset used:		 1.0

--- Dataset Raw Stats ---
Total fold size:	 83038
Full Dataset size:	 83038

