In [1]:
import numpy as np
import pandas as pd 

### Exploring MovieLens Dataset

In [2]:
ml_small = pd.read_csv("../datasets/ml-latest-small/ratings.csv")
ml_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
ml_movies = pd.read_csv("../datasets/ml-latest-small/movies.csv")
ml_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
kg_movies = pd.read_csv("../knowledge-graphs/props_wikidata_movielens_small.csv")
kg_movies.head()

Unnamed: 0,movieId,title,prop,obj,imdbId
0,199,The Umbrellas of Cherbourg,director,Jacques Demy,tt0058450
1,199,The Umbrellas of Cherbourg,screenwriter,Jacques Demy,tt0058450
2,199,The Umbrellas of Cherbourg,composer,Michel Legrand,tt0058450
3,199,The Umbrellas of Cherbourg,genre,drama,tt0058450
4,199,The Umbrellas of Cherbourg,genre,musical film,tt0058450


In [5]:
# Number of distinct movies that appear in at least one rating interaction.
movies_id = ml_small["movieId"].unique()
print(len(movies_id))

9724


In [6]:
# Number of distinct movies listed in the item file (movies.csv).
all_movies_id = ml_movies["movieId"].unique()
print(len(all_movies_id))

9742


In [7]:
# knowledge graph movies 
kg_movies_id = kg_movies["movieId"].unique()
print(len(kg_movies_id))

9535


In [8]:
# Movies that are present both in the MovieLens item file and in the knowledge graph.
intersection = set(all_movies_id).intersection(set(kg_movies_id))
# A set has no defined iteration order, so sort before slicing: the preview is
# then deterministic instead of depending on hash/insertion details.
sorted(intersection)[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [9]:
print(len(intersection))

9535


In [10]:
# checking if there are movies in the kg that are not in the set of all movies
list(set(kg_movies_id).difference(set(all_movies_id)))

[]

In [11]:
# Reverse check: count the movies from the item file that have no entry in the KG.
len(set(all_movies_id) - set(kg_movies_id))

207

In [12]:
# Sanity check: the union of both id sets should equal the total number of movies.
union = set(all_movies_id) | set(kg_movies_id)
len(union)

9742

In [13]:
# Fraction of all movies that are covered by the knowledge graph.
len(intersection) / len(union)

0.9787517963457195

In [14]:
ml_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Keeping in the dataset only movies contained in the KG

In [15]:
# Keep only the ratings whose movie also exists in the knowledge graph.
processed = ml_small[ml_small["movieId"].isin(intersection)]
processed.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [16]:
processed.shape

(100521, 4)

In [17]:
ml_small.shape

(100836, 4)

In [18]:
# We were able to keep ~99.7% of the interactions even though the KG covers only ~97.9% of the movies.
processed.shape[0]/ml_small.shape[0]

0.996876115672974

In [19]:
# saving new dataset
processed.to_csv("../datasets/ml-latest-small/ratings_processed.csv", header=True, index=False)

In [20]:
# reading new dataset 
r_processed = pd.read_csv("../datasets/ml-latest-small/ratings_processed.csv")
print(r_processed.shape[0]/ml_small.shape[0])

0.996876115672974


In [21]:
r_processed.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [22]:
r_processed.shape

(100521, 4)

### Splitting Data with a KFold Temporal User Approach
In this method we first sort each user's interactions by timestamp and then split them into training, validation and testing sets. Because we are also doing a K-Fold, the interactions are divided into roughly 6 parts. In the first fold, the first part is used for training and the second part is divided into validation and testing. The second fold has three parts: the two from the first fold plus the next one; here the first two parts are used for training and the third for validation and testing. The third fold has four parts: the first three (those of the previous fold) are used for training and the last one is divided into validation and testing, and so on...

In [23]:
users = r_processed["userId"].unique()
users[:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [24]:
from sklearn.model_selection import TimeSeriesSplit

# Number of temporal folds to generate for every user.
n_splits = 5

# One (initially empty) frame per fold for each of the three partitions.
train_folds, validation_folds, test_folds = (
    [pd.DataFrame() for _ in range(n_splits)] for _ in range(3)
)

# Splitter applied to each user's chronologically sorted interactions.
timesplit = TimeSeriesSplit(n_splits=n_splits)

In [25]:
from tqdm import tqdm

# Collect every user's slices in plain lists and concatenate once per fold after
# the loop. Growing a DataFrame with pd.concat on each iteration copies all
# previously accumulated rows every time (quadratic work) and relies on
# concatenating with an empty frame, whose dtype handling is deprecated.
fold_count = timesplit.get_n_splits()
train_parts = [[] for _ in range(fold_count)]
validation_parts = [[] for _ in range(fold_count)]
test_parts = [[] for _ in range(fold_count)]

for user in tqdm(users):
    # This user's interactions, oldest first.
    user_ds = r_processed[r_processed["userId"] == user].sort_values("timestamp")
    # NOTE: TimeSeriesSplit raises ValueError when a user has too few
    # interactions for the requested number of splits.
    for i, (train_index, test_index) in enumerate(timesplit.split(user_ds)):
        train_parts[i].append(user_ds.iloc[train_index])

        # The held-out chunk is cut in two: the older half is used for
        # validation, the newer half for testing.
        held_out = user_ds.iloc[test_index]
        half = len(held_out) // 2
        validation_parts[i].append(held_out.iloc[:half])
        test_parts[i].append(held_out.iloc[half:])

# Rebuild the fold lists from scratch so that re-running this cell does not
# append the same rows a second time.
train_folds = [pd.concat(parts) if parts else pd.DataFrame() for parts in train_parts]
validation_folds = [pd.concat(parts) if parts else pd.DataFrame() for parts in validation_parts]
test_folds = [pd.concat(parts) if parts else pd.DataFrame() for parts in test_parts]

100%|██████████| 610/610 [00:07<00:00, 83.88it/s]


In [26]:
# Row counts of the three partitions of the last fold (index 4).
train_size, val_size, test_size = (
    folds[4].shape[0] for folds in (train_folds, validation_folds, test_folds)
)
sum_size = train_size + val_size + test_size

# Report the partition sizes next to the size of the full dataset.
print(f"Train size: {train_size}")
print(f"Validation size: {val_size}")
print(f"Test size: {test_size}")
print(f"Sum size: {sum_size}")
print(f"Full Dataframe Size: {r_processed.shape[0]}")

Train size: 84032
Validation size: 8078
Test size: 8411
Sum size: 100521
Full Dataframe Size: 100521


In [27]:
sum_size == r_processed.shape[0]


True

In [28]:
print(train_size / sum_size)
print(val_size / sum_size)
print(test_size / sum_size)

0.8359646243073586
0.08036131753563931
0.08367405815700202


In [29]:
r_processed.sort_index()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100516,610,166534,4.0,1493848402
100517,610,168248,5.0,1493850091
100518,610,168250,5.0,1494273047
100519,610,168252,5.0,1493846352


In [30]:
pd.concat([train_folds[4], validation_folds[4], test_folds[4]]).sort_index()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100516,610,166534,4.0,1493848402
100517,610,168248,5.0,1493850091
100518,610,168250,5.0,1494273047
100519,610,168252,5.0,1493846352


In [31]:
# Recombine the three partitions of the last fold and compare them cell-by-cell
# with the full dataset (both sorted by index so that rows line up).
diff = r_processed.sort_index().compare(
    pd.concat([train_folds[4], validation_folds[4], test_folds[4]]).sort_index()
)
# Make the check explicit instead of relying on eyeballing an empty table.
assert diff.empty, "last fold does not reproduce the full dataset"
diff.head()

In [32]:
def split_percentage(i: int, full_df: pd.DataFrame) -> None:
    """Print absolute and relative partition sizes for fold ``i``.

    The partition frames are read from the module-level ``train_folds``,
    ``validation_folds`` and ``test_folds`` lists.

    Args:
        i: Index of the fold to report on.
        full_df: Complete dataset; its row count is the reference for the
            "Dataset used" ratio.

    Returns:
        None. The report is written to standard output.
    """
    n_train, n_val, n_test = (
        folds[i].shape[0] for folds in (train_folds, validation_folds, test_folds)
    )
    fold_rows = n_train + n_val + n_test
    dataset_rows = full_df.shape[0]

    # (label, value) pairs of the raw-count section, in print order.
    raw_rows = (
        ("Train size:\t\t", n_train),
        ("Validation size:\t", n_val),
        ("Test size:\t\t", n_test),
        ("Total fold size:\t", fold_rows),
        ("Full Dataset size:\t", dataset_rows),
    )
    # (label, numerator, denominator) triples of the percentage section.
    ratio_rows = (
        ("Training percentage:\t", n_train, fold_rows),
        ("Validation percentage:\t", n_val, fold_rows),
        ("Test percentage:\t", n_test, fold_rows),
        ("Dataset used:\t\t", fold_rows, dataset_rows),
    )

    print("--- Dataset Raw Stats ---")
    for label, count in raw_rows:
        print(f"{label} {count}")
    print()

    print("--- Dataset Percentage Stats ---")
    for label, numerator, denominator in ratio_rows:
        # Each ratio is computed right before it is printed.
        print(f"{label} {numerator / denominator}")

In [33]:
split_percentage(0, r_processed)

--- Dataset Raw Stats ---
Train size:		 18076
Validation size:	 8078
Test size:		 8411
Total fold size:	 34565
Full Dataset size:	 100521

--- Dataset Percentage Stats ---
Training percentage:	 0.5229567481556487
Validation percentage:	 0.23370461449443078
Test percentage:	 0.24333863734992045
Dataset used:		 0.3438584972294346


In [34]:
split_percentage(1, r_processed)

--- Dataset Raw Stats ---
Train size:		 34565
Validation size:	 8078
Test size:		 8411
Total fold size:	 51054
Full Dataset size:	 100521

--- Dataset Percentage Stats ---
Training percentage:	 0.6770282446037529
Validation percentage:	 0.15822462490696126
Test percentage:	 0.16474713048928585
Dataset used:		 0.507893872922076


In [35]:
split_percentage(2, r_processed)

--- Dataset Raw Stats ---
Train size:		 51054
Validation size:	 8078
Test size:		 8411
Total fold size:	 67543
Full Dataset size:	 100521

--- Dataset Percentage Stats ---
Training percentage:	 0.7558740357994167
Validation percentage:	 0.1195978857912737
Test percentage:	 0.12452807840930963
Dataset used:		 0.6719292486147174


In [36]:
split_percentage(3, r_processed)

--- Dataset Raw Stats ---
Train size:		 67543
Validation size:	 8078
Test size:		 8411
Total fold size:	 84032
Full Dataset size:	 100521

--- Dataset Percentage Stats ---
Training percentage:	 0.8037771325209444
Validation percentage:	 0.09613004569687739
Test percentage:	 0.10009282178217822
Dataset used:		 0.8359646243073586


In [37]:
split_percentage(4, r_processed)

--- Dataset Raw Stats ---
Train size:		 84032
Validation size:	 8078
Test size:		 8411
Total fold size:	 100521
Full Dataset size:	 100521

--- Dataset Percentage Stats ---
Training percentage:	 0.8359646243073586
Validation percentage:	 0.08036131753563931
Test percentage:	 0.08367405815700202
Dataset used:		 1.0


### Saving each fold

In [38]:
import os

# Root directory that receives one sub-directory per fold.
folds_dir = "../datasets/ml-latest-small/folds/"

# makedirs(..., exist_ok=True) creates missing parent directories and does not
# fail when the directory already exists, so this cell can be re-run safely
# (os.mkdir raises FileExistsError on the second run).
os.makedirs(folds_dir, exist_ok=True)
for f in range(0, n_splits):
    fold_name = folds_dir + str(f)
    os.makedirs(fold_name, exist_ok=True)
    # Persist the three partitions of this fold without the index column.
    train_folds[f].to_csv(fold_name + "/train.csv", header=True, index=False)
    validation_folds[f].to_csv(fold_name + "/validation.csv", header=True, index=False)
    test_folds[f].to_csv(fold_name + "/test.csv", header=True, index=False)