# Getting Started with Case Recommender

## Import libraries

In [1]:
import pandas as pd

## Create and init files and vars

In [19]:
folder = "../dataset/ml-100k/"
dataset = "{}u.data".format(folder)

# Visualize file content
df = pd.read_csv(dataset, sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [89]:
# Obtaing movie information

movies_info = "{}u.item".format(folder)
mv_df = pd.read_csv(movies_info, sep='|', header=None)
mv_df = mv_df[[0,1]]
mv_df.columns = ['item_id', 'title']

mv_df.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


## Divide dataset with fold-cross-validation

In [8]:
# Make the import
from caserec.utils.split_database import SplitDatabase

In [11]:
# input_file -> Input file with at least 2 columns (users and items)
# dir_folds  -> Directory to write folds (train and test files)
# n_splits   -> How much folds the strategy will divide

SplitDatabase(input_file=dataset, dir_folds=folder, n_splits=5).kfoldcrossvalidation()  

[(array([    0,     1,     2, ..., 99997, 99998, 99999]),
  array([    3,     7,    14, ..., 99985, 99986, 99992])),
 (array([    0,     2,     3, ..., 99996, 99998, 99999]),
  array([    1,    10,    16, ..., 99990, 99995, 99997])),
 (array([    1,     2,     3, ..., 99995, 99997, 99998]),
  array([    0,     9,    18, ..., 99994, 99996, 99999])),
 (array([    0,     1,     3, ..., 99996, 99997, 99999]),
  array([    2,     8,    13, ..., 99979, 99987, 99998])),
 (array([    0,     1,     2, ..., 99997, 99998, 99999]),
  array([    4,     5,     6, ..., 99974, 99980, 99993]))]

#### Generated folders

![title](../img/folds.PNG)

#### Inside each folder

![title](../img/in_fold.PNG)

## Generating rankings with Case Recommender

In this example, we will use the train and test sets in folder 0

#### Files to process

In [14]:
folder_0 = "{}/folds/0/".format(folder)
train = "{}train.dat".format(folder_0)
test = "{}test.dat".format(folder_0)

# File to write the predicted ranking
ranking = "{}ranking.dat".format(folder_0)

In [13]:
# Make the import
from caserec.recommenders.item_recommendation.itemknn import ItemKNN

In [15]:
# train_file  -> File which contains the train set. 
# test_file   -> File which contains the test set.
# output_file -> File with dir to write the final predictions

ItemKNN(train_file=train, test_file=test, output_file=ranking).compute()

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 943 users and 1651 items (80000 interactions) | sparsity:: 94.86%
test data:: 940 users and 1421 items (20000 interactions) | sparsity:: 98.50%

training_time:: 3.778533 sec
prediction_time:: 43.226235 sec


Eval:: PREC@1: 0.47234 PREC@3: 0.414539 PREC@5: 0.383617 PREC@10: 0.32617 RECALL@1: 0.032983 RECALL@3: 0.081741 RECALL@5: 0.122883 RECALL@10: 0.205316 MAP@1: 0.47234 MAP@3: 0.57234 MAP@5: 0.56796 MAP@10: 0.533168 NDCG@1: 0.47234 NDCG@3: 0.659469 NDCG@5: 0.659881 NDCG@10: 0.648133 


#### Using value of K as 15, correlation as similarity measure and evaluation the results only with NDCG 

In [16]:
ItemKNN(train_file=train, test_file=test, output_file=ranking, k_neighbors=15, similarity_metric='correlation'
       ).compute(metrics=['NDCG'])

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 943 users and 1651 items (80000 interactions) | sparsity:: 94.86%
test data:: 940 users and 1421 items (20000 interactions) | sparsity:: 98.50%

training_time:: 4.330649 sec
prediction_time:: 36.896283 sec


Eval:: NDCG@1: 0.406383 NDCG@3: 0.606785 NDCG@5: 0.614104 NDCG@10: 0.609069 


### Visualization of the results

We will evaluate the ranking for user with id 438

#### Movies watched by user with id 438


In [65]:
# Marging information

train_df = pd.read_csv(train, sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'time'])
train_df = train_df.merge(mv_df)
train_df_user1 = train_df[train_df['user_id'] == 438]
train_df_user1.reset_index(inplace=True, drop=True)
train_df_user1

Unnamed: 0,user_id,item_id,rating,time,title
0,438,21,2,879868683,Muppet Treasure Island (1996)
1,438,50,5,879868005,Star Wars (1977)
2,438,100,4,879868024,Fargo (1996)
3,438,121,5,879868328,Independence Day (ID4) (1996)
4,438,148,5,879868443,"Ghost and the Darkness, The (1996)"
5,438,181,4,879868005,Return of the Jedi (1983)
6,438,220,4,879868328,"Mirror Has Two Faces, The (1996)"
7,438,252,4,879868364,"Lost World: Jurassic Park, The (1997)"
8,438,255,4,879868242,My Best Friend's Wedding (1997)
9,438,257,4,879868159,Men in Black (1997)


#### Movies recommended for user with id 438

In [66]:
ranking_df = pd.read_csv(ranking, sep='\t', header=None, names=['user_id', 'item_id', 'score'])
ranking_df = ranking_df.merge(mv_df)
ranking_df_user1 = ranking_df[ranking_df['user_id'] == 438]
ranking_df_user1.reset_index(inplace=True, drop=True)
ranking_df_user1

Unnamed: 0,user_id,item_id,score,title
0,438,591,1.503576,Primal Fear (1996)
1,438,685,2.034708,Executive Decision (1996)
2,438,405,1.413552,Mission: Impossible (1996)
3,438,25,1.889224,"Birdcage, The (1996)"
4,438,471,1.635527,Courage Under Fire (1996)
5,438,125,2.346654,Phenomenon (1996)
6,438,148,1.449504,"Ghost and the Darkness, The (1996)"
7,438,151,1.544856,Willy Wonka and the Chocolate Factory (1971)
8,438,274,2.446495,Sabrina (1995)
9,438,111,2.900306,"Truth About Cats & Dogs, The (1996)"


## Generating predictions with Case Recommender

In this example, we will use the train and test sets in folder 1

#### Files to process

In [79]:
folder_1 = "{}/folds/1/".format(folder)
train = "{}train.dat".format(folder_1)
test = "{}test.dat".format(folder_1)

# File to write the predicted ranking
predictions = "{}predictions.dat".format(folder_1)

In [80]:
# Make the import
from caserec.recommenders.rating_prediction.itemknn import ItemKNN

In [88]:
# train_file  -> File which contains the train set. 
# test_file   -> File which contains the test set.
# output_file -> File with dir to write the final predictions

ItemKNN(train_file=train, test_file=test, output_file=predictions).compute()

[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 943 users and 1657 items (80000 interactions) | sparsity:: 94.88%
test data:: 941 users and 1388 items (20000 interactions) | sparsity:: 98.47%

training_time:: 3.539765 sec
prediction_time:: 5.079334 sec
Eval:: MAE: 0.808991 RMSE: 1.049884 


### Visualization of the results

We will evaluate the ranking for user with id 8

#### Hidden movies rated by user with id 8

In [86]:
# Marging information

test_df = pd.read_csv(test, sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'time'])
test_df = test_df.merge(mv_df)

test_df_user8 = test_df[test_df['user_id'] == 8]
test_df_user8.reset_index(inplace=True, drop=True)
test_df_user8

Unnamed: 0,user_id,item_id,rating,time,title
0,8,227,4,879362423,Star Trek VI: The Undiscovered Country (1991)
1,8,127,5,879362123,"Godfather, The (1972)"
2,8,50,5,879362124,Star Wars (1977)
3,8,89,4,879362124,Blade Runner (1982)
4,8,11,3,879362233,Seven (Se7en) (1995)
5,8,82,5,879362356,Jurassic Park (1993)
6,8,172,5,879362123,"Empire Strikes Back, The (1980)"
7,8,174,5,879362183,Raiders of the Lost Ark (1981)
8,8,233,4,879362423,Under Siege (1992)
9,8,243,2,879361732,Jungle2Jungle (1997)


#### Rating predictions for the user 8 for the same movies

In [87]:
predictions_df = pd.read_csv(predictions, sep='\t', header=None, names=['user_id', 'item_id', 'score'])
predictions_df = predictions_df.merge(mv_df)

predictions_df_user8 = predictions_df[predictions_df['user_id'] == 8]
predictions_df_user8.reset_index(inplace=True, drop=True)
predictions_df_user8

Unnamed: 0,user_id,item_id,score,title
0,8,227,3.751772,Star Trek VI: The Undiscovered Country (1991)
1,8,127,4.590619,"Godfather, The (1972)"
2,8,50,4.824065,Star Wars (1977)
3,8,89,4.64299,Blade Runner (1982)
4,8,11,4.440896,Seven (Se7en) (1995)
5,8,82,4.230971,Jurassic Park (1993)
6,8,172,4.712517,"Empire Strikes Back, The (1980)"
7,8,174,4.754008,Raiders of the Lost Ark (1981)
8,8,233,3.740096,Under Siege (1992)
9,8,243,1.602553,Jungle2Jungle (1997)
