## Load, Filter, Split and Save datasets.
    This file loads the datasets from the specified path, splits the dataset manually to create 5 folds of a dataset and saves these 5 folds on the specified path on disk.
    In addition to creating folds, this notebook also filters the data based on users and items such that each user have rated atleast 5 items and each item have ratings from atleast 5 users.

In [1]:
import pandas as pd
import numpy as np
import os,sys

#### Load user reviews data for movies, games, tv and music and create a user-item matrix

In [3]:
data = pd.read_csv('data//user_reviews_by_movies.csv',delimiter='|',engine='python')

In [4]:
data.shape

(172187, 7)

In [65]:
movies = pd.DataFrame(data = data,columns=['user_id','token_name','user_rating'])
movies.shape

(172187, 3)

In [66]:
movies.columns = ['user_id','item_id','u_rating']
movies.head()

Unnamed: 0,user_id,item_id,u_rating
0,ilmi_,citizen-kane,10.0
1,GAMERGUYCASUAL,citizen-kane,6.0
2,Vladthepoker,citizen-kane,10.0
3,JoseAngel47,citizen-kane,0.0
4,Jalumbi,citizen-kane,9.0


In [105]:
games = pd.read_csv('data//user_reviews_by_games.csv',delimiter='|',encoding = 'utf8',engine='python')
games.shape

(897126, 7)

In [108]:
games_ui = games[['user_id','token_name','user_rating']]
games_ui.columns = ['user_id','item_id','u_rating']

In [109]:
games_ui.head()

Unnamed: 0,user_id,item_id,u_rating
0,SirCaestus,nintendo-64/the-legend-of-zelda-ocarina-of-time,10.0
1,Kaistlin,nintendo-64/the-legend-of-zelda-ocarina-of-time,10.0
2,Jacody,nintendo-64/the-legend-of-zelda-ocarina-of-time,10.0
3,doodlerman,nintendo-64/the-legend-of-zelda-ocarina-of-time,10.0
4,StevenA,nintendo-64/the-legend-of-zelda-ocarina-of-time,10.0


In [101]:
tv = pd.read_csv('data//user_reviews_by_tv.csv',delimiter = '|',encoding='utf8',engine='python')

In [102]:
tv.shape

(73041, 7)

In [110]:
tv_ui = tv[['user_id','token_name','user_rating']]
tv_ui.columns = ['user_id','item_id','u_rating']

In [111]:
tv_ui.head()

Unnamed: 0,user_id,item_id,u_rating
0,MrNobody19884,rectify/season-4,10.0
1,moviebuffers,rectify/season-4,10.0
2,dan547,rectify/season-4,10.0
3,toph123,rectify/season-4,10.0
4,JorgeLestre,rectify/season-4,10.0


In [96]:
music = pd.read_csv('data//user_reviews_by_music.csv',delimiter = '|')

In [97]:
music.shape

(121421, 7)

In [112]:
music_ui = music[['user_id','token_name','user_rating']]
music_ui.columns = ['user_id','item_id','u_rating']

In [114]:
music_ui.head()

Unnamed: 0,user_id,item_id,u_rating
0,kingofjersey,ten-freedom-summers/wadada-leo-smith,10.0
1,BENNEB,ten-freedom-summers/wadada-leo-smith,10.0
2,ScumbagHo,ten-freedom-summers/wadada-leo-smith,8.0
3,roinujrino,ten-freedom-summers/wadada-leo-smith,10.0
4,AnthonyL,smile/brian-wilson,10.0


#### Filter the data for users and items such that each user have reviewed atleast 5 items and each item have reviews from atleast 5 users. 

In [2]:
def filter_user_item_matrix(dataframe):
    """
    Function to filter the user-item dataframe with 5-cores i.e. each user have reviewed atleast 5 items and each item have atleast 5 user reviews.
    Arguments:
        dataframe: function takes a dataframe with user_id, item_id and user_rating info
    Return:
        A filtered dataframe.
    """
    while True:
        if dataframe.user_id.value_counts()[-1:][0] >5:
            #print(True)
            if dataframe.item_id.value_counts()[-1:][0] >5:
                #print(True)
                break
            else:
                #print(False)
                index = dataframe.index[dataframe.groupby('item_id')['user_id'].transform('count') > 5]
                dataframe = dataframe.loc[index,]
        else:
            #print(False)
            index = dataframe.index[dataframe.groupby('user_id')['item_id'].transform('count') > 5]
            dataframe = dataframe.loc[index,]
    return(dataframe)

In [121]:
movies = filter_user_item_matrix(movies)

In [122]:
movies.shape

(102152, 3)

In [116]:
tv_ui = filter_user_item_matrix(tv_ui)

False
True
False
False
True
False
False
True
False
False
True
False
False
True
False
False
True
False
False
True
True


In [120]:
tv_ui.shape

(9398, 3)

In [123]:
games_ui = filter_user_item_matrix(games_ui)

In [125]:
games_ui.shape

(191552, 3)

In [126]:
music_ui = filter_user_item_matrix(music_ui)

In [127]:
music_ui.shape

(32625, 3)

#### Create Train-Test split using 5 folds.
The train-test split is created manually assigning numbers 1-5 to each line. Then the rows with number 1 are seperated to create a test set and all other rows are kept as training set. This manual folds method is used ensure that we have atleast one entry of every user in the test set.
Notes: The current method of assigning a number to each row is very inefficient. I need to find an effective method for this function.

In [1]:
def assign_fold_number(df,nfolds):
    """
    Assigns a number to each row of dataset based on nfolds. This function assists in custom_train_test_split function.
    Arguments:
        df: dataframe
        nfolds: number of folds. The numbers are assigned from 1-n
    Returns:
        dataframe with fold number assigned to each row.
    """
    num =1
    for index in df.index:
        if num <=nfolds:
            df.loc[index,'n'] = num
            num+=1
        else:
            num=1
            df.loc[index,'n'] = num
            num+=1
    return(df)

In [4]:
def save_dataset_folds(df,file_name,nfolds=5):
    """
    This function saves the dataset folds on disk as csv file as per the fold number assigned in specified column.
    Arguments:
        df: dataframe whose folds to be saved
        file_name: name of the file on disk. files will be names as file_name1.csv, filename2.csv etc..
        nfolds: number of folds assigned in the dataset. It should be equal to the number of folds created in supplied dataset. If the number provided is less than the folds in dataset, the remaining data will be lost.
                Default nfolds=5
    Output:
        Creates csv files equal to nfolds value. The files are stored in [data] folder.
    """
    for n in range(1,nfolds+1):
        d1 = df[df['n']==n][['user_id','item_id','u_rating']]
        d1.to_csv('data//{}{}.csv'.format(df_name,n),index=False,sep = '|')

##### movies dataset

In [205]:
df = assign_fold_number(movies,nfolds=5)

In [206]:
save_dataset_folds(df,'movies',5)

##### games folds dataset

In [194]:
df = assign_fold_number(games_ui,nfolds=5)

In [200]:
save_dataset_folds(df,'games',5)

##### tv folds datasets

In [201]:
df = assign_fold_number(tv_ui,nfolds=5)

In [202]:
save_dataset_folds(df,'tv',5)

##### music folds datasets

In [203]:
df = assign_fold_number(music_ui,nfolds=5)

In [204]:
save_dataset_folds(df,'music',5)