In [26]:
import numpy as np
import pandas as pd
import os
import random
import pickle
from collections import defaultdict

In [27]:
def load_pickle(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)


def save_pickle(data, filename):
    with open(filename, "wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
        
def check_k_core(dataframe, user_core, item_core):
    user_count = defaultdict(int)
    item_count = defaultdict(int)
    for user in dataframe['uid'].tolist():
        user_count[user] += 1
    for item in dataframe['iid'].tolist():
        item_count[item] += 1
    for user in user_count:
        if user_count[user] < user_core:
            return user_count, item_count, False
        
    for item in item_count:
        if item_count[item] < item_core:
            return user_count, item_count, False
        
    return user_count, item_count, True

def delete_users_from_df(dataframe, user_list):
    dataframe = dataframe.drop(dataframe[dataframe['uid'].isin(user_list)].index).reset_index(drop=True)
    return dataframe

def delete_items_from_df(dataframe, item_list):
    dataframe = dataframe.drop(dataframe[dataframe['iid'].isin(item_list)].index).reset_index(drop=True)
    return dataframe

def filter_k_core(dataframe, user_core, item_core):
    user_count, item_count, isKcore = check_k_core(dataframe, user_core, item_core)
    while not isKcore:
        delete_user_list = [user for user in user_count if user_count[user] < user_core]
        delete_item_list = [item for item in item_count if item_count[item] < item_core]
        dataframe = delete_users_from_df(dataframe, delete_user_list)
        dataframe = delete_items_from_df(dataframe, delete_item_list)
        
        user_count, item_count, isKcore = check_k_core(dataframe, user_core, item_core)
    return dataframe

In [30]:
def get_data_name(raw_data_name):
    if raw_data_name == 'ml-100k':
        return 'ML100K'
    elif raw_data_name == 'ml-1m':
        return 'ML1M'
    elif raw_data_name == 'ml-20m':
        return 'ML20M'
    else:
        raise NotImplementedError
        
def load_data(raw_data_folder, raw_data_name, user_core, item_core):
    if raw_data_name == 'ml-1m':
        raw_data_path = os.path.join(raw_data_folder, raw_data_name, 'ratings.dat')
        sep = "::"
        skipr = 0
    elif raw_data_name == 'ml-20m':
        raw_data_path = os.path.join(raw_data_folder, raw_data_name, 'ratings.csv')
        sep = ','
        skipr = 1
    elif raw_data_name == 'ml-100k':
        raw_data_path = os.path.join(raw_data_folder, raw_data_name, 'u.data')
        sep = "\t"
        skipr = 0
    else:
        raise NotImplementedError
    if not os.path.exists(raw_data_path):
        raise FileNotFoundError
    name_cols = ['uid', 'iid', 'rating', 'timestamp'] 
    print(f'load data from {raw_data_name} dataset')
    df = pd.read_csv(raw_data_path, names = name_cols, sep=sep, skiprows=skipr)
    print(f'apply user {user_core} core and item {item_core} core filters')
    df = filter_k_core(df, user_core, item_core)
    df = df.sort_values('timestamp')
    users = df['uid'].tolist()
    items = df['iid'].tolist()
    user_num = len(df['uid'].unique())
    item_num = len(df['iid'].unique())
    print(f'The {raw_data_name} dataset has {len(users)} data with {user_num} users and {item_num} items')
    assert len(users) == len(items)
    user_sequence = defaultdict(list)
    for i in range(len(users)):
        user = str(users[i])
        item = str(items[i])
        user_sequence[user].append(item)
    return user_sequence

def write_sequence_into_file(data_path, raw_data_name, user_sequence):
    data_name = get_data_name(raw_data_name)
    data_folder = os.path.join(data_path,data_name)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    data_file = os.path.join(data_folder, 'user_sequence.txt')
    with open(data_file, 'w') as out:
        for user, items in user_sequence.items():
            out.write(user + ' ' + ' '.join(items) + '\n')

In [31]:
raw_data_folder = '../raw_data/MovieLens/'
raw_data_name = 'ml-1m'
data_path = '../data/'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from ml-1m dataset


  df = pd.read_csv(raw_data_path, names = name_cols, sep=sep, skiprows=skipr)


apply user 5 core and item 5 core filters
The ml-1m dataset has 999611 data with 6040 users and 3416 items


In [32]:
raw_data_folder = '../raw_data/MovieLens/'
raw_data_name = 'ml-100k'
data_path = '../data/'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from ml-100k dataset
apply user 5 core and item 5 core filters
The ml-100k dataset has 99287 data with 943 users and 1349 items


In [35]:
raw_data_folder = '../raw_data/MovieLens/'
raw_data_name = 'ml-20m'
data_path = '../data/'
user_sequence = load_data(raw_data_folder, raw_data_name, 5, 5)
write_sequence_into_file(data_path, raw_data_name, user_sequence)

load data from ml-20m dataset
apply user 5 core and item 5 core filters
The ml-20m dataset has 19984024 data with 138493 users and 18345 items


In [34]:
raw_data_folder = '../raw_data/MovieLens/'
raw_data_name = 'ml-100k'
raw_data_path = os.path.join(raw_data_folder, raw_data_name, 'u.data')
name_cols = ['uid', 'iid', 'rating', 'timestamp'] 
df = pd.read_csv(raw_data_path, names = name_cols, skiprows=0, sep="\t")
df = filter_k_core(df, 5, 5)
len(df['uid'].tolist())

99287