In [1]:
import json
import gzip
import os
os.getcwd()
import pandas as pd
import datetime
import re 
import random 
import numpy as np

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from tqdm import tqdm
import heapq

from CrawlingMLMovieDescriptionFromTMDB import *

'''
------------------------------------------------------------------------------------
|Download the original dataset of MovieLens25M (ML) and AmazonMovie(AM): 
|    ML:  grouplens.org/datasets/movielens/25m/ 
|    AM:  jmcauley.ucsd.edu/data/amazon/
-------------------------------------------------------------------------------------
'''


AM_data_path = " "  # the AmazonMovie document path
ML_data_path = " "  # the MovieLens25M document path

save_file = "./Datasets/"

df_AmazonMovie_sub = pd.read_csv(AM_data_file + "AmazonMovieTitle.csv")
df_AmazonMovieRating = pd.read_csv(AM_data_file + "AmazonMoiveRating.csv")


rating_file_path = "ratings.csv"
df_ML = pd.read_csv(ML_data_path + "ratings.csv")
df_movies = pd.read_csv(ML_data_path + "movies.csv")

'''
------------------------------------------------------------------------------------
|Run the following code to crawl the textual descriptions for movies in MovieLens dataset. 
|This code will generate a dictionary file (Dict_MovieId2description.npy), key: movie_id -> value: movie_description

GetMLMovieDescription(ML_data_path, save_path)
MergeMLMovieDescription(file)

-------------------------------------------------------------------------------------
'''
dict_id2descripton = np.load(save_file + "Dict_MovieId2description.npy", allow_pickle=True).item()


In [2]:
def FilterDataframeByList(target_data, col, col_name):
    df_col = pd.DataFrame(col)
    df_col.columns = [col_name]
    output = pd.merge(target_data, df_col, how='inner', left_on=[col_name], right_on=[col_name])
    return output

def ToList(x):
    return list(x)

def Amazon_Data_Pareprocess(df_AmazonMovie_sub, df_AmazonMovieRating, MinNumOfRatingPerUser, MinNumOfRatingPerItem, MaxNumOfRatingPerUser, MaxNumOfRatingPerItem, start_time):
    
    df_AmazonMovieRating = df_AmazonMovieRating[["rating","account_id", "deal_id", "UnixTime"]]
    df_AmazonMovieRating["time"] = pd.to_datetime(df_AmazonMovieRating["UnixTime"], unit="s")
    df_AmazonMovieRating["time"] = df_AmazonMovieRating.time.dt.date
    df_AmazonMovieRating = df_AmazonMovieRating.loc[df_AmazonMovieRating.rating ==5]

    min_time = df_AmazonMovieRating.time.min()
    max_time = df_AmazonMovieRating.time.max()
    print(f"Time Zone: ({min_time}～{max_time})")
    
    avaluable_items = list(set(df_AmazonMovieRating.deal_id.unique()) & set(df_AmazonMovie_sub.deal_id.unique()))
    df_T = FilterDataframeByList(df_AmazonMovieRating, avaluable_items, "deal_id")
    
    df_AmazonMovieRating_sub = df_T.loc[df_T.time > start_time]

    filted_item_1 = df_AmazonMovieRating_sub.deal_id.value_counts().keys()[:(df_AmazonMovieRating_sub.deal_id.value_counts().values >= MinNumOfRatingPerItem).sum()]
    filted_item_2 = df_AmazonMovieRating_sub.deal_id.value_counts().keys()[:(df_AmazonMovieRating_sub.deal_id.value_counts().values <= MaxNumOfRatingPerItem).sum()]
    item_list = set(filted_item_1) & set(filted_item_2)
    df_AmazonMovieRating_sub = FilterDataframeByList(df_AmazonMovieRating_sub, item_list, "deal_id")

    filted_user_1 = df_AmazonMovieRating_sub.account_id.value_counts().keys()[:(df_AmazonMovieRating_sub.account_id.value_counts().values >= MinNumOfRatingPerUser).sum()]
    filted_user_2 = df_AmazonMovieRating_sub.account_id.value_counts().keys()[:(df_AmazonMovieRating_sub.account_id.value_counts().values <= MaxNumOfRatingPerUser).sum()]
    user_list = set(filted_user_1) & set(filted_user_2)
    df_AmazonMovieRating_sub = FilterDataframeByList(df_AmazonMovieRating_sub, user_list, "account_id")
    
    n_users = len(df_AmazonMovieRating_sub.account_id.unique())
    n_items = len(df_AmazonMovieRating_sub.deal_id.unique())
    n_interaction = df_AmazonMovieRating_sub.shape[0]
    print(f'user number: {n_users}')
    print(f'item number: {n_items}')
    print(f'interaction number: {n_interaction}')
    #df_AmazonMovieRating_sub.to_csv(data_file + "Dataset_AmazonMovie.csv", )
    
    
    df_AmazonMovie_filted = FilterDataframeByList(df_AmazonMovie_sub, df_AmazonMovieRating_sub.deal_id.unique(), "deal_id")
    import re 
    dr = re.compile(r'<[^>]+>|\\t|\\n|\\\'|\'|\[|\]', re.S)
    df_AmazonMovie_filted.description = df_AmazonMovie_filted.description.map(lambda x: dr.sub("",x))
    #df_AmazonMovie_filted.to_csv(data_file + "Dataset_MovieTitles.csv")
    
    return df_AmazonMovieRating_sub, df_AmazonMovie_filted
    
def ML_Data_Preprocess(is_target, df_ML, df_movies, dict_id2descripton):
    
    df_ML["time"] = pd.to_datetime(df_ML["timestamp"], unit="s")
    df_ML["time"] = df_ML.time.dt.date
    df_ML = df_ML.loc[df_ML.rating >= 5]
    df_ML = FilterDataframeByList(df_ML, dict_id2descripton.keys(), 'movieId')
    df_ML_sub = df_ML.loc[df_ML.time > datetime.date(2016,9,30)]
    df_ML_sub = df_ML_sub.loc[df_ML_sub.time < datetime.date(2018,10,1)]
    
    if is_target:
        df_ML_sub = FilterSourceData(df_ML_sub, "userId", "movieId", alpha=20, beta=5)
    
    n_users = len(df_ML_sub.userId.unique())
    n_items = len(df_ML_sub.movieId.unique())
    n_interaction = df_ML_sub.shape[0]
    print(f'user number: {n_users}')
    print(f'item number: {n_items}')
    print(f'interaction number: {n_interaction}')
    #df_ML_sub.to_csv(data_file + "./Dataset_MovieLens25M.csv")
    
    textual_file_path = "/ml-25m/movies.csv"
    df_movies = pd.read_csv(data_path + textual_file_path)
    df_movies = FilterDataframeByList(df_movies, df_ML_sub.movieId.unique(), 'movieId')
    #df_movies.to_csv(data_file + "Dataset_MLMovieTitles.csv")
    
    return df_ML_sub, df_movies

def FilterSourceData(df_ss, account_id, deal_id, alpha=20, beta=5):
    # alpha: filter out users who have num of interactions less than alpha. 
    # beta: the interactions have top beta tf-idf scores will be picked.
    
    dict_interactions_s = dict(df_ss.groupby(df_ss[account_id])[deal_id].apply(ToList))
    df_ss["deal_id_str"] = df_ss[deal_id].map(lambda x: str(x))
    dict_interactions_str = dict(df_ss.groupby(df_ss[account_id])["deal_id_str"].apply(ToList))
    
    data = np.array(list(dict_interactions_str.values()))
    dictionary = Dictionary(data)
    corpus = list(map(dictionary.doc2bow,data))
    model = TfidfModel(corpus)
    corpus_tfidf = model[corpus]
    
    tfidf_weight = []
    for i in tqdm(range(len(data))):
        tfidf_vec = []
        dict_id_2_tfidf = dict(zip([x[0] for x in corpus_tfidf[i]], [x[1] for x in corpus_tfidf[i]]))
        for token in dictionary.doc2idx(data[i]):
            try:
                tfidf_vec.append(dict_id_2_tfidf[token])
            except:
                tfidf_vec.append(0.)
        tfidf_weight.append(tfidf_vec)
        
    dict_int = {}
    item_lst = []
    user_lst = []
    user_list = list(dict_interactions_str.keys())
    for i in range(len(tfidf_weight)):
        a = np.array(tfidf_weight[i])
        if len(a)>=alpha:
            idx = heapq.nlargest(beta, range(len(a)), a.take)
            items = np.array(dict_interactions_s[user_list[i]])[idx]
        else:
            continue
            #items = np.array(dict_interactions_s[user_list[i]])
        dict_int[user_list[i]] = items
        item_lst.extend(items)
        user_lst.extend([user_list[i]] * len(items))    
    
    df_tmp = pd.DataFrame(user_lst)
    df_tmp.columns = ['userId']
    df_tmp['movieId'] = item_lst
    return df_tmp

In [9]:
Rating_AmzMasT, Side_AmzMasT = Amazon_Data_Preprocess(df_AmazonMovie_sub, df_AmazonMovieRating, 3, 5, 5, 15, datetime.date(2017,6,30))
Rating_AmzMasT.to_csv(save_file + "Rating_AmzMasT.csv")
Side_AmzMasT.to_csv(save_file + "Side_AmzMasT.csv")

Time Zone: (1997-09-03～2018-10-03)
user number: 8566
item number: 6752
interaction number: 39696


In [30]:
Rating_AmzMasS, Side_AmzMasS = Amazon_Data_Preprocess(df_AmazonMovie_sub, df_AmazonMovieRating, 3, 10, 10, 15, datetime.date(2016,9,30))
Rating_AmzMasS.to_csv(save_file + "Rating_AmzMasS.csv")
Side_AmzMasS.to_csv(save_file + "Side_AmzMasS.csv")

Time Zone: (1997-09-03～2018-10-03)
user number: 22046
item number: 7814
interaction number: 104216


In [47]:
Rating_MLasS, Side_MLasS = ML_Data_Preprocess(0)
Rating_MLasS.to_csv(save_file + "Rating_MLasS.csv")
Side_MLasS.to_csv(save_file + "Side_MLasS.csv")

user number: 18232
item number: 14435
interaction number: 421803


In [52]:
Rating_MLasT, Side_MLasT = ML_Data_Preprocess(1)
Rating_MLasT.to_csv(save_file + "Rating_MLasT.csv")
Side_MLasT.to_csv(save_file + "Side_MLasT.csv")

  
100%|██████████| 18232/18232 [00:03<00:00, 4758.73it/s]


user number: 6289
item number: 9873
interaction number: 31445
