# Environment

In [1]:
# version: Python 3.8.5
# requirements.txt

# Imports and settings

In [2]:
import glob
import pandas as pd
import numpy as np
import pickle as pkl
import asyncio
from scipy.sparse import csr_matrix

path = './download/'

qf, _, _, mt, _, ts, pb = glob.glob(path+'*')
qf, mt, ts, pb

('./download/qualifying.txt',
 './download/movie_titles.txt',
 './download/training_set',
 './download/probe.txt')

# Data cleaning

In [3]:
def addMovieIDColumn(df):
    movieId = df[df.CustomerID.str.contains(':')].copy()
    lastRowIndex = df.shape[0]
    movieIdStart = movieId.index.to_list()
    movieIdEnd = movieIdStart[1:]+[lastRowIndex]

    movieIdRepeat = []
    for s, e in zip(movieIdStart,movieIdEnd):
        mid = int(movieId.loc[s,'CustomerID'].replace(':',''))
        movieIdRepeat += [mid]*(e-s)
    else:
        df['MovieID'] = pd.Series(movieIdRepeat)
        df_ = df[~df.CustomerID.str.contains(':')].copy()
        
    return df_, movieId

### qualifying.txt

In [4]:
dfqf = pd.read_csv(qf, header = None, names = ['CustomerID', 'Date'], usecols = [0,1])
dfqf_, movieId = addMovieIDColumn(dfqf)
dfqf_.reset_index(drop=True, inplace=True)
movieId.reset_index(drop=True, inplace=True)
pkl.dump(dfqf_, open(f'./pkl/qualifying_.pkl', 'wb')) # for prediction
pkl.dump(movieId, open(f'./pkl/qualifyingMovieId.pkl', 'wb')) # cache movieId info
dfqf_

Unnamed: 0,CustomerID,Date,MovieID
0,1046323,2005-12-19,1
1,1080030,2005-12-23,1
2,1830096,2005-03-14,1
3,368059,2005-05-26,1
4,802003,2005-11-07,1
...,...,...,...
2817126,1521720,2005-12-07,9998
2817127,1363704,2005-10-01,9998
2817128,2153561,2005-11-15,9999
2817129,1490137,2005-12-22,9999


### probe.txt

In [5]:
dfpb = pd.read_csv(pb, header = None, names = ['CustomerID'], usecols = [0])
dfpb_, movieId = addMovieIDColumn(dfpb)
dfpb_.reset_index(drop=True, inplace=True)
movieId.reset_index(drop=True, inplace=True)
pkl.dump(dfqf_, open(f'./pkl/probe_.pkl', 'wb')) # for prediction
pkl.dump(movieId, open(f'./pkl/probeMovieId.pkl', 'wb')) # cache movieId info
dfpb_

Unnamed: 0,CustomerID,MovieID
0,30878,1
1,2647871,1
2,1283744,1
3,2488120,1
4,317050,1
...,...,...
1408390,2328701,9997
1408391,1288730,9998
1408392,2536567,9998
1408393,1107317,9998


### movie_titles.txt

In [6]:
dfmt = pd.read_csv(mt, header = None, 
                   names = ['MovieID', 'YearOfRelease', 'Title'], 
                   usecols = [0,1,2], 
                   encoding = "ISO-8859-1")
dfmt.Title = dfmt.Title.apply(lambda x : x.replace('\n',''))
dfmt[dfmt.YearOfRelease.isna()]

Unnamed: 0,MovieID,YearOfRelease,Title
4387,4388,,Ancient Civilizations: Rome and Pompeii
4793,4794,,Ancient Civilizations: Land of the Pharaohs
7240,7241,,Ancient Civilizations: Athens and Greece
10781,10782,,Roti Kapada Aur Makaan
15917,15918,,Hote Hote Pyaar Ho Gaya
16677,16678,,Jimmy Hollywood
17666,17667,,Eros Dance Dhamaka


In [7]:
_ = dfmt[dfmt.YearOfRelease.isna()].index
dfmt.loc[_, 'YearOfRelease'] = (2001, 2001, 2001, 1974, 1999, 1994, 1999) # hand fix
dfmt.YearOfRelease.astype('int', copy=True).astype('str')
dfmt.YearOfRelease = dfmt.YearOfRelease.apply(lambda x: pd.to_datetime(x, format='%Y', errors='ignore'))
pkl.dump(dfmt, open(f'./pkl/movie_titles.pkl', 'wb'))
dfmt

Unnamed: 0,MovieID,YearOfRelease,Title
0,1,2003-01-01,Dinosaur Planet
1,2,2004-01-01,Isle of Man TT 2004 Review
2,3,1997-01-01,Character
3,4,1994-01-01,Paula Abdul's Get Up & Dance
4,5,2004-01-01,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002-01-01,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004-01-01,Fidel Castro: American Experience
17767,17768,2000-01-01,Epoch
17768,17769,2003-01-01,The Company


# training_set

In [8]:
# procedural process (2552.06 s)
tsFiles = glob.glob(ts+'/*')
tsFiles.sort()

dfts_ = []
movieId = []
for _ in tsFiles:
    dftsi = pd.read_csv(_, header = None, names = ['CustomerID', 'Rating', 'Date'], usecols = [0, 1, 2])
    dftsi_, movieIdi = addMovieIDColumn(dftsi)
    dfts_.append(dftsi_)
    movieId.append(movieIdi)
else:
    dfts_ = pd.concat(dfts_).reset_index(drop=True)
    movieId = pd.concat(movieId).reset_index(drop=True)
    dfts_.CustomerID = dfts_.CustomerID.astype('int32')
    dfts_.MovieID = dfts_.MovieID.astype('int32')
    dfts_.Rating = dfts_.Rating.astype('int32')
    
    pkl.dump(dfts_, open(f'./pkl/trainingSet_.pkl', 'wb')) # for prediction
    pkl.dump(movieId, open(f'./pkl/trainingSetMovieId.pkl', 'wb')) # cache movieId info
dfts_ 

Unnamed: 0,CustomerID,Rating,Date,MovieID
0,1488844,3,2005-09-06,1
1,822109,5,2005-05-13,1
2,885013,4,2005-10-19,1
3,30878,4,2005-12-26,1
4,823519,3,2004-05-03,1
...,...,...,...,...
100480502,1790158,4,2005-11-01,17770
100480503,1608708,3,2005-07-19,17770
100480504,234275,1,2004-08-07,17770
100480505,255278,4,2004-05-28,17770


In [20]:
# multiprocess process (3586.15 s)(not improved)
from multiprocessing import Process, Queue
import os
tsFiles = glob.glob(ts+'/*')
tsFiles.sort()

cpus = os.cpu_count()

def part(f, Qdft_, QmovieId):
    dftsi = pd.read_csv(_, header = None, names = ['CustomerID', 'Rating', 'Date'], usecols = [0, 1, 2])
    dftsi_, movieIdi = addMovieIDColumn(dftsi)
    Qdft_.put(dftsi_)
    QmovieId.put(movieIdi)


Qdft_ = Queue()
QmovieId = Queue()
jobs = []

dfts_ = []
movieId = []
for _ in tsFiles:
    p = Process(target=part, args=(_, Qdft_, QmovieId))
    jobs.append(p)
    p.start()
    dfts_.append(Qdft_.get())
    movieId.append(QmovieId.get())
    
    if len(jobs) >= cpus:
        [job.join() for job in jobs] # block until every job finish
        jobs = []
else:
    [job.join() for job in jobs] # block until every job finish
    dfts_ = pd.concat(dfts_).reset_index(drop=True)
    movieId = pd.concat(movieId).reset_index(drop=True)
    pkl.dump(dfts_, open(f'./pkl/trainingSet_.pkl', 'wb')) # for prediction
    pkl.dump(movieId, open(f'./pkl/trainingSetMovieId.pkl', 'wb')) # cache movieId info
dfts_

Unnamed: 0,CustomerID,Rating,Date,MovieID
0,1488844,3.0,2005-09-06,1
1,822109,5.0,2005-05-13,1
2,885013,4.0,2005-10-19,1
3,30878,4.0,2005-12-26,1
4,823519,3.0,2004-05-03,1
...,...,...,...,...
100480502,1790158,4.0,2005-11-01,17770
100480503,1608708,3.0,2005-07-19,17770
100480504,234275,1.0,2004-08-07,17770
100480505,255278,4.0,2004-05-28,17770
