In [1]:
import os
import csv
import json
import pandas as pd

In [2]:
!pwd

/home/nbuser/netflix_recommendation


In [3]:
!ls data

combined_data_1.txt  movie_titles.csv	     netflix_val.csv
combined_data_2.txt  netflix-prize-data.zip  probe.txt
combined_data_3.txt  netflix_ratings.csv     qualifying.txt
combined_data_4.txt  netflix_train.csv	     README


In [4]:
#! cat data/README

In [5]:
! head -3 data/combined_data_1.txt

1:
1488844,3,2005-09-06
822109,5,2005-05-13


### Function to Process Netflix Movie Rating Data

In [17]:
def _index_for(index_by_id, raw_id, dict_writer):
    '''
    return the contiguous index of raw_id; on first sight assign the next
    free index, remember it in index_by_id and log the pair via dict_writer
    '''
    idx = index_by_id.get(raw_id)
    if idx is None:
        idx = len(index_by_id)   # indices are handed out in first-seen order: 0, 1, 2, ...
        index_by_id[raw_id] = idx
        dict_writer.writerow([raw_id, idx])
    return idx


def process_netflix(in_path, out_path, files, split_date='2005-08-01'):
    '''
    process raw movie rating files and split them into train / validation

    The raw files consist of a "<movie_id>:" line followed by
    "<user_id>,<rating>,<date>" lines that belong to that movie.

    in_path:    directory that contains the raw files
    out_path:   directory the output files are written to (created if missing)
    files:      raw file names inside in_path, processed in the given order
    split_date: 'YYYY-MM-DD' string; ratings dated strictly before it go to
                train, all others go to validation

    Files written to out_path:
      netflix_train_encoded.csv  [user_idx, movie_idx, rating, date]
      netflix_val.csv            [user_id, movie_id, rating, date]  (raw ids)
      user_dict.txt              [user_id, idx]
      movie_dict.txt             [movie_id, idx]

    Only ids that occur in the train split receive an index; validation rows
    keep the raw ids. Returns None, progress and counts are printed.
    '''
    if out_path:
        os.makedirs(out_path, exist_ok=True)   # avoid failing on a fresh checkout

    train_path = os.path.join(out_path, 'netflix_train_encoded.csv')
    val_path = os.path.join(out_path, 'netflix_val.csv')

    user_path = os.path.join(out_path, 'user_dict.txt')
    movie_path = os.path.join(out_path, 'movie_dict.txt')

    user_dict = {}    # map user_id to continuous index, (k,v) -> (user_id, user_index)
    movie_dict = {}   # map movie_id to continuous index, (k,v) -> (movie_id, movie_index)

    train_cnt = 0
    val_cnt = 0

    # newline='' lets the csv writers control the line terminator themselves
    with open(train_path, "w", newline='') as train, \
         open(val_path, "w", newline='') as val, \
         open(user_path, "w", newline='') as user, \
         open(movie_path, "w", newline='') as movie:

        # header order must match the order of the rows written below
        train_writer = csv.writer(train, lineterminator='\n')
        train_writer.writerow(['user_id', 'movie_id', 'rating', 'date'])

        val_writer = csv.writer(val, lineterminator='\n')
        val_writer.writerow(['user_id', 'movie_id', 'rating', 'date'])

        user_writer = csv.writer(user, lineterminator='\n')
        user_writer.writerow(['user_id', 'idx'])

        movie_writer = csv.writer(movie, lineterminator='\n')
        movie_writer.writerow(['movie_id', 'idx'])

        for file in files:  # iterate through the raw files

            print("processing file {}...".format(file))
            file_path = os.path.join(in_path, file)
            movie_id = 0   # initialize movie_id, replaced by the first "<movie_id>:" line

            with open(file_path) as f:
                for line in f:
                    record = line.strip()
                    if not record:
                        # a blank line must not overwrite the current movie_id
                        continue

                    fields = record.split(',')
                    if len(fields) == 1:    # identify movie_id line
                        movie_id = record.rstrip(':')
                    elif len(fields) == 3:  # identify rating line
                        user_id, rating, date = fields

                        # ISO dates compare correctly as plain strings:
                        # before split_date -> train, otherwise -> validation
                        if date < split_date:
                            user_idx = _index_for(user_dict, user_id, user_writer)
                            movie_idx = _index_for(movie_dict, movie_id, movie_writer)
                            train_writer.writerow([user_idx, movie_idx, rating, date])  # write to train
                            train_cnt += 1
                        else:
                            val_writer.writerow([user_id, movie_id, rating, date])  # write to validation
                            val_cnt += 1
                    else:
                        print("failed to parse line: {}".format(record))  # print illegal line

    print("finished, file output to {} and {}".format(train_path, val_path))
    print("{} in train, {} in validation".format(train_cnt, val_cnt))
    print("{} unique user, {} unique movie".format(len(user_dict), len(movie_dict)))
        


### Test on Small Dataset

In [41]:
# Build a small fixture from the first 10 lines (head's default) of the first raw file.
# NOTE: the ./test directory must already exist, otherwise the shell redirect fails.
! head data/combined_data_1.txt > ./test/test.txt

In [42]:
! cat ./test/test.txt

1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26
823519,3,2004-05-03
893988,3,2005-11-17
124105,4,2004-08-05
1248029,3,2004-04-22
1842128,4,2004-05-09


In [43]:
# Smoke test: run the pipeline on the 10-line fixture, reading from and
# writing to the same ./test directory.
process_netflix(
    in_path="./test",
    out_path="./test",
    files=['test.txt'],
)

processing file test.txt...
finished, file output to ./test/netflix_train_encoded.csv and ./test/netflix_val.csv
5 in train, 4 in validation
5 unique user, 1 unique movie


### Inspect Test Output

In [44]:
! cat ./test/netflix_train_encoded.csv

user_id,movie_id,rating,date
0,0,5,2005-05-13
1,0,3,2004-05-03
2,0,4,2004-08-05
3,0,3,2004-04-22
4,0,4,2004-05-09


In [45]:
! cat ./test/user_dict.txt

user_id,idx
822109,0
823519,1
124105,2
1248029,3
1842128,4


### Process Whole Data

In [46]:
# The four raw rating files, processed in order into ./processed.
files = [
    'combined_data_1.txt',
    'combined_data_2.txt',
    'combined_data_3.txt',
    'combined_data_4.txt',
]

process_netflix(
    in_path="./data",
    out_path="./processed",
    files=files,
)

processing file combined_data_1.txt...
processing file combined_data_2.txt...
processing file combined_data_3.txt...
processing file combined_data_4.txt...
finished, file output to ./processed/netflix_train_encoded.csv and ./processed/netflix_val.csv
79137937 in train, 21342570 in validation
400267 unique user, 17375 unique movie


In [47]:
! head ./processed/netflix_train_encoded.csv

user_id,movie_id,rating,date
0,0,5,2005-05-13
1,0,3,2004-05-03
2,0,4,2004-08-05
3,0,3,2004-04-22
4,0,4,2004-05-09
5,0,3,2005-05-11
6,0,4,2005-05-19
7,0,5,2005-06-06
8,0,3,2004-08-12
