# Converting Netflix Prize user data to standard CSV format

# **THIS NOTEBOOK NEEDS AT LEAST 16GB OR RAM**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [3]:
ratings = pd.read_csv('data/combined_data_all.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])

In [4]:
ratings

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
100498272,1790158,4.0
100498273,1608708,3.0
100498274,234275,1.0
100498275,255278,4.0


In [4]:
ratings.dtypes

Cust_Id     object
Rating     float64
dtype: object

In [5]:
%%time
#get indices of Movie ID rows
movies = pd.DataFrame(pd.isnull(ratings.Rating))
movies = movies[movies['Rating'] == True]
movies = movies.reset_index()
movie_rows = np.array(movies['index'], dtype = np.uint32)

#fast way to generate movie id column
new_rows = np.empty(len(ratings), dtype='uint16')
temp = 0
for i, j in enumerate(movie_rows):
    if (i+1) == len(movie_rows):
        new_rows[j:len(ratings)] = (i+1)
    new_rows[temp:j] = i
    temp = j

#merge Movie ID columns with original DF
#full_data = ratings
ratings['Movie_Id'] = new_rows
ratings.drop(index=movie_rows, inplace = True)

#convert data to unsigned int32 to save memory.
ratings = ratings.astype('uint16')

del new_rows 
del movies
del movie_rows

gc.collect()

ratings.info(verbose=False, memory_usage="deep")



<class 'pandas.core.frame.DataFrame'>
Int64Index: 100480507 entries, 1 to 100498276
Columns: 3 entries, Cust_Id to Movie_Id
dtypes: uint16(3)
memory usage: 1.3 GB
CPU times: user 15.9 s, sys: 4.66 s, total: 20.6 s
Wall time: 23.9 s


In [6]:
ratings

Unnamed: 0,Cust_Id,Rating,Movie_Id
1,47052,3,1
2,35677,5,1
3,33045,4,1
4,30878,4,1
5,37087,3,1
...,...,...,...
100498272,20686,4,17770
100498273,35844,3,17770
100498274,37667,1,17770
100498275,58670,4,17770


In [7]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))


                           ___:  7.6 GiB
                            _3:  7.6 GiB
                             _:  1.3 GiB
                       ratings:  1.3 GiB
                            _6:  1.3 GiB
                           _ii:  839.0 B
                           _i5:  839.0 B
                           _i7:  587.0 B
                            __:  459.0 B
                            _4:  459.0 B


garbage causing kernel crashes

In [8]:
ratings.to_csv('data/ratings.csv', index = False)