# Scratchpad for recommenders

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [20]:
df = pd.read_csv("data/rating.csv")
df.shape

(20000263, 4)

In [5]:
df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24
20000262,138493,71619,2.5,2009-10-17 20:25:36


In [15]:
# is userId sequential? Yes
len(set(df.userId))

138493

In [14]:
# is movieId sequential? no there are holes
print(len(set(df.movieId)))

print(sorted(set(df.movieId))[12200:12220])

26744
[55757, 55765, 55768, 55782, 55805, 55814, 55820, 55826, 55830, 55844, 55851, 55854, 55856, 55872, 55875, 55888, 55895, 55901, 55908, 55926]


In [9]:
df.rating.value_counts()

4.0    5561926
3.0    4291193
5.0    2898660
3.5    2200156
4.5    1534824
2.0    1430997
2.5     883398
1.0     680732
1.5     279252
0.5     239125
Name: rating, dtype: int64

### Preprocess

In [21]:
# prepare preprocessed dataset

# drop axis
df.drop(["timestamp"], axis=1, inplace=True)

# make userId 0-indexed
df['userId'] = df['userId'] - 1

# map movieId to a new continoues movie ID
oldToNewMap = {}
oldIds = set(df.movieId.values)
newId = 0
for oldId in oldIds:
    oldToNewMap[oldId] = newId
    newId += 1
    
df["newMovieId"] = df.movieId.apply(lambda x: oldToNewMap[x])

In [22]:
df.head()

Unnamed: 0,userId,movieId,rating,newMovieId
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [35]:
df.to_csv('data/preprocessed_rating.csv', index=False)

### Make small dataset for on-premise workload

In [52]:
preprocessed = pd.read_csv("data/preprocessed_rating.csv")
preprocessed.head()

Unnamed: 0,userId,movieId,rating,newMovieId
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [53]:
userIdCount = Counter(preprocessed.userId)
movieIdCount = Counter(preprocessed.newMovieId)

# number of users and movies to keep
n = 10000
m = 2000

topUserIds = [k for k, v in userIdCount.most_common(n)]
topMovieIds = [k for k, v in movieIdCount.most_common(m)]

preprocessed_small = preprocessed[preprocessed.userId.isin(topUserIds) & preprocessed.movieId.isin(topMovieIds)].copy()

In [54]:
preprocessed_small.shape

(2206428, 4)

In [55]:
# ensure sequential
def make_sequential(df, col):
    old_to_new_map = {}
    old_ids = set(df[col])
    new_id = 0
    for old_id in old_ids:
        old_to_new_map[old_id] = new_id
        new_id += 1
    df[col] = df[col].apply(lambda x: old_to_new_map[x])
    return df

preprocessed_small = make_sequential(preprocessed_small, 'userId')
preprocessed_small = make_sequential(preprocessed_small, 'newMovieId')

In [56]:
preprocessed_small.reset_index(drop=True, inplace=True)

In [59]:
print(preprocessed_small.userId.max())
print(preprocessed_small.newMovieId.max())

9999
1705


In [60]:
preprocessed_small.to_csv('data/preprocessed_small_rating.csv', index=False)

In [1]:
# random stuff here