In [11]:
import torch
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict

In [19]:
# Genre labels, in the column order used for the genre one-hot vector.
# This list uses the "Children's" spelling and has no "IMAX" entry (an earlier
# variant of the list, since retired, used "Children" and included "IMAX").
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
NUM_GENRES = len(genres)

# Number of tags embedded per item, and the size of each tag embedding.
NUM_TAGS = 6
EMBEDDING_DIM = 50

# Item feature layout: genre one-hot followed by NUM_TAGS concatenated embeddings.
INPUT_DIM = NUM_GENRES + (NUM_TAGS * EMBEDDING_DIM)

In [13]:
# Load the GloVe vectors into a word -> vector lookup.
# Each line of the file is "<word> <v1> <v2> ... <vN>" separated by single spaces.
print("Loading GloVe embeddings...")
# The encoding is given explicitly so parsing does not depend on the platform's
# default text encoding.
with open("../data/glove.twitter.27B/glove.twitter.27B.{}d.txt".format(EMBEDDING_DIM), encoding="utf-8") as glove_file:
    # Words that are not in the file get a random vector the first time they are
    # looked up; defaultdict stores it, so later lookups of that word agree.
    # NOTE(review): np.random is not seeded here, so those vectors differ between runs.
    glove_embeddings = defaultdict(lambda: np.random.rand(EMBEDDING_DIM))
    # Iterate over the file object directly so only one line is held in memory
    # at a time (readlines() would materialise the whole file first).
    for i, line in enumerate(glove_file):
        # Only the line terminator is stripped; splitting stays on single spaces
        # so the word token itself is never altered.
        values = line.rstrip("\n").split(" ")
        word = values[0]
        embedding = np.asarray(values[1:], dtype="float32")
        assert len(embedding) == EMBEDDING_DIM
        glove_embeddings[word] = embedding

        # Progress marker every 100k lines.
        if i % 100000 == 0:
            print(i)

print("Done")

Loading GloVe embeddings...
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
Done


In [20]:
# Ratings are read from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
ratings = pd.read_pickle("../data/ml-1m-split/full.pkl")

# movies.dat has no header row and uses the multi-character separator "::",
# which is why the python parser engine is requested.
movies = pd.read_csv(
    "../data/ml-1m/movies.dat",
    sep="::",
    engine="python",
    header=None,
).rename(columns={0: "movieId", 1: "title", 2: "genres"})

# Keep only (movieId, tag) pairs and lower-case the tag text.
tags = pd.read_csv("../data/ml-20m/tags.csv").drop(columns=["userId", "timestamp"])
tags["tag"] = tags["tag"].str.lower()
tags

Unnamed: 0,movieId,tag
0,4141,mark waters
1,208,dark hero
2,353,dark hero
3,521,noir thriller
4,592,dark hero
...,...,...
465559,55999,dragged
465560,55999,jason bateman
465561,55999,quirky
465562,55999,sad


In [21]:
# One row per distinct item, ordered by "item" and re-indexed from 0, keeping
# only the column(s) of `ratings` other than user/rating/timestamp/item.
# NOTE(review): presumably that leaves a single original-movie-id column, so a
# row position in item_map corresponds to a contiguous item index — confirm.
item_map = (
    ratings
    .drop(columns=["user", "rating", "timestamp"])
    .drop_duplicates()
    .sort_values(by="item")
    .reset_index()
    .drop(columns=["index", "item"])
)

# How many times each tag was applied across all movies.
app_counts = tags["tag"].value_counts()

# How many times each tag was applied to each movie; the counts live in
# column 0 of a frame indexed by (movieId, tag).
grouped = tags.groupby(["movieId", "tag"]).size().to_frame()

In [22]:
# Preview: per-tag application counts for movieId 1, as a (tag, apps) table.
(
    grouped[0][1]
    .reset_index()
    .rename(columns={0: "apps"})
)

Unnamed: 0,tag,apps
0,2009 reissue in stereoscopic 3-d,1
1,3d,3
2,55 movies every kid should see--entertainment ...,1
3,action figure,1
4,action figures,1
...,...,...
75,watched,1
76,witty,14
77,woody,1
78,ya boy,1


In [23]:
def get_item_input(item):
  """Build the feature vector for one item.

  The vector is the genre one-hot from get_genre_one_hot followed by the
  concatenated get_embedding vectors of exactly NUM_TAGS tags.

  Tag selection: the item's own tags, ordered by how often each was applied to
  this item and then by how often it was applied overall (both descending),
  truncated to NUM_TAGS. If fewer than NUM_TAGS tags are available, the item's
  lower-cased genres are appended, repeating cyclically as needed; an item
  with no genres is padded with the literal tag "movie" instead.

  Args:
    item: positional row index into item_map.

  Returns:
    1-D numpy array of length INPUT_DIM.

  Raises:
    AssertionError: if the tag list or the final vector has an unexpected length.
  """
  # item_map is read as a single-column frame here.
  # NOTE(review): presumably that column holds the movieId used by `tags` and `movies` — confirm.
  item_id = int(item_map.iloc[item, 0])

  tag_counts = grouped[0]
  try:
    # Label lookup on the first index level; a movie without any tags raises
    # KeyError. (Plain tag_counts[item_id] falls back to positional access for
    # missing integer keys on older pandas, which is deprecated behaviour.)
    per_tag = tag_counts.loc[item_id]
  except KeyError:
    item_top_tags = []
  else:
    item_tags = per_tag.reset_index().rename(columns={0: "apps"})
    item_tags["total_apps"] = item_tags["tag"].map(app_counts)
    item_tags = item_tags.sort_values(by=["apps", "total_apps"], ascending=False)
    item_top_tags = item_tags["tag"].head(NUM_TAGS).tolist()

  if len(item_top_tags) < NUM_TAGS:
    # Pad with the item's genres, or with "movie" when it has none. The filler
    # has at least one entry, so NUM_TAGS repetitions are always enough.
    # Plain lists are used so no numpy dtype promotion between empty numeric
    # arrays and strings is involved.
    filler = [genre.lower() for genre in get_genres(item_id)] or ["movie"]
    item_top_tags = (item_top_tags + filler * NUM_TAGS)[:NUM_TAGS]

  assert len(item_top_tags) == NUM_TAGS

  item_tag_input = np.concatenate([get_embedding(tag) for tag in item_top_tags])
  item_genre_input = get_genre_one_hot(item_id)

  item_input = np.concatenate([item_genre_input, item_tag_input])
  assert len(item_input) == INPUT_DIM
  return item_input
  
  
def get_embedding(tag):
  """Return the mean vector of the whitespace-separated tokens in `tag`.

  Args:
    tag: tag text to embed.

  Returns:
    numpy array holding the element-wise mean of the glove_embeddings vectors
    of the tag's tokens.
  """
  # split() with no argument drops the empty tokens that split(" ") produces
  # for leading, trailing or repeated spaces; those would otherwise be looked
  # up as the word "" and averaged into the result.
  # A tag with no tokens at all is looked up as-is so the mean is never taken
  # over an empty list.
  tokens = tag.split() or [tag]
  return np.mean(np.asarray([glove_embeddings[token] for token in tokens]), axis=0)


def get_genres(item_id):
  """Return the list of genre labels for the movie with the given movieId.

  The "genres" field of the first matching row in `movies` is split on "|".
  An empty list is returned when the first label is "(no genres listed)".
  Raises IndexError when no row matches item_id.
  """
  is_requested_movie = movies["movieId"] == item_id
  genre_field = movies.loc[is_requested_movie, "genres"].values[0]
  labels = genre_field.split("|")
  return [] if labels[0] == "(no genres listed)" else labels


def get_genre_one_hot(item_id):
  """Return a length-NUM_GENRES vector with 1 at each of the movie's genres.

  Positions follow the order of the module-level `genres` list. A genre label
  that is not in that list raises ValueError (from list.index).
  """
  hot_positions = np.array(
      [genres.index(label) for label in get_genres(item_id)], dtype=int
  )
  encoding = np.zeros(NUM_GENRES)
  encoding[hot_positions] = 1
  return encoding

In [24]:
# Smoke check: display the feature vector of the first row of item_map.
get_item_input(item=0)

array([ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -5.05900025e-01,  4.77079988e-01,
       -1.98109999e-01,  1.09770000e+00, -1.47750005e-01, -2.62030005e-01,
        4.16810006e-01, -2.68099993e-01,  3.44519988e-02, -1.13680005e+00,
        6.66419983e-01,  4.97570008e-01, -1.82490003e+00,  1.51359999e+00,
        2.97789991e-01, -4.42880005e-01, -7.23899975e-02,  1.02699995e+00,
        5.46100020e-01,  7.25040019e-01,  3.26460004e-01,  7.31450021e-02,
        1.60990000e-01, -6.33880019e-01, -2.16670007e-01,  4.13520008e-01,
       -1.12720001e+00,  2.52779990e-01, -4.64439988e-02, -2.95249999e-01,
        9.49460030e-01,  3.33840013e-01, -4.53179985e-01, -5.86880028e-01,
       -8.30190003e-01,  

In [25]:
# Feature vectors for every row of item_map, stacked in row order.
item_inputs = np.asarray(list(map(get_item_input, range(len(item_map)))))

In [26]:
# Write the stacked item feature vectors to disk in .npy format.
np.save(file="../data/item_inputs_1m.npy", arr=item_inputs)