In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from importlib import import_module
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Load Data

In [2]:
data_path = '/opt/ml/movie-recommendation/data/train/'

data = pd.read_csv(os.path.join(data_path, 'train_ratings.csv'), sep=',')
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

user_ids = data.user.unique()
item_ids = data.item.unique()

### Extract Year Data from Title Data

In [3]:
print('# of items Before preprocessing:', year_data['item'].nunique())

no_year_item_ids = set(item_ids) - set(year_data.loc[:,'item'])

cond = title_data['item'].isin(no_year_item_ids)
no_year_items = title_data[cond].copy()
no_year_items['year'] = no_year_items['title'].apply(lambda x:int(x[-5:-1])).values
year_data = pd.concat([year_data, no_year_items[['item', 'year']]], axis=0)

print('# of items After preprocessing:', year_data['item'].nunique())

# of items Before preprocessing: 6799
# of items After preprocessing: 6807


### Group Year Data

In [4]:
year_data['year'] = pd.cut(year_data['year'], list(range(1900, 2021, 10)))
year_dict = {year:i for i, year in enumerate(set(year_data['year']))}
year_data['year']  = year_data['year'].map(lambda x : 'y'+str(year_dict[x]))

### Remove Year Data from Title Data

In [5]:
title_data['title'] = title_data['title'].apply(lambda x:x[:-6])
title_data['title'] = title_data['title'].str.replace(pat=r'[^\w]',repl=r' ',regex=True)

### Group Genre Data by Item

In [6]:
item_genre_df = list(genre_data.groupby('item')['genre'])

items = list()
genres = list()

for item, genre_df in tqdm(item_genre_df):
    items.append(item)
    genres.append(' '.join(list(genre_df)))

genre_data = pd.DataFrame({'item': items, 'genre': genres})

100%|██████████| 6807/6807 [00:00<00:00, 132838.10it/s]


### Merge Year/Title/Genre Data

In [7]:
joined_data = pd.merge(year_data, title_data, left_on='item', right_on='item', how='inner')
joined_data = pd.merge(joined_data, genre_data, left_on='item', right_on='item', how='inner')
joined_data

Unnamed: 0,item,year,title,genre
0,1348,y5,Nosferatu Nosferatu eine Symphonie des Graue...,Horror
1,44587,y5,Nanook of the North,Documentary Drama
2,4768,y5,Dr Mabuse The Gambler Dr Mabuse der Spiel...,Crime Mystery Thriller
3,8235,y5,Safety Last,Action Comedy Romance
4,8609,y5,Our Hospitality,Comedy
...,...,...,...,...
6802,7243,y10,Intolerance Love s Struggle Throughout the Ages,Drama
6803,8511,y10,Immigrant The,Comedy
6804,6988,y10,Broken Blossoms or The Yellow Man and the Girl,Drama Romance
6805,32898,y9,Trip to the Moon A Voyage dans la lune Le,Action Adventure Fantasy Sci-Fi


In [8]:
def combined_features(row):
    return row['year']+' '+row['title']+' '+row['genre']

In [9]:
joined_data['combined'] = joined_data.apply(combined_features, axis =1)
joined_data.sort_values('item', inplace=True)

In [10]:
joined_data.to_csv(os.path.join(data_path, 'joined_data.csv'), index=False)

### Group Director Data by Item

In [11]:
item_director_df = list(director_data.groupby('item')['director'])

items = list()
directors = list()

for item, director_df in tqdm(item_director_df):
    items.append(item)
    directors.append(' '.join(list(director_df)))

director_data = pd.DataFrame({'item': items, 'director': directors})
director_data

100%|██████████| 5503/5503 [00:00<00:00, 149219.39it/s]


Unnamed: 0,item,director
0,1,nm0005124
1,2,nm0002653
2,3,nm0222043
3,4,nm0001845
4,5,nm0796124
...,...,...
5498,118696,nm0001392
5499,118900,nm0885249
5500,118997,nm0551128
5501,119141,nm0736622 nm1698571


### Group Writer Data by Item

In [12]:
item_writer_df = list(writer_data.groupby('item')['writer'])

items = list()
writers = list()

for item, writer_df in tqdm(item_writer_df):
    items.append(item)
    writers.append(' '.join(list(writer_df)))

writer_data = pd.DataFrame({'item': items, 'writer': writers})
writer_data

100%|██████████| 5648/5648 [00:00<00:00, 155558.81it/s]


Unnamed: 0,item,writer
0,1,nm0004056 nm0005124 nm0169505 nm0230032 nm0710...
1,2,nm0378144 nm0852430 nm0885575
2,3,nm0425756
3,4,nm0060103
4,5,nm0329304 nm0352443 nm0583600 nm0796124
...,...,...
5643,118696,nm0001392 nm0101991 nm0866058 nm0868219 nm0909638
5644,118900,nm0394984
5645,118997,nm0487567
5646,119141,nm0736622 nm1698571


### Merge Year/Title/Genre/Director/Writer Data

In [29]:
joined_data = pd.merge(year_data, title_data, left_on='item', right_on='item', how='inner')
joined_data = pd.merge(joined_data, genre_data, left_on='item', right_on='item', how='inner')
joined_data = pd.merge(joined_data, director_data, left_on='item', right_on='item', how='left')
joined_data = pd.merge(joined_data, writer_data, left_on='item', right_on='item', how='left')
for feature in ['director','writer']:
    joined_data[feature] = joined_data[feature].fillna('')
joined_data

Unnamed: 0,item,year,title,genre,director,writer
0,1348,y5,Nosferatu Nosferatu eine Symphonie des Graue...,Horror,nm0003638,nm0831290
1,44587,y5,Nanook of the North,Documentary Drama,,
2,4768,y5,Dr Mabuse The Gambler Dr Mabuse der Spiel...,Crime Mystery Thriller,nm0000485,nm0000485 nm0415167 nm0902376
3,8235,y5,Safety Last,Action Comedy Romance,nm0628345 nm0853130,nm0369841 nm0516001 nm0853130 nm0924065
4,8609,y5,Our Hospitality,Comedy,nm0000036,nm0115669 nm0369841 nm0593477
...,...,...,...,...,...,...
6802,7243,y10,Intolerance Love s Struggle Throughout the Ages,Drama,nm0000428,nm0000428 nm0002616 nm0115218 nm0940488
6803,8511,y10,Immigrant The,Comedy,nm0000122,nm0000122
6804,6988,y10,Broken Blossoms or The Yellow Man and the Girl,Drama Romance,nm0000428,nm0000428
6805,32898,y9,Trip to the Moon A Voyage dans la lune Le,Action Adventure Fantasy Sci-Fi,,nm0894523 nm0920229


In [30]:
def combined_features(row):
    return row['year']+' '+row['title']+' '+row['genre']+' '+row['director']+' '+row['writer']

In [31]:
joined_data['combined'] = joined_data.apply(combined_features, axis =1)
joined_data.sort_values('item', inplace=True)

In [32]:
joined_data.to_csv(os.path.join(data_path, 'joined_data-2.csv'), index=False)

### Inference

In [33]:
def preprocess(df) :
    print("preprocessing..")
    df = df.sort_values(['user', 'time'], ascending = [True, True])

    users = df['user'].unique()
    user_to_id = dict(zip(users, range(len(users))))
    id_to_user = {v: k for k, v in user_to_id.items()}
    
    movies = df['item'].unique()
    movie_to_id = dict(zip(movies, range(len(movies))))
    id_to_movie = {v: k for k, v in movie_to_id.items()}
    
    df['user'] = df['user'].apply(lambda x : user_to_id[x])
    df['item'] = df['item'].apply(lambda x : movie_to_id[x])

    return df, user_to_id, id_to_user, movie_to_id, id_to_movie

In [24]:
df = pd.read_csv(os.path.join(data_path, 'train_ratings.csv'), sep=',')
df, user_to_id, id_to_user, movie_to_id, id_to_movie = preprocess(df)
joined_data = pd.read_csv(os.path.join(data_path, 'joined_data-2.csv'))
joined_data['item'] = joined_data['item'].apply(lambda x : movie_to_id[x])
joined_data.sort_values('item', inplace=True)

preprocessing..


In [34]:
vectorizer = CountVectorizer(stop_words=["and", "is", "the", "of", "for", "to", "in", "on", "with", "from", "at"])
count_matrix = vectorizer.fit_transform(joined_data['combined'].values)
cosine_sim = cosine_similarity(count_matrix)
cosine_sim.shape

(6807, 6807)

In [37]:
word = vectorizer.get_feature_names_out()
freq = list(count_matrix.toarray().sum(axis=0))
word_df = pd.DataFrame({'word':word, 'freq':freq})
word_df.sort_values('freq', ascending=False)[:10]

Unnamed: 0,word,freq
1921,drama,3491
10885,y3,2493
1369,comedy,2413
10881,y1,1532
10169,thriller,1466
9126,romance,1255
134,action,1249
1527,crime,930
152,adventure,899
10884,y2,830


In [38]:
user_item_dfs = list(df.groupby('user')['item'])

In [39]:
sub_u = []
sub_i = []

for user_id, item_df in tqdm(user_item_dfs):
    result = cosine_sim[item_df].sum(axis=0)
    ranking = np.argsort(result)[::-1]

    pred = []
    for item_id in ranking :
        if item_id in item_df.values :
            continue
        u = id_to_user[int(user_id)]
        i = id_to_movie[int(item_id)]
        sub_u.append(u)
        sub_i.append(i)
        pred.append(i)
        if len(pred) == 10 :
            break

100%|██████████| 31360/31360 [02:01<00:00, 259.07it/s]


In [40]:
submission = {"user" : sub_u, "item" : sub_i}
submission_df = pd.DataFrame(submission)
submission_df.to_csv(f'/opt/ml/movie-recommendation/BPR/output/item-based-cossim-2.csv', index=False)