In [14]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

data_base_dir = '../../datasets/Movielens/'
data_dir2 = data_base_dir + 'Movielens Latest/ml-latest/'
data_dir = data_base_dir + 'ml-20m/'

genome_scores = data_dir + 'genome-scores.csv'
genome_tags = data_dir + 'genome-tags.csv'
movies = data_dir + 'movies.csv'
ratings = data_dir + 'ratings.csv'
tags = data_dir + 'tags.csv'


def map_stemmed_word(x, stemmed_word):
    x = x.split()

    for w in x:
        w1 = w.replace("(", "")
        w1 = w1.replace(")", "")

        if w1.startswith(stemmed_word):
            return True

    return False

def perform_stemming(stemming_phrases, genome_tags_df):
    stemming_dict = {}

    for sp in stemming_phrases:
        ending_with_s_condition = genome_tags_df.apply(lambda x: x['tag'].endswith(sp), axis=1).values

        candidate_words = genome_tags_df[ending_with_s_condition]['tag'].values

        for w in candidate_words:
            stemmed_word = w[:-len(sp)]

            if (len(stemmed_word) > 2):
                match_condition = genome_tags_df.apply(lambda x: map_stemmed_word(x['tag'], stemmed_word), axis=1)

                mapped_list = genome_tags_df[match_condition]['tag'].values.tolist()

                if len(mapped_list) > 0:
                    stemming_dict[stemmed_word] = genome_tags_df[match_condition]['tag'].values.tolist()

    return stemming_dict

def drop_singular_value_keys(stemming_dict):
    print("# of Dictionary keys before dropping singular values: %d" % len(stemming_dict))

    stemming_dict = {k: v for k, v in stemming_dict.items() if len(v) > 1}

    print("# of Dictionary keys after dropping singular values: %d" % len(stemming_dict))


def print_dict_value_count(stemming_dict):
    value_list = list(stemming_dict.values())

    count = 0

    for l in value_list:
        count += len(l)

    print(count)


def remove_redundant_keys(stem_dict):
    value_list = list(stem_dict.values())

    for l in value_list:
        for w in l:
            w = w.replace("(", "")
            w = w.replace(")", "")

            if w in stem_dict and len(stem_dict[w]) == 1:
                del stem_dict[w]



genome_tags_df = pd.read_csv(genome_tags)

# TODO think of better stemming phrases
stemming_phrases = ['ies', 's', 'ed', 'ion']

# perform stemming
stemming_dict = perform_stemming(stemming_phrases, genome_tags_df)

print("stemming dict 1, before removing redundant")
print(stemming_dict)
print_dict_value_count(stemming_dict)
remove_redundant_keys(stemming_dict)
print("stemming dict 1, after removing redundant")
print(stemming_dict)
print_dict_value_count(stemming_dict)

# TODO apply similar value mapping for remaining values
# all values from dictionary
all_dict_values = list()
for l in list(stemming_dict.values()):
    #     for v in l:
    all_dict_values.extend(l)

all_dict_values = np.array(all_dict_values)
all_tags = genome_tags_df['tag'].values

# difference
remaining_tags = np.setdiff1d(all_tags, all_dict_values)

# TODO remove or keep
# remaining_tags = all_tags

# TODO now apply similar value mapping for remaining tags
new_stemming_dict = {}

for stemmed_word in remaining_tags:
    match_condition = genome_tags_df.apply(lambda x: map_stemmed_word(x['tag'], stemmed_word), axis=1)
    mapped_list = genome_tags_df[match_condition]['tag'].values.tolist()

    if len(mapped_list) > 0:
        new_stemming_dict[stemmed_word] = mapped_list

print(new_stemming_dict)

print_dict_value_count(new_stemming_dict)
# remove_redundant_keys(new_stemming_dict)
# print("<<<............Afer removing redundant keys............>>>")
# print_dict_value_count(new_stemming_dict)
print(new_stemming_dict)

stemming dict 1, before removing redundant
{'ser': ['007 (series)', 'secret service', 'serial killer', 'series'], 'zomb': ['zombie', 'zombies'], '1920': ['1920s'], '1930': ['1930s'], '1950': ['1950s'], '1960': ['1960s'], '1970': ['1970s'], '1980': ['1980s'], 'aid': ['aids'], 'alien': ['alien', 'alien invasion', 'aliens'], 'android': ['android(s)/cyborg(s)', 'androids'], 'animal': ['animal movie', 'animals', 'talking animals'], 'assassin': ['assassin', 'assassination', 'assassins'], 'astronaut': ['astronauts'], 'beatle': ['beatles'], 'bird': ['birds'], 'blindnes': ['blindness'], 'bomb': ['bombs', 'nuclear bomb'], 'book': ['based on a book', 'based on book', 'book', 'book was better', 'books', 'comic book', 'comic book adaption'], 'brother': ['brothers', 'coen brothers', 'marx brothers'], 'busines': ['business', 'movie business', 'music business'], 'car': ['car chase', 'carrie-anne moss', 'cars', 'cartoon', 'classic car'], 'ches': ['chess'], 'christma': ['christmas'], 'circu': ['circus']

{'007': ['007', '007 (series)'], '3d': ['3d'], '70mm': ['70mm'], '80s': ['80s'], '9/11': ['9/11'], 'aardman': ['aardman', 'aardman studios'], 'absurd': ['absurd'], 'adolescence': ['adolescence'], 'adultery': ['adultery'], 'adventure': ['adventure'], 'affectionate': ['affectionate'], 'africa': ['africa', 'south africa'], 'afterlife': ['afterlife'], 'aging': ['aging'], 'airplane': ['airplane'], 'airport': ['airport'], 'alaska': ['alaska'], 'alcatraz': ['alcatraz'], 'alcoholism': ['alcoholism'], 'allegory': ['allegory'], 'almodovar': ['almodovar'], 'amnesia': ['amnesia'], 'anime': ['anime'], 'antarctica': ['antarctica'], 'anti-hero': ['anti-hero'], 'anti-semitism': ['anti-semitism'], 'anti-war': ['anti-war'], 'apocalypse': ['apocalypse'], 'archaeology': ['archaeology'], 'argentina': ['argentina'], 'arnold': ['arnold'], 'art': ['art', 'art house', 'artificial intelligence', 'artist', 'artistic', 'artsy', 'con artists', 'king arthur', 'martial arts'], 'artist': ['artist', 'artistic', 'con a

In [15]:
# 156 keys
dict1_keys = list(stemming_dict.keys())

# 133 keys
dict2_keys = list(new_stemming_dict.keys())

all_keys = np.union1d(dict1_keys, dict2_keys)
len(all_keys)

695

In [16]:
len(new_stemming_dict.keys())

539

In [17]:
# merge two dictionaries
z = {**stemming_dict, **new_stemming_dict}
z

{'ser': ['007 (series)', 'secret service', 'serial killer', 'series'],
 'zomb': ['zombie', 'zombies'],
 '1920': ['1920s'],
 '1930': ['1930s'],
 '1950': ['1950s'],
 '1960': ['1960s'],
 '1970': ['1970s'],
 '1980': ['1980s'],
 'aid': ['aids'],
 'alien': ['alien', 'alien invasion', 'aliens'],
 'android': ['android(s)/cyborg(s)', 'androids'],
 'animal': ['animal movie', 'animals', 'talking animals'],
 'assassin': ['assassin', 'assassination', 'assassins'],
 'astronaut': ['astronauts'],
 'beatle': ['beatles'],
 'bird': ['birds'],
 'blindnes': ['blindness'],
 'bomb': ['bombs', 'nuclear bomb'],
 'book': ['based on a book',
  'based on book',
  'book',
  'book was better',
  'books',
  'comic book',
  'comic book adaption'],
 'brother': ['brothers', 'coen brothers', 'marx brothers'],
 'busines': ['business', 'movie business', 'music business'],
 'car': ['car chase', 'carrie-anne moss', 'cars', 'cartoon', 'classic car'],
 'ches': ['chess'],
 'christma': ['christmas'],
 'circu': ['circus'],
 'clo

In [18]:
len(z.keys())

695

In [34]:
print_dict_value_count(z)
final_stemming_dict = z

1132


In [23]:
#preparing tag_genomes mapping
genome_tags_df = pd.read_csv(genome_tags)
genome_tags_df.set_index(genome_tags_df['tagId'].values, drop=True, inplace=True)
genome_tags_df.drop(labels='tagId', axis=1, inplace=True)
genome_tags_df


genome_scores_df = pd.read_csv(genome_scores)
genome_scores_df = genome_scores_df.pivot(index='movieId', columns='tagId', values='relevance')
genome_scores_df

tagId,1,2,3,4,5,6,7,8,9,10,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.02500,0.02500,0.05775,0.09675,0.14675,0.21700,0.06700,0.26275,0.26200,0.03200,...,0.03950,0.01800,0.04575,0.03275,0.12500,0.04150,0.01925,0.03625,0.07775,0.02300
2,0.03975,0.04375,0.03775,0.04800,0.11025,0.07250,0.04775,0.10975,0.09925,0.02050,...,0.04175,0.01925,0.01725,0.02425,0.12550,0.02250,0.01550,0.01475,0.09025,0.01875
3,0.04350,0.05475,0.02800,0.07700,0.05400,0.06850,0.05600,0.18500,0.04925,0.02675,...,0.04150,0.02675,0.02775,0.03425,0.15550,0.03675,0.01700,0.01950,0.09700,0.01850
4,0.03725,0.03950,0.03675,0.03100,0.06825,0.04050,0.02325,0.08700,0.05125,0.03025,...,0.05750,0.03375,0.02275,0.03975,0.18525,0.05925,0.01500,0.01525,0.06450,0.01300
5,0.04200,0.05275,0.05925,0.03675,0.07525,0.12525,0.02850,0.08500,0.02950,0.02875,...,0.04250,0.02825,0.02150,0.02600,0.14275,0.02075,0.01650,0.01675,0.10750,0.01825
6,0.02825,0.02550,0.01850,0.04550,0.09575,0.05500,0.04400,0.24200,0.12850,0.02550,...,0.04900,0.01825,0.02075,0.06000,0.29975,0.15525,0.03525,0.01950,0.06650,0.01900
7,0.04575,0.05275,0.16675,0.08275,0.11450,0.15625,0.05025,0.11175,0.03950,0.08000,...,0.03750,0.02825,0.01200,0.03575,0.13000,0.04875,0.01975,0.01050,0.10925,0.01850
8,0.03075,0.03550,0.04675,0.02175,0.05600,0.03650,0.01675,0.07325,0.02950,0.04100,...,0.03700,0.01925,0.01625,0.02325,0.20975,0.02825,0.01675,0.01125,0.07000,0.01500
9,0.03500,0.04050,0.01825,0.01800,0.03650,0.01750,0.01300,0.04225,0.01675,0.01525,...,0.02225,0.01075,0.01175,0.01525,0.14100,0.02225,0.01100,0.00700,0.07275,0.01550
10,0.99975,0.99975,0.01950,0.03675,0.06675,0.05450,0.04550,0.12950,0.08550,0.01925,...,0.46750,0.02325,0.02150,0.03125,0.18400,0.03750,0.01775,0.01775,0.07300,0.01825


In [97]:
# slower approach
from time import time
import swifter

new_keys = list(final_stemming_dict.keys())

stemmed_tag_relevance_df = pd.DataFrame(columns=sorted(new_keys))

start_time = time()

# TODO remove limit on # of movies
for movie in genome_scores_df.index[:5]:
    series = pd.Series()
    
    for key in new_keys:
        # select tag ID's for values from the list
        mapped_values = np.array(final_stemming_dict[key])
        mapped_tag_ids = genome_tags_df[genome_tags_df.isin(mapped_values)].dropna().index.values
        
        # calculate the target sum for underlying tags
        target_sum = genome_scores_df.loc[movie, mapped_tag_ids].sum()
        series[key] = target_sum
    
    stemmed_tag_relevance_df = stemmed_tag_relevance_df.append(series, ignore_index=True)
    
finish = time() - start_time

print("Total time taken %f" % finish + " seconds")
    
stemmed_tag_relevance_df

Total time taken 11.261237 seconds


Unnamed: 0,007,1920,1930,1950,1960,1970,1980,3d,70mm,80s,...,witty,wizard,women,workplace,writer,writing,wuxia,wwii,zomb,zombie
0,0.05,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.577,0.11625,0.188,...,0.77675,0.145,0.171,0.1025,0.15775,0.46825,0.01925,0.03625,0.10075,0.10075
1,0.0835,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.06775,0.089,0.22575,...,0.18,0.18725,0.198,0.057,0.14975,0.0705,0.0155,0.01475,0.109,0.109
2,0.09825,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02225,0.071,0.0905,...,0.1995,0.02825,0.5015,0.07325,0.18975,0.15175,0.017,0.0195,0.1155,0.1155
3,0.07675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,0.02125,0.03325,0.1175,...,0.10725,0.02825,1.0605,0.28825,0.225,0.13625,0.015,0.01525,0.0775,0.0775
4,0.09475,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,0.03125,0.0615,0.07175,...,0.12,0.02925,0.56125,0.0745,0.16875,0.05925,0.0165,0.01675,0.12575,0.12575


In [103]:
# faster approach

from time import time
import swifter

new_keys = list(final_stemming_dict.keys())

stemmed_tag_relevance_df2 = pd.DataFrame(index=genome_scores_df.index, columns=sorted(new_keys))

start_time = time()

def process(movie_ids, key):
    # select tag ID's for values from the list
    mapped_values = np.array(final_stemming_dict[key])
    mapped_tag_ids = genome_tags_df[genome_tags_df.isin(mapped_values)].dropna().index.values

    # calculate the target sum for underlying tags
    return genome_scores_df.loc[movie_ids, mapped_tag_ids].sum(axis=1)

stemmed_tag_relevance_df2 = stemmed_tag_relevance_df2.apply(lambda x: process(x.index, x.name))

# alternative for huge data, using swifter, to utilize multi-cores
# stemmed_tag_relevance_df2 = stemmed_tag_relevance_df2.swifter.apply(lambda x: process(x.index, x.name))

finish = time() - start_time

print("Total time taken %f" % finish + " seconds")

stemmed_tag_relevance_df2

Total time taken 2.083637 seconds


Unnamed: 0_level_0,007,1920,1930,1950,1960,1970,1980,3d,70mm,80s,...,witty,wizard,women,workplace,writer,writing,wuxia,wwii,zomb,zombie
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.05000,0.09675,0.14675,0.21700,0.06700,0.26275,0.26200,0.57700,0.11625,0.18800,...,0.77675,0.14500,0.17100,0.10250,0.15775,0.46825,0.01925,0.03625,0.10075,0.10075
2,0.08350,0.04800,0.11025,0.07250,0.04775,0.10975,0.09925,0.06775,0.08900,0.22575,...,0.18000,0.18725,0.19800,0.05700,0.14975,0.07050,0.01550,0.01475,0.10900,0.10900
3,0.09825,0.07700,0.05400,0.06850,0.05600,0.18500,0.04925,0.02225,0.07100,0.09050,...,0.19950,0.02825,0.50150,0.07325,0.18975,0.15175,0.01700,0.01950,0.11550,0.11550
4,0.07675,0.03100,0.06825,0.04050,0.02325,0.08700,0.05125,0.02125,0.03325,0.11750,...,0.10725,0.02825,1.06050,0.28825,0.22500,0.13625,0.01500,0.01525,0.07750,0.07750
5,0.09475,0.03675,0.07525,0.12525,0.02850,0.08500,0.02950,0.03125,0.06150,0.07175,...,0.12000,0.02925,0.56125,0.07450,0.16875,0.05925,0.01650,0.01675,0.12575,0.12575
6,0.05375,0.04550,0.09575,0.05500,0.04400,0.24200,0.12850,0.01550,0.04400,0.08000,...,0.40750,0.03375,0.18300,0.18800,0.35975,0.57725,0.03525,0.01950,0.08550,0.08550
7,0.09850,0.08275,0.11450,0.15625,0.05025,0.11175,0.03950,0.02250,0.08800,0.05725,...,0.28225,0.02975,0.52525,0.06950,0.16575,0.10400,0.01975,0.01050,0.12775,0.12775
8,0.06625,0.02175,0.05600,0.03650,0.01675,0.07325,0.02950,0.03025,0.02125,0.08000,...,0.09550,0.02825,0.43550,0.03875,0.23300,0.08375,0.01675,0.01125,0.08500,0.08500
9,0.07550,0.01800,0.03650,0.01750,0.01300,0.04225,0.01675,0.02225,0.02875,0.09950,...,0.07625,0.02275,0.20100,0.04950,0.15625,0.06100,0.01100,0.00700,0.08825,0.08825
10,1.99950,0.03675,0.06675,0.05450,0.04550,0.12950,0.08550,0.02150,0.07225,0.23225,...,0.26050,0.04150,0.24900,0.07225,0.21525,0.11575,0.01775,0.01775,0.09125,0.09125


In [104]:
# save output to csv file
stemmed_tag_relevance_df2.to_csv(data_dir + '/movies_stemmed_tag_relevance.csv')