In [2]:
import pandas as pd
import numpy as np
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.word2vec import LineSentence
from nltk.corpus import stopwords
import os



#### build dataset

In [59]:
df = pd.read_csv('data//user_reviews_by_movies.csv',delimiter = '|',engine = 'python')
df.shape


(172187, 7)

#### Extract all the words from user_reviews summary and store as a list of words

In [3]:
def build_data(extract_from,save_to,features_to_extract = []):
    '''
    Extract the user reviews summary data from dataset and store user reviews as a list of lists.
    Args:
        dfname: name of dataset whose user reviews summary needs to be extracted.
                Valid values are ['movies','games','music','tv']
    Returns:
        A list of all the sentences/reviews found in user reviews summary.
    '''
    
    if len(features_to_extract):
        filename = os.path.join('data',extract_from)
        df = pd.read_csv(filename,delimiter = '|',engine = 'python',usecols=features_to_extract)
        df.to_csv()
        
        
    else:
        #filename = os.path.join('data','user_reviews_by_{}.csv').format(dfname)
        filename = os.path.join('data',extract_from)
        df = pd.read_csv(filename,delimiter = '|',engine = 'python')
    
    #arr = []
    #filename = os.path.join('data','summary_{}.txt').format(dfname)
    return df
    #arr = np.array(df.summary)
    #print(arr)
    #np.savetxt(filename,arr,fmt='%-5s',encoding='utf-8')
    
    print('Summary Data successfully saved in file:"{}"'.format(filename))
    
   

In [44]:
df = build_data('user_reviews_by_movies.csv','user_item_summary.csv',['user_id','token_name','summary'])
df.head(20)

Unnamed: 0,user_id,token_name,summary
0,ilmi_,citizen-kane,Yang kasih rate 0-3 berarti sengaja menjatuhka...
1,GAMERGUYCASUAL,citizen-kane,"This movie is incredibly boring, and while it ..."
2,Vladthepoker,citizen-kane,"My personal favorite movie of all time, Orson ..."
3,JoseAngel47,citizen-kane,Rosebud is the Name of his sled. There I just ...
4,Jalumbi,citizen-kane,While the third act isn't the perfectly built ...
5,MasterRiley,citizen-kane,"Citizen Kane is a remarkable work of cinema, o..."
6,1996,citizen-kane,"Wonderful, marvelous, glorious, fabulous... th..."
7,BeastJ18,citizen-kane,"Intelligent, ground-breaking, a great filmic l..."
8,GabrielHU3ttdz6,citizen-kane,"It's a really nice movie, but ""The Best Movie ..."
9,wiiy71,citizen-kane,xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...


In [47]:
print(df.shape)
print(df.isnull().sum())

(229711, 3)
user_id           0
token_name    57524
summary       57573
dtype: int64


In [97]:
test = build_data('user_reviews_by_test.csv','user_item_summary_test.csv',['user_id','token_name','summary'])
test

Unnamed: 0,user_id,token_name,summary
0,MrNobody19884,rectify/season-4,Just finished watching 1st episode of season 4...
1,moviebuffers,rectify/season-4,simply one of the greatest works of fiction ev...
2,dan547,rectify/season-4,Beautiful. The most apt word to describe this ...
3,toph123,rectify/season-4,This TV show is criminally underseen. One of t...
4,JorgeLestre,rectify/season-4,Rectify is the most humane show in the history...
5,volpino,rectify/season-4,"This entire show is so full of meaning, is so ..."
6,bertus,rectify/season-4,Rectify will forever stay in my heart as an ab...
7,TheZeroPercent,rectify/season-4,"I checked this show out on Netflix, I ended up..."
8,Bangstad,rectify/season-4,Breaking Bad is acclaimed at least in part bec...
9,s17k,rectify/season-4,The finest show I've ever watched. Deeply emot...


In [92]:
#CLEAN NULL VALUES FROM DATASET
df1 = df.dropna(axis = 0,how='any')
#Inject user and item in summary
df1.isnull().sum()

user_id       0
token_name    0
summary       0
dtype: int64

In [85]:
#this function receives a listoflist object
def inject_user_item_in_summary(data,inject_window_size = 15,save_to = 'ui_injection_test.txt'):
    #iterate over rows of dataset
    for index,row in data.iterrows():
        u = row['user_id']
        i = row['token_name']
        lst = row['summary'].split()
        #print(lst)
        idx = 0
        while(idx <=len(lst)):
            [lst.insert(idx,a) for a in [i,u]]
            idx +=inject_window_size
        row['summary'] = ' '.join(lst)
    
    arr = np.array(data.summary)
    #print(arr)
    filename = os.path.join('data',save_to)
    np.savetxt(filename,arr,fmt='%-5s',encoding='utf-8')
    print('Injection File successfully saved.')

In [93]:
inject_user_item_in_summary(df1,inject_window_size=15)

Injection File successfully saved.


In [98]:
inject_user_item_in_summary(test,inject_window_size=15)

Injection File successfully saved.


In [118]:
build_data('movies')

Summary Data successfully saved in file:"data/summary_movies.txt"


In [44]:
build_data('tv')

Summary Data successfully saved in file:"data/summary_tv.txt"


In [44]:
vocab = extract_summary_data('tv')

In [46]:
len(vocab)

3811645

In [64]:
def map_words(word_list):
    '''
    function maps word list to numbers. Unique words have unique ids.
    Args:
        word_list: list of words to map to numbers
    Returns:
        A number List with words mapped as numbers, word-number mapping dictionary and number-word mapping dictionary.
    '''
    #find unique words
    unique_words = set(word_list)
    #create hashing dict for unique words
    map_dict = {}
    reverse_map_dict = {}
    #assign IDs to unique words
    for ID,word in enumerate(unique_words):
        map_dict[word] = ID
        reverse_map_dict[ID] = word

    #apply the hashing on the vocab data
    data = list()
    for word in word_list:
        index = map_dict[word]
        data.append(index)

    return data,map_dict,reverse_map_dict

In [65]:
word_idx,map_dict,reverse_map_dict = map_words(vocab)

## Word2Vec using lib gensim

In [41]:
def read_data(dfname):
    
    #load data from file
    filename = 'data//summary_{}.txt'.format(dfname)
    with open(filename,'rb') as f:
        for line in f:
            yield gensim.utils.simple_preprocess(line)
    
    #df = pd.read_csv(filename,delimiter = '|',engine = 'python')
    
    #initialize string var
    #string = ''
    #concat all the summary text if it is not null
    #summary = [str(summary) for summary in df.summary if (summary !=None)]
    #arr = np.array(summary)
    #string = str(summary).replace('\n','')
    #return [gensim.utils.simple_preprocess(string)]

In [48]:
lst = list(read_data('test2'))

In [49]:
len(lst)

3

In [4]:
import shutil

In [6]:
#combine summaries of all datasets into a single file
dfname = ['movies','games','music','tv']
outfile = 'data//summary_all.txt'
with open(outfile,'wb') as out:
    for fname in dfname:
        infile = 'data//summary_{}.txt'.format(fname)
        with open(infile,'rb') as f:
            shutil.copyfileobj(f,out)

### Word2Vec for 'TV'

In [5]:
#build_data('tv')
data = list(read_data('tv'))

In [6]:
len(data)

65599

In [46]:
data[:3]

[['just',
  'finished',
  'watching',
  'st',
  'episode',
  'of',
  'season',
  'never',
  'have',
  'ever',
  'watched',
  'anything',
  'as',
  'emotionally',
  'mentally',
  'engaging',
  'as',
  'this',
  'have',
  'so',
  'far',
  'watched',
  'some',
  'of',
  'the',
  'best',
  'shows',
  'on',
  'tv',
  'likes',
  'of',
  'sopranos',
  'the',
  'wire',
  'mad',
  'men',
  'breaking',
  'bad',
  'homicide',
  'life',
  'on',
  'the',
  'streets',
  'despite',
  'very',
  'modest',
  'premise',
  'rectify',
  'has',
  'been',
  'the',
  'best',
  'tv',
  'series',
  'to',
  'showcase',
  'the',
  'human',
  'connection',
  'of',
  'sorts',
  'in',
  'very',
  'real',
  'profound',
  'way',
  'possible'],
 ['simply',
  'one',
  'of',
  'the',
  'greatest',
  'works',
  'of',
  'fiction',
  'ever',
  'filmed',
  'the',
  'one',
  'show',
  'in',
  'history',
  'that',
  'will',
  'cause',
  'you',
  'to',
  'go',
  'back',
  'and',
  'ponder',
  'each',
  'scene',
  'never',
  'mi

In [47]:
m2 = gensim.models.Word2Vec (data,size = 100, min_count = 1, window = 2,workers = 2,sg=1)
m2.train(data,total_examples = len(data),epochs=10)

(27761227, 36454710)

In [48]:
m2.wv.most_similar(positive='never')

  if np.issubdtype(vec.dtype, np.int):


[('rarely', 0.672122597694397),
 ('hasnt', 0.6523046493530273),
 ('havenâ', 0.6456286311149597),
 ('haven', 0.64161616563797),
 ('havent', 0.6347654461860657),
 ('ever', 0.6313737034797668),
 ('videodrome', 0.6168889999389648),
 ('havn', 0.6090279221534729),
 ('havnt', 0.6024558544158936),
 ('wouldnt', 0.6001440286636353)]

In [49]:
m2.wv.most_similar('engaging')

  if np.issubdtype(vec.dtype, np.int):


[('compelling', 0.8460767865180969),
 ('intriguing', 0.8417369723320007),
 ('entertaining', 0.8291188478469849),
 ('interesting', 0.815762460231781),
 ('engrossing', 0.7920169234275818),
 ('captivating', 0.790175199508667),
 ('suspenseful', 0.7781546115875244),
 ('enjoyable', 0.7746325731277466),
 ('absorbing', 0.7668488621711731),
 ('inventive', 0.7593327760696411)]

### Word2Vec for Movies

In [None]:
build_data('movies')

In [7]:
movieData = list(read_data('movies'))

In [8]:
len(movieData)

267827

In [120]:
movieData[:3]

[['yang',
  'kasih',
  'rate',
  'berarti',
  'sengaja',
  'menjatuhkan',
  'karya',
  'orang',
  'lain',
  'lu',
  'pikir',
  'bikin',
  'film',
  'itu',
  'mudah',
  'lu',
  'pikir',
  'penilaian',
  'sebuah',
  'film',
  'bagus',
  'atau',
  'tidak',
  'cuma',
  'dinilai',
  'dari',
  'plot',
  'nya',
  'ceritanya',
  'saja',
  'misalnya',
  'kalo',
  'mau',
  'kontribusi',
  'kasih',
  'rate',
  'yang',
  'objektif',
  'lah',
  'pertimbangkan',
  'aspek',
  'lain',
  'dalam',
  'film',
  'tersebut',
  'unsur',
  'intrinsik',
  'film',
  'saya',
  'pribadi',
  'sangat',
  'tidak',
  'suka',
  'dengan',
  'orang',
  'yang',
  'tidak',
  'menghargai',
  'karya',
  'orang',
  'lain',
  'dengan',
  'melakukan',
  'penilaian',
  'hanya',
  'dari',
  'aspek',
  'tertentu',
  'tidak',
  'menilainya',
  'secara',
  'keseluruhan',
  'semua',
  'aspek',
  'yg',
  'ada',
  'dinilai',
  'serta',
  'memberikan',
  'penilaian',
  'secara',
  'obyektif',
  'bukannya',
  'berdasarkan',
  'emosi',
 

In [122]:
model_movies = gensim.models.Word2Vec(movieData,size =100, window =2,workers=5,sg=1)

### Word2Vec for games reviews

In [123]:
build_data('games')

Summary Data successfully saved in file:"data/summary_games.txt"


In [9]:
gamesData = list(read_data('games'))

In [10]:
len(gamesData)

1091406

In [125]:
model_games = gensim.models.Word2Vec(gamesData,size =300, window =2,workers=5,sg=1)

### Word2Vec for music reviews

In [126]:
build_data('music')

Summary Data successfully saved in file:"data/summary_music.txt"


In [11]:
musicData = list(read_data('music'))

In [12]:
len(musicData)

155746

In [128]:
model_music = gensim.models.Word2Vec(musicData,size = 150,window=2,workers=5,sg=1)

### Word2Vec for all datasets combined

In [9]:
data = list(read_data('all'))

In [10]:
len(data)

1580578

In [None]:
# model to create and optimize
#model_all = gensim.models.Word2Vec(data)

### Train models

In [129]:
model_movies.train(movieData,total_examples = len(movieData),epochs=10)

(148515593, 195400030)

In [130]:
model_games.train(gamesData,total_examples=len(gamesData),epochs=10)

(352055544, 471441910)

In [131]:
model_music.train(musicData,total_examples=len(musicData),epochs=10)

(53660828, 71202630)

### Test models

In [132]:
model_movies.wv.most_similar('good')

  if np.issubdtype(vec.dtype, np.int):


[('great', 0.8382713198661804),
 ('decent', 0.8184181451797485),
 ('bad', 0.7779989242553711),
 ('solid', 0.7406041622161865),
 ('nice', 0.7196515202522278),
 ('goood', 0.7007834911346436),
 ('cool', 0.6978937387466431),
 ('fantastic', 0.6814126968383789),
 ('awesome', 0.6744445562362671),
 ('fine', 0.6724880933761597)]

In [133]:
model_games.wv.most_similar('bloody')

  if np.issubdtype(vec.dtype, np.int):


[('gory', 0.5058069229125977),
 ('grusome', 0.47257423400878906),
 ('damn', 0.4709516167640686),
 ('freaking', 0.4672619104385376),
 ('brutal', 0.43675684928894043),
 ('ckin', 0.43485718965530396),
 ('violent', 0.42983636260032654),
 ('gorey', 0.42204925417900085),
 ('gore', 0.4160231351852417),
 ('goddamn', 0.41564396023750305)]

In [134]:
model_music.wv.most_similar('rocking')

  if np.issubdtype(vec.dtype, np.int):


[('rockin', 0.613544225692749),
 ('jammy', 0.6084136962890625),
 ('churning', 0.5859770178794861),
 ('pumping', 0.5426751375198364),
 ('crunching', 0.5380844473838806),
 ('pumpin', 0.5263203382492065),
 ('rocker', 0.5259642601013184),
 ('headbanging', 0.5214914083480835),
 ('swinging', 0.5204938650131226),
 ('grooving', 0.5151990056037903)]

In [135]:
model_music.wv.most_similar('peaceful')

  if np.issubdtype(vec.dtype, np.int):


[('relaxing', 0.6438614726066589),
 ('ethereal', 0.6270212531089783),
 ('sensuous', 0.6160961389541626),
 ('ghostly', 0.6077392101287842),
 ('airy', 0.607119619846344),
 ('methodical', 0.6056900024414062),
 ('heartrending', 0.6018016934394836),
 ('dreamlike', 0.5949757695198059),
 ('vibey', 0.594833254814148),
 ('calm', 0.5924388766288757)]

### save models

In [140]:
m2.save('data//word2vec_tv.model')

In [141]:
model_movies.save('data//word2vec_movies.model')

In [142]:
model_games.save('data//word2vec_games.model')

In [143]:
model_music.save('data//word2vec_music.model')

In [101]:
from importlib import reload

In [94]:
import Word2Vec

In [None]:
def word_keep_rule(word, word_count, min_count):
    """
    This function is used only by gensim's word2vec. A rule to decide whether to keep a word or discard it in gensim.word2vec model.
    This rule is used to keep injected users and items in the word2vec vocabulary. We want all of these so that we can recommend things to any user or item.
    Gensim's word2vec will call this rule on every vocab word to decide if its kept. 
    If the word is prefixed by 'u:' or 'm:', we keep it. Otherwise, we choose the default behavior (which is: discard it if word_count < min_count).
    For a reference on this default behavior, see
    https://github.com/piskvorky/gensim/blob/develop/gensim/models/word2vec.py#L404
    """
    if word[:1] == 'u' or word[:1] == 'i':
        return gensim.utils.RULE_KEEP
    return gensim.utils.RULE_DEFAULT

In [102]:
reload(Word2Vec)

<module 'Word2Vec' from 'C:\\Users\\Gurpreet\\Documents\\Coding practices\\Python\\Independent Study\\Word2Vec.py'>

In [107]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gurpreet\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [108]:
data = Word2Vec.ready_data('movies_injected')

In [109]:
data[:3]

[['ilmi_',
  'citizen',
  'kane',
  'ilmi_',
  'citizen',
  'kane',
  'yang',
  'kasih',
  'rate',
  'berarti',
  'sengaja',
  'menjatuhkan',
  'karya',
  'orang',
  'lain',
  'lu',
  'ilmi_',
  'citizen',
  'kane',
  'pikir',
  'bikin',
  'ilmi_',
  'citizen',
  'kane',
  'film',
  'itu',
  'mudah',
  'lu',
  'pikir',
  'penilaian',
  'sebuah',
  'film',
  'ilmi_',
  'citizen',
  'kane',
  'bagus',
  'atau',
  'tidak',
  'cuma',
  'ilmi_',
  'citizen',
  'kane',
  'dinilai',
  'dari',
  'plot',
  'nya',
  'ceritanya',
  'saja',
  'ilmi_',
  'citizen',
  'kane',
  'misalnya',
  'kalo',
  'mau',
  'kontribusi',
  'kasih',
  'ilmi_',
  'citizen',
  'kane',
  'rate',
  'yang',
  'objektif',
  'lah',
  'pertimbangkan',
  'ilmi_',
  'citizen',
  'kane',
  'aspek',
  'lain',
  'dalam',
  'film',
  'tersebut',
  'unsur',
  'intrinsik',
  'film',
  'ilmi_',
  'citizen',
  'kane',
  'saya',
  'pribadi',
  'sangat',
  'ilmi_',
  'citizen',
  'kane',
  'tidak',
  'suka',
  'dengan',
  'orang',
  

In [110]:
#create model
model = gensim.models.Word2Vec(sentences=data,size = 300,alpha=0.05,window=4,min_count=5,workers=5,sg=1,hs=0,negative=10)

#train model
model.train(data,total_examples=len(data),epochs=5)

(65454741, 69173480)

In [None]:
#Test Model
model.wv.most_similar('citizen')

In [111]:
model.wv.most_similar('superman')

  if np.issubdtype(vec.dtype, np.int):


[('batman', 0.8830540180206299),
 ('justice', 0.8458367586135864),
 ('dawn', 0.7178886532783508),
 ('luthor', 0.6165639758110046),
 ('lex', 0.6093013286590576),
 ('bvs', 0.5565556883811951),
 ('steel', 0.5216608047485352),
 ('chestercheeto', 0.5147061347961426),
 ('batfleck', 0.5068020820617676),
 ('ifoundmycookie', 0.5067472457885742)]