In [2]:
import re
from collections import Counter

import gensim
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the raw New Delhi reviews CSV into a DataFrame.
df = pd.read_csv('../data/New_Delhi_reviews.csv')
# NOTE(review): only the last expression of a cell is rendered, so this
# head() call is evaluated but shows nothing here.
df.head()
# Print column dtypes and non-null counts.
df.info()
# Row count; as the last expression this is the value the cell displays.
len(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147581 entries, 0 to 147580
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   rating_review  147581 non-null  int64 
 1   review_full    147579 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


147581

In [5]:
# Drop rows with a missing value first, then cast only the review text to str.
# The original bare `df.astype(str)` discarded its result (a no-op). Assigning
# it before dropna would have turned NaN into the literal string 'nan', hiding
# the missing reviews from dropna, and would have stringified the integer
# ratings as well.
df = df.dropna().astype({'review_full': str})

def remove_emojis(data):
    """Return ``data`` with every run of emoji-like characters deleted.

    A single character class is assembled from the code-point ranges listed
    below and each maximal run of matching characters is replaced by the
    empty string.

    Args:
        data: Text to clean (str).

    Returns:
        The text with all matching characters removed.
    """
    char_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (regional indicators)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # same range again, kept so the pattern is unchanged
        "\U000024C2-\U0001F251",  # very wide span; also covers many non-emoji code points
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # everything outside the Basic Multilingual Plane
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    emoji_run = re.compile("[" + "".join(char_ranges) + "]+", re.UNICODE)
    return emoji_run.sub('', data)

# Series.apply returns a new Series. The original call threw that result
# away, so no emoji was ever stripped from the frame; store it back.
df['review_full'] = df['review_full'].apply(remove_emojis)
print(len(df))

147579


In [28]:
# DataFrame.astype returns a new frame, so the bare call was a no-op.
# Cast only the text column (ratings stay integer) and keep the result, so
# the string-only `preprocess` below always receives str values.
df = df.astype({'review_full': str})

def preprocess(text):
    """Normalise one review string.

    Steps, in order: remove URLs, repair the mis-decoded apostrophe sequence,
    delete every character that is neither a word character nor whitespace,
    and lower-case the result.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned, lower-cased text. Whitespace is left untouched, so a
        removed URL can leave a double space behind.
    """
    # Drop links; `http\S+` already matches https URLs as well.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Repair the mojibake apostrophe BEFORE stripping punctuation. Two of its
    # three characters are symbols, so the punctuation pass would otherwise
    # remove them and leave a stray 'â' inside the word (this replacement
    # could never match when it ran last).
    text = text.replace('â€™', '\'')
    # Raw string avoids the invalid-escape warning for \w and \s.
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

# Clean the review text in the frame, then tokenize each review into a list
# of words with gensim's simple_preprocess.
df['review_full'] = df['review_full'].apply(preprocess)
review_full = df['review_full'].apply(gensim.utils.simple_preprocess)

# Persist the tokenized reviews. Each cell of the CSV holds the text form of
# a Python list, not a real list.
review_full.to_csv(
    '../data/New_Delhi_review_full_preprocessed.csv',
    header=True,
    index=False,
)
print(review_full)

0         [totally, in, love, with, the, auro, of, the, ...
1         [went, this, bar, days, regularly, with, my, h...
2         [we, were, few, friends, and, was, birthday, c...
3         [fatjar, cafe, and, market, is, the, perfect, ...
4         [hey, guys, if, you, are, craving, for, pizza,...
                                ...                        
147576    [near, by, airport, very, calm, and, cool, env...
147577    [my, favourite, place, to, stay, great, servic...
147578    [good, food, with, nice, decoration, drinks, l...
147579    [near, to, airport, it, is, fine, property, st...
147580    [amazing, food, excellent, ambience, great, se...
Name: review_full, Length: 147579, dtype: object


In [29]:

# Create an untrained Word2Vec model: context window of 10 tokens, words seen
# fewer than 2 times are ignored, 8 worker threads.
w2v_model = gensim.models.Word2Vec(window=10, min_count=2, workers=8)

# Snapshot of the model before any vocabulary has been built or training run.
w2v_model.save("../models/word2vec.model")

In [31]:
# Re-display the tokenized reviews (the same Series printed right after preprocessing).
print(review_full)

0         [totally, in, love, with, the, auro, of, the, ...
1         [went, this, bar, days, regularly, with, my, h...
2         [we, were, few, friends, and, was, birthday, c...
3         [fatjar, cafe, and, market, is, the, perfect, ...
4         [hey, guys, if, you, are, craving, for, pizza,...
                                ...                        
147576    [near, by, airport, very, calm, and, cool, env...
147577    [my, favourite, place, to, stay, great, servic...
147578    [good, food, with, nice, decoration, drinks, l...
147579    [near, to, airport, it, is, fine, property, st...
147580    [amazing, food, excellent, ambience, great, se...
Name: review_full, Length: 147579, dtype: object


In [32]:
# Scan the tokenized reviews once to build the model's vocabulary;
# progress_per=1000 controls how often progress is reported.
w2v_model.build_vocab(review_full, progress_per=1000)

In [None]:
# NOTE(review): only a cell's last expression is rendered, so corpus_count is
# evaluated here but never shown.
w2v_model.corpus_count
# Number of training epochs configured on the model (the displayed value).
w2v_model.epochs

5

In [33]:
# Train on the tokenized reviews, reusing the sentence count recorded by
# build_vocab and the model's configured number of epochs.
w2v_model.train(review_full, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
# Save the trained model under a different name than the untrained snapshot.
w2v_model.save("../models/word2vec_trained1.model")

In [38]:
# Sanity check: list the words whose vectors are closest to 'restaurant'.
w2v_model.wv.most_similar('restaurant')

[('resturant', 0.8491902947425842),
 ('restuarant', 0.7860949635505676),
 ('restraunt', 0.7539316415786743),
 ('restaurent', 0.7270158529281616),
 ('eatery', 0.723108172416687),
 ('place', 0.7185909152030945),
 ('establishment', 0.6876535415649414),
 ('restro', 0.6711159348487854),
 ('hotel', 0.6590299606323242),
 ('resto', 0.6424369215965271)]

In [46]:
from gensim.models import KeyedVectors

# Keep only the word vectors of the trained model and store them on their
# own, separate from the full model files saved earlier.
word_vectors = w2v_model.wv
word_vectors.save('../vectors/word2vec.wordvectors')

In [56]:
from pandarallel import pandarallel
from torch.utils.data import DataLoader
import numpy as np
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Load the AG_NEWS train/test splits shipped with torchtext.
# NOTE(review): this dataset is unrelated to the New Delhi reviews CSV loaded
# at the top of the notebook; confirm whether this cell is still needed.
train_dataset, test_dataset = torchtext.datasets.AG_NEWS()

# torchtext's built-in 'basic_english' tokenizer.
tokenizer = get_tokenizer('basic_english')

def build_vocabulary(datasets):
    """Yield the token list of every text sample in every dataset.

    Each dataset is iterated as (label, text) pairs; the label is ignored and
    the text is split with the module-level ``tokenizer``.
    """
    for samples in datasets:
        yield from (tokenizer(sample_text) for _, sample_text in samples)
            
# Build a vocabulary over both splits, keeping every token (min_freq=1) and
# reserving "<UNK>" as a special token.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'NoneType' object has no attribute 'Lock'" while iterating
# the dataset. Presumably an optional dependency of the data pipeline
# (e.g. `portalocker`) is missing from the environment; confirm.
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=1, specials=["<UNK>"])

# Unknown tokens map to the "<UNK>" index instead of raising.
vocab.set_default_index(vocab['<UNK>'])

len(vocab)

AttributeError: 'NoneType' object has no attribute 'Lock'
This exception is thrown by __iter__ of _MemoryCellIterDataPipe(remember_elements=1000, source_datapipe=_ChildDataPipe)

In [42]:
#get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))

len of train:  132822
len of val:  14757


In [44]:
#create the sets
train = review_full.iloc[train_idx].reset_index().drop('index',axis=1)
val = review_full.iloc[val_idx].reset_index().drop('index',axis=1)

In [54]:
# Reload the word vectors saved earlier and look up the vector for 'food'
# as a quick check that loading worked.
wv = KeyedVectors.load('../vectors/word2vec.wordvectors')
wv['food']

array([ 1.2322295 , -2.533263  ,  4.8991046 ,  0.5825057 , -1.5545931 ,
        0.40067062,  0.86317253, -2.0588245 , -0.6580092 , -0.67514783,
        1.9956131 , -0.58563995, -0.6706407 , -0.03462851, -1.3572003 ,
        2.6702332 ,  2.540665  ,  1.7313213 , -4.3757763 ,  0.9102214 ,
       -0.15559351,  1.6675105 ,  0.96926063,  0.04930129, -0.09162337,
        3.1976    , -0.52714574,  5.159618  ,  1.3916937 ,  1.7566911 ,
       -2.0933444 , -1.8641555 ,  0.00623717,  1.3983766 , -0.5653813 ,
       -3.8838038 , -1.9897114 ,  0.4943971 ,  0.9171134 ,  2.327649  ,
        1.5870931 , -0.6732834 ,  1.1467767 ,  0.8152067 ,  1.1898382 ,
       -2.271523  , -2.9042141 , -1.4692385 ,  0.86227393,  2.2149816 ,
       -3.0892186 ,  1.4353077 ,  2.8152444 ,  0.37636015,  1.8115354 ,
        1.684896  , -2.0344048 ,  0.37378323,  0.16243279,  0.37646717,
        1.6196756 , -1.9874151 , -0.42933232,  4.313935  ,  0.92841923,
       -1.4316587 ,  0.07810935,  3.254333  ,  1.33098   , -1.56

In [None]:
class Vocabulary:
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens:
      <PAD>  padding for shorter sentences in a batch
      <SOS>  start-of-sentence marker
      <EOS>  end-of-sentence marker
      <UNK>  replacement for words that are not in the vocabulary
    Corpus words are numbered from 4 upwards in order of decreasing frequency.
    """

    def __init__(self, freq_threshold, max_size):
        """Create a vocabulary that holds only the special tokens.

        Args:
            freq_threshold: Minimum number of times a word must occur in the
                corpus to be admitted to the vocabulary.
            max_size: Maximum total vocabulary size, special tokens included
                (e.g. 10,000 keeps the most frequent words up to that size).
                ``None`` means no limit.
        """
        # index -> token
        self.itos = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        # token -> index
        self.stoi = {token: index for index, token in self.itos.items()}

        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer(text):
        """Lower-case ``text`` and split it on whitespace.

        Splitting on runs of whitespace (rather than on single spaces) means
        repeated spaces never produce empty-string tokens.
        """
        return text.lower().split()

    def build_vocabulary(self, sentence_list):
        """Fill ``stoi``/``itos`` from an iterable of sentences.

        Words occurring fewer than ``freq_threshold`` times are discarded; the
        remainder is sorted by decreasing frequency (ties keep first-seen
        order) and truncated so the total size does not exceed ``max_size``.
        Example result for stoi: {'the': 4, 'a': 5, 'an': 6}.
        """
        idx = 4  # first free index; 0-3 are taken by pad, start, end, unk

        frequencies = Counter()
        for sentence in sentence_list:
            frequencies.update(self.tokenizer(sentence))

        # Keep words that reach the documented minimum count (>=, not >).
        kept = [(word, count) for word, count in frequencies.items()
                if count >= self.freq_threshold]
        # Stable sort: equally frequent words stay in first-seen order.
        kept.sort(key=lambda item: -item[1])

        if self.max_size is not None:
            # Clamp at 0 so a max_size below 4 cannot become a negative
            # slice bound, which would keep an arbitrary prefix instead.
            kept = kept[:max(0, self.max_size - idx)]

        for word, _ in kept:
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

    def numericalize(self, text):
        """Convert ``text`` to a list of token ids; unknown words map to <UNK>."""
        unk_index = self.stoi['<UNK>']
        return [self.stoi.get(token, unk_index) for token in self.tokenizer(text)]

In [None]:
class RNN(nn.Module):
    """Multi-layer RNN sequence classifier.

    The input sequence is run through ``nn.RNN``; the output at the last time
    step is projected to ``num_classes`` scores and returned as
    log-probabilities, which is the input ``nn.NLLLoss`` expects.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # Classification head; the original never used num_classes.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size), batch first.

        Returns:
            Tensor of shape (batch, num_classes) holding log-probabilities.
        """
        # Zero initial hidden state, created on the input's device and dtype.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.rnn(x, h0)
        # Use only the output of the final time step for classification.
        logits = self.fc(out[:, -1, :])
        return torch.log_softmax(logits, dim=1)
        
# `RNN()` without arguments raises TypeError: the constructor requires
# input_size, hidden_size, num_layers and num_classes.
# NOTE(review): the values below are assumptions to confirm. input_size is
# taken from the loaded word vectors, num_classes from the distinct ratings
# in the data, and hidden_size / num_layers are placeholder hyperparameters.
rnn = RNN(
    input_size=wv.vector_size,
    hidden_size=128,
    num_layers=2,
    num_classes=df['rating_review'].nunique(),
).to(device)

In [None]:
# Negative log-likelihood loss; it expects log-probabilities as model output.
criterion = nn.NLLLoss()
# Learning rate. NOTE(review): no optimizer is created in the visible cells,
# so this value is not used yet.
learning_rate = 0.001

def train():
    
    
    loss.backward()
    