In [104]:
from pathlib import Path
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

### Loading Data

In [40]:
# The training file is tab-separated and has no header row, so the five
# column names are assigned by hand after reading.
df_train = pd.read_csv("traindata.csv", header=None, sep='\t')
df_train.columns = [
    "polarity",
    "aspect_category",
    "target_term",
    "character_offset",
    "sentence",
]
df_train.head(n=2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...


In [45]:
# The dev file is tab-separated and has no header row, so the five
# column names are assigned by hand after reading.
df_dev = pd.read_csv("devdata.csv", header=None, sep='\t')
df_dev.columns = [
    "polarity",
    "aspect_category",
    "target_term",
    "character_offset",
    "sentence",
]
df_dev.head(n=2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ..."
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.


In [71]:
def _polarity_to_label(polarity):
    """Map a polarity string to an integer class: positive -> 1, neutral -> 0, anything else -> -1."""
    if polarity == "positive":
        return 1
    if polarity == "neutral":
        return 0
    return -1


# Add the integer target column to both splits.
df_train["label"] = df_train["polarity"].apply(_polarity_to_label)
df_dev["label"] = df_dev["polarity"].apply(_polarity_to_label)
df_dev.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ...",1
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.,-1


### Word2vec

In [23]:
# NOTE(review): absolute, machine-specific path; point this at your own data directory.
PATH_TO_DATA = Path('C:/Users/Armand/Desktop/3A/Deep Learning/nlp_project/nlp_project/')
FASTTEXT_BASE_URL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/'


def _fetch_embeddings(filename):
    """Return the local path of a fastText archive, downloading it first if it is missing.

    The archive is downloaded to a temporary ``<filename>.part`` file and only
    renamed to its final name once the transfer has finished. An interrupted
    download therefore never leaves a truncated archive that a later run would
    mistake for a complete one.
    """
    target = PATH_TO_DATA / filename
    if not target.exists():
        partial = target.with_name(target.name + '.part')
        urlretrieve(FASTTEXT_BASE_URL + filename, partial)
        partial.replace(target)
    return target


# Download word vectors, might take a few minutes and about ~3GB of storage space
en_embeddings_path = _fetch_embeddings('cc.en.300.vec.gz')
fr_embeddings_path = _fetch_embeddings('cc.fr.300.vec.gz')

In [20]:
from collections import defaultdict
import gzip
import numpy as np
from pathlib import Path
from urllib.request import urlretrieve

import difflib
import re
import numpy as n

In [21]:
class Word2Vec():
    """Lookup table over pretrained word vectors stored in a gzipped text file.

    Attributes set by ``__init__``:
        words:      list of the loaded words, in file order.
        embeddings: 2-D array with one row per loaded word.
        word2id / id2word: dictionaries mapping a word to its row and back.
    """

    def __init__(self, filepath, vocab_size=50000):
        """Load at most ``vocab_size`` vectors from ``filepath`` and build the lookup maps."""
        self.words, self.embeddings = self.load_wordvec(filepath, vocab_size)
        # Mappings for O(1) retrieval:
        self.word2id = {word: idx for idx, word in enumerate(self.words)}
        self.id2word = {idx: word for idx, word in enumerate(self.words)}

    def load_wordvec(self, filepath, vocab_size):
        """Read the first ``vocab_size`` entries of a ``.gz`` word-vector file.

        The first line of the file is skipped as a header. Every following
        line is expected to be ``word v1 v2 ... vn`` separated by spaces.

        Returns:
            (words, embeddings): the list of words and the stacked 2-D array
            of their vectors, in file order.
        """
        assert str(filepath).endswith('.gz'), 'expected a gzip-compressed (.gz) vector file'
        words = []
        embeddings = []
        with gzip.open(filepath, 'rt', encoding="utf8") as f:  # Read compressed file directly
            next(f)  # Skip header
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                words.append(word)
                embeddings.append(np.fromstring(vec, sep=' '))
                if i == (vocab_size - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(words)))
        return words, np.vstack(embeddings)

    def _zero_vector(self):
        """Return a fresh all-zero vector with the same dimension as the embeddings."""
        return np.zeros(self.embeddings.shape[1])

    def encode(self, word):
        """Return the 1-D embedding of ``word``.

        Lookup order:
          1. exact match in the vocabulary;
          2. closest vocabulary word according to ``difflib.get_close_matches``
             (a scan over the whole vocabulary, so it is slow for large vocabularies);
          3. an all-zero vector when nothing is close enough or ``word`` is not a string.

        The fallback is a zero *vector* rather than the scalar ``0`` so that
        callers can stack or average the results without shape errors.
        """
        if not isinstance(word, str):
            return self._zero_vector()
        idx = self.word2id.get(word)
        if idx is None:
            closest = difflib.get_close_matches(word, self.words, n=1)
            if not closest:
                return self._zero_vector()
            idx = self.word2id[closest[0]]
        return self.embeddings[idx]

    def score(self, word1, word2):
        """Return the cosine similarity between the embeddings of two words.

        Returns ``0.0`` when either embedding has zero norm (e.g. an unknown
        word), instead of dividing by zero and producing NaN.
        """
        code1 = self.encode(word1)
        code2 = self.encode(word2)
        denominator = np.linalg.norm(code1) * np.linalg.norm(code2)
        if denominator == 0:
            return 0.0
        return np.dot(code1, code2) / denominator

In [25]:
class BagOfWords():
    """Sentence encoder that averages word vectors, optionally weighted by idf.

    ``word2vec`` must expose ``encode(word)`` returning a 1-D vector (or the
    scalar ``0`` for an unknown word) and a 2-D ``embeddings`` array whose
    second dimension is the vector size.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec

    @staticmethod
    def _tokenize(sentence):
        """Split ``sentence`` into words, treating every non-word character as a separator."""
        return re.sub(r"[^\w]", " ", sentence).split()

    def build_idf(self, sentences):
        """Build the idf dictionary ``{word: log10(N / document_frequency)}``.

        ``N`` is the number of sentences; each sentence counts a word at most
        once, however many times the word appears in it.
        """
        N = len(sentences)

        # Number of sentences containing each word.
        document_frequency = {}
        for sentence in sentences:
            for word in set(self._tokenize(sentence)):
                document_frequency[word] = document_frequency.get(word, 0) + 1

        # Turn the counts into idf values.
        return {word: np.log10(N / count) for word, count in document_frequency.items()}

    def encode(self, sentence, idf=None):
        """Return the embedding of ``sentence``.

        Without ``idf`` the result is the plain mean of the word vectors;
        with ``idf`` it is the idf-weighted mean (words missing from ``idf``
        get weight 0).

        Edge cases:
          * a sentence without any word yields an all-zero vector;
          * a scalar ``0`` from the word encoder is treated as a zero vector;
          * if every idf weight is 0 the plain mean is returned, because the
            weighted mean is undefined.
        """
        tokens = self._tokenize(sentence)
        dim = self.word2vec.embeddings.shape[1]
        if not tokens:
            return np.zeros(dim)

        # Row assignment broadcasts a scalar 0 (unknown word) to a zero row.
        vectors = np.zeros((len(tokens), dim))
        for row, token in enumerate(tokens):
            vectors[row] = self.word2vec.encode(token)

        if idf is None:
            # mean of word vectors
            return vectors.mean(axis=0)

        # idf-weighted mean of word vectors
        weights = np.array([idf.get(token, 0) for token in tokens], dtype=float)
        total_weight = weights.sum()
        if total_weight == 0:
            return vectors.mean(axis=0)
        return np.dot(weights, vectors) / total_weight

    def score(self, sentence1, sentence2, idf=None):
        """Return the cosine similarity between two sentence embeddings.

        Returns ``0.0`` when either embedding has zero norm instead of
        dividing by zero.
        """
        code1 = self.encode(sentence1, idf)
        code2 = self.encode(sentence2, idf)
        denominator = np.linalg.norm(code1) * np.linalg.norm(code2)
        if denominator == 0:
            return 0.0
        return np.dot(code1, code2) / denominator

In [26]:
# Load the first 50000 English vectors from the archive and wrap them in
# the bag-of-words sentence encoder.
word2vec = Word2Vec(filepath=en_embeddings_path, vocab_size=50000)
sentence2vec = BagOfWords(word2vec=word2vec)

Loaded 50000 pretrained word vectors


### First Model

#### Encode the full sentence with word2vec (mean of its word vectors), then classify with logistic regression

In [42]:
# Quick check: embed one training sentence (row label 1).
a = sentence2vec.encode(df_train.loc[1, "sentence"])

In [44]:
# One sentence embedding per training row, in row order.
train_sentences_emb = [sentence2vec.encode(text) for text in df_train["sentence"]]

In [46]:
# One sentence embedding per dev row, in row order.
dev_sentences_emb = [sentence2vec.encode(text) for text in df_dev["sentence"]]

In [55]:
from sklearn.linear_model import LogisticRegression

# Candidate values for C (inverse regularisation strength): 10**-2 up to
# 10**1.5 in steps of half a decade.
pen_values = 10.0 ** np.arange(-2, 2, 0.5)
train_acc = []
dev_acc = []

# Fit one L2-regularised model per candidate and record both accuracies.
for pen in pen_values:
    logReg = LogisticRegression(penalty="l2", C=pen, multi_class='auto', solver='newton-cg')
    logReg.fit(train_sentences_emb, df_train["label"])
    train_acc.append(logReg.score(train_sentences_emb, df_train["label"]))
    dev_acc.append(logReg.score(dev_sentences_emb, df_dev["label"]))

# Keep the candidate with the highest dev accuracy.
best_idx = np.argmax(dev_acc)
best_pen = pen_values[best_idx]
best_train_acc = train_acc[best_idx]
best_dev_acc = max(dev_acc)

print(
    "Results for mean BoW: \n",
    "Best value for the penalty:", best_pen,
    '\n Dev accuracy:', best_dev_acc,
    '\n Train accuracy:', best_train_acc,
)

Results for mean BoW: 
 Best value for the penalty: 10.0 
 Dev accuracy: 0.7686170212765957 
 Train accuracy: 0.8416500332667998


In [57]:
# Classifier configured with C = 10.
logReg = LogisticRegression(
    penalty="l2",
    C=10,
    multi_class='auto',
    solver='newton-cg',
)

In [69]:
# Fit on the training embeddings, then predict the dev set; the trailing
# semicolon suppresses the cell output.
logReg.fit(X=train_sentences_emb, y=df_train["label"])
logReg.predict(X=dev_sentences_emb);

### Second Model

#### Same as the first model, but with the target term and stopwords removed from the sentence

In [196]:
def _cut_target_span(sentence, character_offset):
    """Return ``sentence`` without the characters in the ``"start:end"`` span of ``character_offset``."""
    bounds = character_offset.split(":")
    return sentence[:int(bounds[0])] + sentence[int(bounds[1]):]


# Training split: sentence with the target term sliced out.
sentence_red = [
    _cut_target_span(text, offset)
    for text, offset in zip(df_train["sentence"], df_train["character_offset"])
]
df_train["sentence_red"] = sentence_red

# Dev split: same treatment.
sentence_red = [
    _cut_target_span(text, offset)
    for text, offset in zip(df_dev["sentence"], df_dev["character_offset"])
]
df_dev["sentence_red"] = sentence_red

In [197]:
# Check the new `sentence_red` column on the first two rows.
df_train.head(n=2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label,sentence_red
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,1,"short and sweet – is great:it's romantic,cozy..."
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,1,This quaint and romantic is at the top of my ...


In [198]:
# NLTK's English stopword list minus the negations "not" and "no"
# (presumably kept because they carry polarity information).
stop_words = set(stopwords.words('english'))
for negation in ("not", "no"):
    stop_words.remove(negation)

def remove_stopwords(sentence):
    """Lower-case ``sentence`` and drop every token found in ``stop_words``.

    The sentence is tokenized with ``word_tokenize`` and the remaining tokens
    are joined with single spaces, so the result never carries a trailing
    space, whichever token was removed.

    Returns:
        The filtered sentence, or the lower-cased input unchanged when fewer
        than two characters would remain after filtering.
    """
    lowered = sentence.lower()
    kept_tokens = [token for token in word_tokenize(lowered) if token not in stop_words]
    filtered = " ".join(kept_tokens)
    # Fall back to the unfiltered text rather than return an (almost) empty sentence.
    if len(filtered) < 2:
        return lowered
    return filtered

In [199]:
# Strip stopwords from the target-free sentences of both splits.
for frame in (df_train, df_dev):
    frame["sentence_red"] = frame["sentence_red"].apply(remove_stopwords)
df_train.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label,sentence_red
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,1,"short sweet – great : 's romantic , cozy priva..."
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,1,quaint romantic top manhattan restaurant list .


In [201]:
# One embedding per reduced training sentence, in row order.
train_sentences_emb2 = [sentence2vec.encode(text) for text in df_train["sentence_red"]]

In [202]:
# One embedding per reduced dev sentence, in row order.
dev_sentences_emb2 = [sentence2vec.encode(text) for text in df_dev["sentence_red"]]

In [203]:
from sklearn.linear_model import LogisticRegression

# Candidate values for C (inverse regularisation strength): 10**-2 up to
# 10**1.5 in steps of half a decade.
pen_values = 10.0 ** np.arange(-2, 2, 0.5)
train_acc = []
dev_acc = []

# Fit one L2-regularised model per candidate on the reduced-sentence
# embeddings and record both accuracies.
for pen in pen_values:
    logReg = LogisticRegression(penalty="l2", C=pen, multi_class='auto', solver='newton-cg')
    logReg.fit(train_sentences_emb2, df_train["label"])
    train_acc.append(logReg.score(train_sentences_emb2, df_train["label"]))
    dev_acc.append(logReg.score(dev_sentences_emb2, df_dev["label"]))

# Keep the candidate with the highest dev accuracy.
best_idx = np.argmax(dev_acc)
best_pen = pen_values[best_idx]
best_train_acc = train_acc[best_idx]
best_dev_acc = max(dev_acc)

print(
    "Results for mean BoW: \n",
    "Best value for the penalty:", best_pen,
    '\n Dev accuracy:', best_dev_acc,
    '\n Train accuracy:', best_train_acc,
)

Results for mean BoW: 
 Best value for the penalty: 10.0 
 Dev accuracy: 0.7978723404255319 
 Train accuracy: 0.8642714570858283
