In [1]:
import numpy as np
import pandas as pd
import os

domains = ['books', 'dvd', 'kitchen', 'electronics', 'movies']

causalm_dir = os.path.expanduser("~") + "/GoogleDrive/AmirNadav/CausaLM"
data_dir = causalm_dir + "/Data/Sentiment/"
ima_data_dir = data_dir + "IMA/"
oob_data_dir = data_dir + "OOB/"

print(os.listdir(ima_data_dir + "books/"))



['features', 'bert-base-cased_epoch_1_metrics.json', 'model (2)', 'bert-base-cased_epoch_0_metrics.json', 'bert-base-cased_epoch_0.json', 'bert-base-cased_epoch_2.json', 'bert-base-cased_epoch_2_metrics.json', 'bert-base-cased_epoch_1.json']


In [12]:
books_raw_data = data_dir + "Raw/" + domains[0] + "/train.csv"

df = pd.read_csv(books_raw_data)
cols = ["id","label","review","no_adj_review"]
df.columns = cols

print(df.head())

     id  label                                             review  \
0  1782      1  A great twist on a well - used plot Trapping a...   
1  1400      1  Georges Lopez is the Star What a splendid teac...   
2   242      0  Very Poor Quality DVD Love the movie , but the...   
3   526      0  Sound Problems I bought this back in October o...   
4  1430      1  Mark Twain Tonight I grew up in Hannibal Mo , ...   

                                       no_adj_review  
0  A twist on a well - used plot Trapping a group...  
1  Georges Lopez is the Star What a teacher is th...  
2  Very Quality DVD Love the movie , but the qual...  
3  Sound Problems I bought this back in October o...  
4  Mark Twain Tonight I grew up in Hannibal Mo , ...  


In [13]:
import torch
from pytorch_transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [82]:
def get_bert_word_embedding(sentence):
    sentence_split = sentence.split(' ')
    len_sentence = 0
    sentence_split_fixed = []
    sentence_fixed = ''
    for word in sentence_split:
        if len_sentence + len(word) < 500:
            sentence_fixed += word + ' ' 
            sentence_split_fixed.append(word)
            len_sentence += len(word) + 1
        else:
            break
    input_ids = torch.tensor(tokenizer.encode(sentence_fixed)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    word_embeddings = []
    for i, word in enumerate(sentence_split_fixed):
        cur_word_embeddings = last_hidden_states[0][i].tolist()
        word_embeddings.append(cur_word_embeddings)
    return word_embeddings[0]

In [56]:
sentence = 'What is the fastest car in the world'
word_embeddings = get_bert_word_embedding(sentence)
print(word_embeddings)

[-0.3656924366950989, -0.09512924402952194, 0.18365707993507385, 0.2723608911037445, 0.5743860006332397, 0.4922480583190918, -0.1531531810760498, 0.07802216708660126, -0.0696071982383728, -0.581312894821167, -0.029274992644786835, -0.381781667470932, 0.13323920965194702, 0.43264666199684143, -0.479882150888443, 0.5546420216560364, -0.3332359492778778, 0.027778565883636475, -0.3555541932582855, -0.5371190309524536, 0.3055431544780731, 0.08803030103445053, -0.019087469205260277, -0.11624623090028763, 0.15585054457187653, -0.01128866896033287, 0.13352501392364502, 0.04659359157085419, -0.16468073427677155, 0.01099395751953125, 0.1667039394378662, -0.3718673288822174, 0.2399977147579193, -0.2195943295955658, -0.40681418776512146, -0.5161871910095215, 0.3323599100112915, -0.18204036355018616, 0.1687781661748886, -0.3986108899116516, -0.22871564328670502, -0.13814063370227814, 0.09727038443088531, -0.18687182664871216, 0.277931809425354, 0.11048579216003418, -0.7545815110206604, 0.1179383546

In [57]:
word_embed_cols = ['embedding_' + str(i) for i in range(1,769)]
all_cols = cols + word_embed_cols

In [83]:
import pandas as pd
import string

# df['bert_representation'] = df['review'].apply(get_bert_word_embedding)
df['bert_representation_no_adj'] = df['no_adj_review'].apply(get_bert_word_embedding)


print(df.head())

     id  label                                             review  \
0  1782      1  A great twist on a well - used plot Trapping a...   
1  1400      1  Georges Lopez is the Star What a splendid teac...   
2   242      0  Very Poor Quality DVD Love the movie , but the...   
3   526      0  Sound Problems I bought this back in October o...   
4  1430      1  Mark Twain Tonight I grew up in Hannibal Mo , ...   

                                       no_adj_review  \
0  A twist on a well - used plot Trapping a group...   
1  Georges Lopez is the Star What a teacher is th...   
2  Very Quality DVD Love the movie , but the qual...   
3  Sound Problems I bought this back in October o...   
4  Mark Twain Tonight I grew up in Hannibal Mo , ...   

                                 bert_representation  \
0  [0.6499998569488525, -0.17038503289222717, 0.6...   
1  [0.11436855792999268, -0.19218140840530396, 0....   
2  [0.38766229152679443, -0.7057222127914429, 0.5...   
3  [0.3676615059375763, 

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)

In [85]:
train['bert_representation'].head()
train['bert_representation_no_adj'].head()

917     [0.7768940925598145, -0.06158752366900444, 0.5...
275     [0.47451451420783997, 0.1284942626953125, 0.18...
533     [0.16240538656711578, -0.03657965362071991, 0....
1046    [0.49242350459098816, 0.07956219464540482, 0.3...
999     [0.4576820731163025, -0.6428127884864807, 0.75...
Name: bert_representation_no_adj, dtype: object

In [89]:
clf = LogisticRegression(random_state=0).fit(train['bert_representation'].values.tolist(), train['label'])

preds = clf.predict(test['bert_representation'].values.tolist())
preds_probs = clf.predict_proba(test['bert_representation'].values.tolist())

print("Accuracy on train set: " + str(clf.score(train['bert_representation'].values.tolist(), train['label'])))
print("Accuracy on test set: " + str(clf.score(test['bert_representation'].values.tolist(), test['label'])))



Accuracy on train set: 0.96875
Accuracy on test set: 0.75390625


In [95]:
clf = LogisticRegression(random_state=0).fit(train['bert_representation_no_adj'].values.tolist(), train['label'])

preds_no_adj = clf.predict(test['bert_representation_no_adj'].values.tolist())
preds_probs_no_adj = clf.predict_proba(test['bert_representation_no_adj'].values.tolist())

print("No ADJ Accuracy on train set: " + str(clf.score(train['bert_representation_no_adj'].values.tolist(), train['label'])))
print("No ADJ Accuracy on test set: " + str(clf.score(test['bert_representation_no_adj'].values.tolist(), test['label'])))



No ADJ Accuracy on train set: 0.9423828125
No ADJ Accuracy on test set: 0.734375


In [97]:
preds_probs_no_adj_ones = [x[1] for x in preds_probs_no_adj]
preds_probs_ones = [x[1] for x in preds_probs]

In [103]:
from statistics import mean

avg_diff = mean(x - y for x, y in zip(preds_probs_no_adj_ones, preds_probs_ones))

print(avg_diff)              
                

0.028477517369342377
