In [1]:
from torch.utils.data import Dataset, DataLoader
import requests
from os.path import isdir
import os
import gzip
import shutil
import json
import pandas as pd

categories = [
    'Books',
    'Electronics',
    'Movies and TV',
    'CDs and Vinyl',
    'Clothing, Shoes and Jewelry',
    'Home and Kitchen',
    'Kindle Store',
    'Sports and Outdoors',
    'Cell Phones and Accessories',
    'Health and Personal Care',
    'Toys and Games',
    'Video Games',
    'Tools and Home Improvement',
    'Beauty',
    'Apps for Android',
    'Office Products',
    'Pet Supplies',
    'Automotive',
    'Grocery and Gourmet Food',
    'Patio, Lawn and Garden',
    'Baby',
    'Digital Music',
    'Musical Instruments',
    'Amazon Instant Video'
]

data_modes = ['5-core', 'ratings only']

class AmazonReveiws(Dataset):
    def __init__(self, category, root_dir='.', data_mode='5-core' ,transform=None, split=(80,10,10)):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

        # creates root dir if does not exist
        if isdir(root_dir):
            self.root_dir = root_dir
        else:
            os.makedirs(root_dir)
            self.root_dir = root_dir

        # ensures a valid category is selected
        if category in categories:
            self.category = category.replace(',','').replace(' ', '_')
        else:
            raise ValueError(f'{category} is not a valid category')
        
        # ensures correct data mode is selected
        if data_mode in data_modes:
            self.data_mode = data_mode
        else:
            raise ValueError(f'{data_mode} is not a valid mode \n valid modes are {data_modes[0]} & {data_modes[1]}')
        
        self.__data_dir_base = f'{self.root_dir}/{self.category}/{self.data_mode}'
        
        # checks if data is downloaded
        if not isdir(self.__data_dir_base):
            os.makedirs(self.__data_dir_base)
            self.__download_data()
        

    def __download_data(self):

        if self.data_mode == '5-core':
            uri = f'reviews_{self.category}_5'
        else:
            uri = f'reviews_{self.category}'

        url = f'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/{uri}.json.gz'

        request = requests.get(url)

        with open(f'{self.__data_dir_base}/{uri}.json.gz', 'wb') as file1:
            file1.write(request.content)

        with gzip.open(f'{self.__data_dir_base}/{uri}.json.gz', 'rb') as f_in:
            with open(f'{self.__data_dir_base}/{uri}.json', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        json_file = open(f'{self.__data_dir_base}/{uri}.json', 'r')

        data = []

        keys = ['helpful','overall','reviewText']

        for line in json_file.readlines():
            data.append(json.loads(line))
        
        data = pd.DataFrame(data)
        data = data[keys]

        data.to_pickle(f'{uri}.pickle',compression={'method':'gzip'})
        print('we hit!!')

if __name__ == '__main__':
    !rm -rf Musical_Instruments
    for i in categories:
        AmazonReveiws(i)
    print('test')

KeyboardInterrupt: ignored

In [26]:
!pip install transformers
!pip install contractions
!pip install n2w
!pip install torchvision 
!pip install sklearn
!pip install matplotlib
!pip install google-colab
!pip install nltk
!pip install string
!pip install contractions
!pip install n2w



Collecting google-colab
  Downloading google-colab-1.0.0.tar.gz (72 kB)
     ---------------------------------------- 72.9/72.9 KB 2.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting google-auth~=1.4.0
  Downloading google_auth-1.4.2-py2.py3-none-any.whl (64 kB)
     ---------------------------------------- 64.2/64.2 KB 3.4 MB/s eta 0:00:00
Collecting ipykernel~=4.6.0
  Downloading ipykernel-4.6.1-py3-none-any.whl (104 kB)
     -------------------------------------- 104.5/104.5 KB 5.9 MB/s eta 0:00:00
Collecting ipython~=5.5.0
  Downloading ipython-5.5.0-py3-none-any.whl (758 kB)
     ------------------------------------- 758.9/758.9 KB 15.9 MB/s eta 0:00:00
Collecting notebook~=5.2.0
  Downloading notebook-5.2.2-py2.py3-none-any.whl (8.0 MB)
     ---------------------------------------- 8.0/8.0 MB 14.2 MB/s eta 0:00:00
Collecting six~=1.12.0
  Downloading six-1.12.0-py2.py3-none-any.whl (10 kB)
Collectin

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Aymane\\AppData\\Local\\Programs\\Python\\Python39\\Lib\\site-packages\\~ornado\\speedups.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.





ERROR: Could not find a version that satisfies the requirement string (from versions: none)
ERROR: No matching distribution found for string




In [27]:
## mount onto Shared Google Drive to load data

import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertModel
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
from google.colab import drive
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
import contractions
import n2w

nltk.download('stopwords')
nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google'

In [2]:
## functions for loading and storing files to/from google drive

def pickle_encoding(data, fname, dir='/content/gdrive/Shareddrives/EC523_project/encodings/'):
    in_data = {key : value.to('cpu')  for (key,value) in data.items()}

    with open(dir + fname, 'wb') as handle:
        pickle.dump(in_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def unpickle_encoding(fname, dir='/content/gdrive/Shareddrives/EC523_project/encodings/'):
    with open(dir + fname, 'rb') as handle:
        data = pickle.load(handle)
    return data
  
def pickle_reviews(data, fname, dir='/content/gdrive/Shareddrives/EC523_project/ratings/'):
    with open(dir + fname, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def unpickle_reviews(fname, dir='/content/gdrive/Shareddrives/EC523_project/encodings/'):
    with open(dir + fname, 'rb') as handle:
        data = pickle.load(handle)
    return data

def accuracy(net, input_id, mask,labels):
    j = len(labels)

    guesses = net(input_id, mask)
    guesses = torch.argmax(guesses,dim=1)
    current_real = torch.tensor(labels)-1
    running_acc = torch.sum(current_real==guesses)


    print(running_acc)
    print(j)

    print('Accuracy: %d %%' % ((running_acc / j) * 100.0))

In [24]:
## load data set

# data = unpickle_encoding(fname='reviews_Movies_and_TV_5.pkl', dir='/content/gdrive/Shareddrives/EC523_project/Data/')
#data = unpickle_encoding(fname='reviews_CDs_and_Vinyl_5.pkl', dir='/content/gdrive/Shareddrives/EC523_project/Data/')
data = unpickle_encoding(fname='reviews_Electronics_5.pkl', dir='/content/gdrive/Shareddrives/EC523_project/Data/')[:][:250000]

In [25]:
## split into sentences and ratings
print(data)
# print(torch.sum(torch.tensor(labels)==5))
# print(torch.sum(torch.tensor(labels)==4))
# print(torch.sum(torch.tensor(labels)==3))
# print(torch.sum(torch.tensor(labels)==2))
# print(torch.sum(torch.tensor(labels)==1))

         helpful  overall                                         reviewText
0         [0, 0]        5  We got this GPS for my husband who is an (OTR)...
1       [12, 15]        1  I'm a professional OTR truck driver, and I bou...
2       [43, 45]        3  Well, what can I say.  I've had this unit in m...
3        [9, 10]        2  Not going to write a long review, even thought...
4         [0, 0]        1  I've had mine for a year and here's what we go...
...          ...      ...                                                ...
249995    [0, 0]        5  I've owned this unit for 18 months and it work...
249996  [26, 52]        1  This product should work fine for anyone who h...
249997    [1, 2]        4  I hooked up the Samsung HDTV tuner to my exist...
249998    [0, 0]        5  I got this receiver because it sends the signa...
249999    [1, 1]        1  Everything was great until it started losing c...

[250000 rows x 3 columns]


In [26]:
def expand_contractions(text):
  expanded_words = []   
  for word in text.split():
    expanded_words.append(contractions.fix(word))  
    
  expanded_text = ' '.join(expanded_words)
  return " ".join(expanded_words)

def remove_punctuation(text):
  text = text.translate(str.maketrans('','',string.punctuation))
  return text

def lower_case(text):
  return text.lower()

def remove_stop_words(text):
  stop_words = set(stopwords.words('english'))
  
  word_tokens = word_tokenize(text)
  
  filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
  
  filtered_sentence = []
  
  for w in word_tokens:
      if w not in stop_words:
          filtered_sentence.append(w)

  return " ".join(filtered_sentence)

def num2word(text):
  for i in text.split():
    if i.isdigit():
      text = text.replace(i, n2w.convert(i))
  return text

def preprocessing(text):
  text = lower_case(text)
  text = expand_contractions(text)
  text = remove_punctuation(text)
  text = num2word(text)

  # Try both with and without stop words and evaluate performance difference
  # BERT performs better in context based textual input so keeping stop words might be helpful

  text = remove_stop_words(text)

  return text

text = "Hello my name is Aymane, and I like to sleep. I'll go to the movies tomorrow, because I want to see a film I haven't seen before. I won't see it tonight at 12"
preprocessing(text)

'hello name aymane like sleep go movies tomorrow want see film seen see tonight twelve'

In [27]:
for i in range(len(data)):
  data['reviewText'][i] = preprocessing(data['reviewText'][i])
  
labels = np.array(data['overall'])
sentences = np.array(data['reviewText'])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documenta

In [29]:
print(data)

         helpful  overall                                         reviewText
0         [0, 0]        5  got gps husband otr road trucker impressed shi...
1       [12, 15]        1  professional otr truck driver bought tnd seven...
2       [43, 45]        3  well say unit truck four days prior garmin 755...
3        [9, 10]        2  going write long review even thought unit dese...
4         [0, 0]        1  mine year got tries route non truck routes tel...
...          ...      ...                                                ...
249995    [0, 0]        5  owned unit eighteen months works perfectly goo...
249996  [26, 52]        1  product work fine anyone hooks using component...
249997    [1, 2]        4  hooked samsung hdtv tuner existing antenna cou...
249998    [0, 0]        5  got receiver sends signal tv hd older hdtv cam...
249999    [1, 1]        1  everything great started losing channels would...

[250000 rows x 3 columns]


In [31]:
## split to training, validation, and test (90,5,5) (but after unbiasing training data it is around 60 20 20)
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.1, random_state=42, stratify=labels)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
print(y_train.shape, y_val.shape, y_test.shape)
print(X_train.shape)

## getting unbiased data for training
training_points_per_category = 11000                       ## 50K for movies, 40K for CDs, 50K for electronicd
rev1 = X_train[y_train==1] [:training_points_per_category]
rev2 = X_train[y_train==2] [:training_points_per_category]
rev3 = X_train[y_train==3] [:training_points_per_category]
rev4 = X_train[y_train==4] [:training_points_per_category]
rev5 = X_train[y_train==5] [:training_points_per_category]

X_train = np.concatenate((rev1, rev2, rev3, rev4, rev5), axis=0)
base = torch.zeros(training_points_per_category)
y_train = torch.cat((base,base+1,base+2,base+3,base+4))

# perm = torch.randperm(5*training_points_per_category)
# X_train = X_train[perm]
# y_train = y_train[perm]
print(y_train.shape, y_val.shape, y_test.shape)

(225000,) (12500,) (12500,)
(225000,)
torch.Size([55000]) (12500,) (12500,)


In [32]:
## tokenizer
encoding = BertTokenizer.from_pretrained('bert-base-cased')

# encoding = BertTokenizerFast()

In [None]:
#bert_input = encoding(X_train[0],padding='max_length', max_length = 512, 
#                       truncation=True, return_tensors="pt").to(device)
#print(X_train[0])
#
#example_text = encoding.decode(bert_input.input_ids[0])
#print(example_text)
#print(bert_input)

In [34]:
## encode training, validation, and test data
num_of_words = 200

train_encoding = encoding(X_train.tolist(),padding='max_length', max_length = num_of_words, 
                       truncation=True, return_tensors="pt")
val_encoding = encoding(X_val.tolist(),padding='max_length', max_length = num_of_words, 
                       truncation=True, return_tensors="pt")
test_encoding = encoding(X_test.tolist(),padding='max_length', max_length = num_of_words, 
                       truncation=True, return_tensors="pt")

In [38]:
## store ratings to drive

dir='/content/gdrive/Shareddrives/EC523_project/ratings/'
# fname = 'reviews_Movies_and_TV_5.pkl'
#fname = 'reviews_CDs_and_Vinyl_5.pkl'
fname = 'reviews_Electronics_5.pkl'

pickle_reviews(y_train, 'training_' + fname, dir='/content/gdrive/Shareddrives/EC523_project/ratings/') 
pickle_reviews(y_val, 'testing_' + fname, dir='/content/gdrive/Shareddrives/EC523_project/ratings/') 
pickle_reviews(y_test, 'validating_' + fname, dir='/content/gdrive/Shareddrives/EC523_project/ratings/') 

In [39]:
## store encodings to google drive

# filename = 'reviews_Movies_and_TV_5.pkl'
#filename = 'reviews_CDs_and_Vinyl_5.pkl'
filename = 'reviews_Electronics_5.pkl'
pickle_encoding(train_encoding,fname='training__' + filename,dir='/content/gdrive/Shareddrives/EC523_project/encodings/')
pickle_encoding(val_encoding,fname='validating__' + filename, dir='/content/gdrive/Shareddrives/EC523_project/encodings/')
pickle_encoding(test_encoding,fname='testing__' + filename, dir='/content/gdrive/Shareddrives/EC523_project/encodings/')

In [40]:
## load previously saved training encodings and labels

# filename = 'reviews_Movies_and_TV_5.pkl'
#filename = 'reviews_CDs_and_Vinyl_5.pkl'
filename = 'reviews_Electronics_5.pkl'

train_encoding = unpickle_encoding(fname='training__' + filename, dir='/content/gdrive/Shareddrives/EC523_project/encodings/')
train_y = unpickle_reviews(fname='training_' + filename, dir='/content/gdrive/Shareddrives/EC523_project/ratings/')
training_points_per_category = 50000
print(train_encoding['input_ids'].shape)
#print(train_encoding)

torch.Size([55000, 200])


In [41]:
## load bert model
bert = BertModel.from_pretrained('bert-base-cased').to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
## Feed encodings through BERT model
x = torch.zeros(5*training_points_per_category,768)
a = torch.zeros(training_points_per_category//5,200,768)
#a, x = bert(input_ids=train_encoding['input_ids'].to(device), attention_mask=train_encoding['attention_mask'].to(device),return_dict=False)
import torch, gc

batch_size = 15

for i in range(5*training_points_per_category//batch_size//5):
  j = list(range(batch_size*i,batch_size*(i+1)))
  a_temp, x_temp = bert(input_ids=train_encoding['input_ids'][j,:].to(device), attention_mask=train_encoding['attention_mask'][j,:].to(device),return_dict=False)
  
  x[j,:] = x_temp.clone().detach().cpu()
  a[j,:,:] = a_temp.clone().detach().cpu()
  print(i)
  #gc.collect()
  with torch.no_grad():
    torch.cuda.empty_cache()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
## store outputs of BERT to drive
dir='/content/gdrive/Shareddrives/EC523_project/BERT/'
fname = 'reviews_Movies_and_TV_5.pkl'
#fname = 'reviews_CDs_and_Vinyl_5.pkl'
#fname = 'reviews_Electronics_5.pkl

pickle_reviews(x, 'BERT_training_' + fname, dir=dir) 

In [None]:
## load BERT output and labels
dir='/content/gdrive/Shareddrives/EC523_project/BERT/'
filename = 'reviews_Movies_and_TV_5.pkl'

X_train = unpickle_reviews(fname='BERT_training_' + filename, dir=dir)
print(X_train.shape)

y_train = unpickle_reviews(fname='training_' + filename, dir='/content/gdrive/Shareddrives/EC523_project/ratings/')
training_points_per_category = 50000
print(y_train.shape)

In [None]:
## Make the training set smaller
print(X_train[49999,0])
print(X_train[50000,0])
X_train = X_train[:50000,:].to(device)
y_train = y_train[:50000].to(device)
print(X_train.shape)
print(y_train.shape)

In [None]:
#word_length = list(map(
#    lambda x : len(x.split(" ")),
#    X_train
#))
#
#word_std  = np.std(word_length)
#word_mean = np.mean(word_length)
#
#word_skew = skew(word_length)
#word_kurt = kurtosis(word_length)
#
#plt.plot(sorted(word_length))
#plt.grid()
#print(word_std,word_mean)
#print(word_skew,word_kurt)

In [None]:
######### PYTORCH ###############

In [None]:
class Net(nn.Module):
    def __init__(self, output_dim=5,dropout=0.5):
        super(Net, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(768, 600)
        self.fc2 = nn.Linear(600, 500)
        self.fc3 = nn.Linear(500, 300)
        self.fc4 = nn.Linear(300, 50)
        self.fc5 = nn.Linear(50, output_dim)

    #def forward(self, input_id, mask):
    def forward(self, X_train):
        x = self.dropout(X_train)
        x = F.sigmoid(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        x = F.sigmoid(self.fc3(x))
        x = F.sigmoid(self.fc4(x))
        x = F.softmax(self.fc5(x))
        return x

net = Net(output_dim=5,dropout=0.2).to(device)

In [None]:
criterion = torch.nn.MSELoss().to(device)
#criterion = torch.nn.MSELoss().to(device)
optimizer = torch.optim.Adam(net.parameters(),lr=1e-3)

In [None]:
X_train = X_train.to(device)
y_train = y_train.long().to(device)
#y_training = torch.from_numpy(y_train)
#train_y = train_y.long()
#train_y = train_y.float()
#y_training = y_training-1

In [None]:
batch_size = 50
indecies = torch.tensor(range(batch_size))

losses = []
for epoch in range(20):
    running_loss = 0.0
    for i in range(len(y_train)//batch_size):
    #for i in range(1):
        select = batch_size * i + indecies

        outputs = net(X_train[select,:])

        loss = criterion(outputs,F.one_hot(y_train[select]).float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        # if i+1 % 10 == 0:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss ))
        
        losses.append(running_loss)
        running_loss = 0.0

print('Finished Training')
plt.plot(losses)

In [None]:
a = net(X_train)
b = torch.argmax(a,dim=1)
correct = sum(y_train==b)
correct / 50000

In [None]:
a = net(X_train[0:4,:])
print(a)
print(y_train[0:4])

In [None]:
  # running_acc = 0.0
# j = len(y_test)
# for i in range(len(y_test)):
#     guesses = net(test_encoding['input_ids'][i].type(torch.float))
#     print(guesses)
#     guesses = F.one_hot(torch.argmax(guesses),num_classes=5).type(torch.float)
#     current_real = F.one_hot(torch.tensor([y_test[i]], dtype=torch.long)-1,num_classes=5)
#     running_acc = running_acc + torch.all(torch.eq(torch.round(guesses),current_real))

# guesses = net(test_encoding['input_ids'].type(torch.float))
# print(guesses)
# guesses = torch.argmax(guesses,dim=1)
# current_real = torch.tensor(y_test)-1
# print(current_real==guesses)
# running_acc = torch.sum(current_real==guesses)


# print(running_acc)
# print(j)

# print('Accuracy: %d %%' % ((running_acc / j) * 100.0))



selects = torch.randint(0,1026,(20,))
accuracy(net,test_encoding['input_ids'][selects], test_encoding['attention_mask'][selects] ,y_test[selects])
accuracy(net, train_encoding['input_ids'][selects], train_encoding['attention_mask'][selects] ,y_test[selects])

In [None]:
print(torch.sum(torch.tensor(y_test)==5))
print(torch.sum(torch.tensor(y_test)==4))
print(torch.sum(torch.tensor(y_test)==3))
print(torch.sum(torch.tensor(y_test)==2))
print(torch.sum(torch.tensor(y_test)==1))