In [1]:
"""
Name: Allegra Marsiglio
Class: CS688 Web Mining and Graph Analytics
Semester: SPRING 23
Project: TERM PROJECT
"""

import pandas as pd
import numpy as np

# LOAD DATA 
r1 = pd.read_csv("reviews1.csv")
r2 = pd.read_csv("reviews2.csv")
r3 = pd.read_csv("reviews3.csv")
r4 = pd.read_csv("reviews4.csv")
r5 = pd.read_csv("reviews5.csv")
r6 = pd.read_csv("reviews6.csv")

r1['label'] = np.where(r1['review rating'] < 3, 'negative', 'positive')


In [2]:
# DATA PREPROCESSING

import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

# Preprocess the text
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    words = nltk.word_tokenize(text)
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Join the words back into a string
    return ' '.join(words)

r1['review text'] = r1['review text'].astype(str)
r1['review text'] = r1['review text'].apply(preprocess_text)

# Convert the labels to numerical values
label_map = {'negative': 0, 'positive': 1}
r1['label'] = r1['label'].map(label_map)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# SPLIT r1 IN TRAINING AND TEST

from sklearn.model_selection import train_test_split

r1_train, r1_test= train_test_split(r1, test_size=0.30, random_state=42)



In [4]:
# LOAD the PRE-TRAINED BERT MODEL and TOKENIZER

!pip install transformers
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import torch

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', 
                                                              num_labels=2,
                                                              )


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [5]:
# PREPARE THE TRAINING DATA

# Tokenize the text and create input IDs and attention masks
train_encodings = tokenizer(r1_train['review text'].tolist(), truncation=True, 
                            padding=True, return_tensors='pt')
train_labels = torch.tensor(r1_train['label'].tolist())

test_encodings = tokenizer(r1_test['review text'].tolist(), truncation=True, 
                           padding=True, return_tensors='pt')
test_labels = torch.tensor(r1_test['label'].tolist())

# converting the labels into a one-hot encoded format
!pip install tensorflow.keras.utils
from tensorflow.keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=2)
test_labels = to_categorical(test_labels, num_classes=2)

# Create datasets
import tensorflow as tf
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                                    train_labels))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                                   test_labels))

In [6]:
# MODEL TRAINING AND OPTIMIZATION

import tensorflow as tf
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)

def categorical_crossentropy(y_true, y_pred):
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=True)
    return tf.reduce_mean(loss)

model.compile(optimizer=optimizer, loss=categorical_crossentropy, metrics=['accuracy'])

model.fit(train_dataset.shuffle(100).batch(16),
          epochs=3,
          batch_size=16,
          validation_data=test_dataset.shuffle(100).batch(16))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3b0c55d420>

In [7]:
# APPLY BERT TO OTHER FILES (1/4)

# tokenize
encodings2 = tokenizer(r2['review text'].astype(str).tolist(), truncation=True, 
                       padding=True, return_tensors='pt')
encodings3 = tokenizer(r3['review text'].astype(str).tolist(), truncation=True, 
                       padding=True, return_tensors='pt')
encodings4 = tokenizer(r4['review text'].astype(str).tolist(), truncation=True, 
                       padding=True, return_tensors='pt')
encodings5 = tokenizer(r5['review text'].astype(str).tolist(), truncation=True, 
                       padding=True, return_tensors='pt')
encodings6 = tokenizer(r6['review text'].astype(str).tolist(), truncation=True, 
                       padding=True, return_tensors='pt')


In [8]:
# APPLY BERT TO OTHER FILES (2/4)

# Convert the tokenized reviews into a TensorFlow dataset
dataset2 = tf.data.Dataset.from_tensor_slices(dict(encodings2))
dataset3 = tf.data.Dataset.from_tensor_slices(dict(encodings3))
dataset4 = tf.data.Dataset.from_tensor_slices(dict(encodings4))
dataset5 = tf.data.Dataset.from_tensor_slices(dict(encodings5))
dataset6 = tf.data.Dataset.from_tensor_slices(dict(encodings6))


In [9]:
# APPLY BERT TO OTHER FILES (3/4)

# Use the model.predict method to predict the labels of the reviews
output2 = model.predict(dataset2.batch(16))
output3 = model.predict(dataset3.batch(16))
output4 = model.predict(dataset4.batch(16))
output5 = model.predict(dataset5.batch(16))
output6 = model.predict(dataset6.batch(16))

preds2 = output2['logits']
preds3 = output3['logits']
preds4 = output4['logits']
preds5 = output5['logits']
preds6 = output6['logits']



In [10]:
# APPLY BERT TO OTHER FILES (4/4)

# Convert the predicted logits into class labels
labels2 = np.argmax(preds2, axis=1)
labels3 = np.argmax(preds3, axis=1)
labels4 = np.argmax(preds4, axis=1)
labels5 = np.argmax(preds5, axis=1)
labels6 = np.argmax(preds6, axis=1)

In [11]:
encodings1 = tokenizer(r1['review text'].astype(str).tolist(), truncation=True, 
                       padding=True, return_tensors='pt')

# Convert the tokenized reviews into a TensorFlow dataset
dataset1 = tf.data.Dataset.from_tensor_slices(dict(encodings1))

# Use the model.predict method to predict the labels of the reviews
output1 = model.predict(dataset1.batch(16))

preds1 = output2['logits']

# Convert the predicted logits into class labels
labels1 = np.argmax(preds2, axis=1)



In [12]:
l1_pos = len(labels1[labels1==1])/len(labels1)
l1_neg = len(labels1[labels1==0])/len(labels1)

print('Percentage of positive in l1:', round(l1_pos*100, 2), '%')
print('Percentage of negative in l1:', round(l1_neg*100, 2), '%', '\n')


l2_pos = len(labels2[labels2==1])/len(labels2)
l2_neg = len(labels2[labels2==0])/len(labels2)

print('Percentage of positive in l2:', round(l2_pos*100, 2), '%')
print('Percentage of negative in l2:', round(l2_neg*100, 2), '%', '\n')

l3_pos = len(labels3[labels3==1])/len(labels3)
l3_neg = len(labels3[labels3==0])/len(labels3)

print('Percentage of positive in l3:', round(l3_pos*100, 2), '%')
print('Percentage of negative in l3:', round(l3_neg*100, 2), '%', '\n')

l4_pos = len(labels4[labels4==1])/len(labels4)
l4_neg = len(labels4[labels4==0])/len(labels4)

print('Percentage of positive in l4:', round(l4_pos*100, 2), '%')
print('Percentage of negative in l4:', round(l4_neg*100, 2), '%', '\n')

l5_pos = len(labels5[labels5==1])/len(labels5)
l5_neg = len(labels5[labels5==0])/len(labels5)

print('Percentage of positive in l5:', round(l5_pos*100, 2), '%')
print('Percentage of negative in l5:', round(l5_neg*100, 2), '%', '\n')

l6_pos = len(labels6[labels6==1])/len(labels6)
l6_neg = len(labels6[labels6==0])/len(labels6)

print('Percentage of positive in l6:', round(l6_pos*100, 2), '%')
print('Percentage of negative in l6:', round(l6_neg*100, 2), '%', '\n')

Percentage of positive in l1: 78.95 %
Percentage of negative in l1: 21.05 % 

Percentage of positive in l2: 78.95 %
Percentage of negative in l2: 21.05 % 

Percentage of positive in l3: 79.48 %
Percentage of negative in l3: 20.52 % 

Percentage of positive in l4: 88.56 %
Percentage of negative in l4: 11.44 % 

Percentage of positive in l5: 76.05 %
Percentage of negative in l5: 23.95 % 

Percentage of positive in l6: 86.28 %
Percentage of negative in l6: 13.72 % 



In [13]:
best = max(l1_pos, l2_pos, l3_pos, l4_pos, l5_pos, l6_pos)

if best == l1_pos:
  print('The best product is product 1.')
elif best == l2_pos:
  print('The best product is product 2.')
elif best == l3_pos:
  print('The best product is product 3.')
elif best == l4_pos:
  print('The best product is product 4.')
elif best == l5_pos:
  print('The best product is product 5.')
elif best == l6_pos:
  print('The best product is product 6.')


The best product is product 4.
