In [1]:
# Mount Google Drive at /content/drive so the dataset and the files written later are reachable.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from tqdm import tqdm

#Data Loading

In [3]:
# Load the reviews CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv("/content/drive/MyDrive/Deep Learning/Reviews.csv")

In [4]:
# Preview the first five rows to check which columns were loaded.
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
# Number of reviews per star rating (Score), most frequent rating first.
data['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [6]:
# Order the reviews by ProductId (ascending). A new frame is returned and `data` is left
# untouched; every other sort_values option stays at its pandas default.
sorted_data = data.sort_values(by='ProductId')

In [7]:
# Collapse repeated reviews: rows sharing the same UserId, ProfileName, Time and Text are
# treated as one review, and only the first of them (in the ProductId order of
# `sorted_data`) is kept. `subset` is passed as a list because pandas documents it as a
# column label or a sequence of labels; a set is unordered and is not a sequence.
final = sorted_data.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"],
    keep='first',
)
final.shape

(393933, 10)

In [8]:
#final = final.sample(frac = 0.5)

#Separating Positive and Negative Reviews

In [9]:
# Split by star rating: Score above 3 is treated as positive, Score below 3 as negative.
# Rows with Score == 3 satisfy neither condition and end up in neither frame.
data_positive = final.loc[final['Score'] > 3]
data_negative = final.loc[final['Score'] < 3]

In [10]:
#data_positive = data_positive['Text']

In [11]:
#type(data_positive)

In [12]:
# Keep only the first 2000 rows of each class, in their current row order.
data_positive = data_positive.iloc[:2000]
data_negative = data_negative.iloc[:2000]

In [13]:
# Keep only the review text column; from here on both names hold a single column (a Series).
# NOTE(review): the names are rebound in place, so running this cell a second time without
# re-running the cells above it will fail on the 'Text' lookup.
data_positive = data_positive['Text']
data_negative = data_negative['Text']

#Removing Punctuation and Stop words

In [14]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    The two irregular forms ("won't", "can't") are rewritten first, followed by
    the generic suffix rules in the order listed below. Matching is
    case-sensitive and only recognises the plain ASCII apostrophe.
    """
    # (pattern, replacement) pairs, applied top to bottom. Order matters: the
    # irregular forms must be handled before the generic "n't" rule sees them.
    substitutions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [15]:
# Lower-case tokens that PreProcessing removes from every review.
# The set contains 'br' (presumably left over from <br /> markup in the reviews) and has
# no 'no', 'nor' or 'not' entries, so those negation words survive preprocessing.
# NOTE: this assignment rebinds the name `stopwords`, replacing the object imported
# from nltk.corpus in the imports cell.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
    'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [16]:
from bs4 import BeautifulSoup
from tqdm import tqdm
# tqdm is for printing the status bar
def PreProcessing(df):
  """Clean every review in ``df`` and return the cleaned strings as a list.

  Parameters
  ----------
  df : object exposing ``.values``
      Iterated through ``df.values``; each item is passed to ``re.sub`` and must
      therefore be a string (the notebook passes the 'Text' column here).

  Returns
  -------
  list of str
      One cleaned, lower-cased, space-separated string per input item, in the
      same order as the input.

  Each item goes through these steps, in order: strip URLs, strip HTML markup,
  expand contractions with ``decontracted``, drop any token containing a digit,
  replace every run of non-letters with one space, then lower-case and remove
  the entries of the module-level ``stopwords`` set.
  """
  preprocessed_reviews = []
  # tqdm only wraps the iteration to show a progress bar
  for sentance in tqdm(df.values):
    # remove urls from text python: https://stackoverflow.com/a/40823105/4084039
    sentance = re.sub(r"http\S+", "", sentance)
    # https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    # remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
    # Raw string: "\S" and "\d" are regex escapes, not Python string escapes, and
    # the non-raw spelling triggers an invalid-escape-sequence warning.
    sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
    # remove special characters: https://stackoverflow.com/a/5843547/4084039
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    # Only ASCII letters and spaces remain here, so lower-casing once up front
    # gives the same tokens as lower-casing each word separately.
    words = sentance.lower().split()
    preprocessed_reviews.append(' '.join(w for w in words if w not in stopwords))
  return preprocessed_reviews

In [17]:
# Clean both review sets; each call returns a list of cleaned strings and shows its own progress bar.
preprocessed_positive_reviews = PreProcessing(data_positive)
preprocessed_negative_reviews = PreProcessing(data_negative)

100%|██████████| 2000/2000 [00:01<00:00, 1906.47it/s]
100%|██████████| 2000/2000 [00:00<00:00, 2067.77it/s]


In [18]:
def DataJoin(input_list):
  """Concatenate an iterable of strings into one single-space-separated string.

  Parameters
  ----------
  input_list : iterable of str
      The pieces to concatenate, in order.

  Returns
  -------
  str
      All whitespace-separated words of all items, in order, joined by single
      spaces. Empty input (or input holding only blank strings) gives ``""``.

  Raises
  ------
  TypeError
      If any item is not a string (raised by ``str.join``).
  """
  # One join is enough: the original rebuilt the same joined string once per
  # element. split() followed by join collapses any run of whitespace and drops
  # leading and trailing blanks.
  return ' '.join(' '.join(input_list).split())

In [19]:
# Flatten each list of cleaned reviews into one long space-separated string.
positive_data_joined = DataJoin(preprocessed_positive_reviews)
negative_data_joined = DataJoin(preprocessed_negative_reviews)

In [20]:
# NOTE(review): both lines assign a name to itself, so this cell changes nothing and can be deleted.
positive_data_joined = positive_data_joined 
negative_data_joined = negative_data_joined

In [21]:
# Show only the first 500 characters; echoing the whole joined corpus would flood the cell output.
positive_data_joined[:500]



#Writing the Reviews into Text files

In [22]:
# Save the joined positive-review text to Drive. The `with` block closes the handle
# even if the write raises.
with open("/content/drive/MyDrive/Deep Learning/Positive_Reviews.txt", "w+") as file:
    file.write(positive_data_joined)

In [23]:
# Save the joined negative-review text to Drive. The `with` block closes the handle
# even if the write raises.
with open("/content/drive/MyDrive/Deep Learning/Negative_Reviews.txt", "w+") as file:
    file.write(negative_data_joined)

In [24]:
pip install happytransformer

Collecting happytransformer
  Downloading happytransformer-2.4.0-py3-none-any.whl (45 kB)
[?25l[K     |███████▏                        | 10 kB 23.7 MB/s eta 0:00:01[K     |██████████████▍                 | 20 kB 27.1 MB/s eta 0:00:01[K     |█████████████████████▋          | 30 kB 14.2 MB/s eta 0:00:01[K     |████████████████████████████▉   | 40 kB 10.1 MB/s eta 0:00:01[K     |████████████████████████████████| 45 kB 1.8 MB/s 
Collecting datasets>=1.6.0
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 9.0 MB/s 
[?25hCollecting transformers>=4.4.0
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.2 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-

#Training Happy Transformer

In [25]:
from happytransformer import HappyWordPrediction
# Two HappyWordPrediction models built with the library's default arguments, one per sentiment class.
# NOTE(review): which checkpoint the no-argument constructor loads is decided by happytransformer — confirm in its docs.
happy_wp_positive = HappyWordPrediction()
happy_wp_negative = HappyWordPrediction()
# Fine-tune the positive model on the positive-review text file; the negative model is trained in the next cell.
happy_wp_positive.train("/content/drive/MyDrive/Deep Learning/Positive_Reviews.txt")

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

12/09/2021 20:13:08 - INFO - happytransformer.happy_transformer -   Using model: cuda
12/09/2021 20:13:23 - INFO - happytransformer.happy_transformer -   Using model: cuda
12/09/2021 20:13:23 - INFO - happytransformer.happy_transformer -   Preprocessing dataset...


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-588d8d5b601d34fa/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-588d8d5b601d34fa/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (108854 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:13:25 - INFO - happytransformer.happy_transformer -   Training...
***** Running training *****
  Num examples = 212
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 636


Step,Training Loss
500,5.4002


Saving model checkpoint to /tmp/tmpj9g_toay/checkpoint-500
Configuration saved in /tmp/tmpj9g_toay/checkpoint-500/config.json
Model weights saved in /tmp/tmpj9g_toay/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmpj9g_toay/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmpj9g_toay/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [26]:
# Fine-tune the default-argument negative model on the negative-review text file.
happy_wp_negative.train("/content/drive/MyDrive/Deep Learning/Negative_Reviews.txt")

12/09/2021 20:15:24 - INFO - happytransformer.happy_transformer -   Preprocessing dataset...


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-a70679afb7bb1aec/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-a70679afb7bb1aec/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (98106 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:15:25 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 191
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 573


Step,Training Loss
500,5.3752


Saving model checkpoint to /tmp/tmpkhne3abu/checkpoint-500
Configuration saved in /tmp/tmpkhne3abu/checkpoint-500/config.json
Model weights saved in /tmp/tmpkhne3abu/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmpkhne3abu/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmpkhne3abu/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




#Training BERT Transformer

In [27]:
# Two word-prediction models of type "BERT" loaded from the bert-base-uncased checkpoint, one per sentiment class.
happy_wp_bert_positive = HappyWordPrediction("BERT", "bert-base-uncased")
happy_wp_bert_negative = HappyWordPrediction("BERT", "bert-base-uncased")
# Fine-tune the positive model on the positive-review text file; the negative model is trained in the next cell.
happy_wp_bert_positive.train("/content/drive/MyDrive/Deep Learning/Positive_Reviews.txt")

https://huggingface.co/bert-base-uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpoqvzcxoj


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
creating metadata file for /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "h

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f
creating metadata file for /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f
loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on a

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
creating metadata file for /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob":

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpxg3a3491


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
creating metadata file for /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://hugg

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (108854 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:17:40 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 212
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 636


Step,Training Loss
500,5.4793


Saving model checkpoint to /tmp/tmpzz2ylgl1/checkpoint-500
Configuration saved in /tmp/tmpzz2ylgl1/checkpoint-500/config.json
Model weights saved in /tmp/tmpzz2ylgl1/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmpzz2ylgl1/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmpzz2ylgl1/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [28]:
# Fine-tune the BERT negative model on the negative-review text file.
happy_wp_bert_negative.train("/content/drive/MyDrive/Deep Learning/Negative_Reviews.txt")

12/09/2021 20:21:03 - INFO - happytransformer.happy_transformer -   Preprocessing dataset...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (98106 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:21:04 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 191
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 573


Step,Training Loss
500,5.4009


Saving model checkpoint to /tmp/tmpiwblfy5l/checkpoint-500
Configuration saved in /tmp/tmpiwblfy5l/checkpoint-500/config.json
Model weights saved in /tmp/tmpiwblfy5l/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmpiwblfy5l/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmpiwblfy5l/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




#Training Albert Transformer

In [29]:
# Two word-prediction models of type "ALBERT" loaded from the albert-base-v2 checkpoint, one per sentiment class.
happy_wp_albert_positive = HappyWordPrediction("ALBERT", "albert-base-v2")
happy_wp_albert_negative = HappyWordPrediction("ALBERT", "albert-base-v2")
# Fine-tune each model on the text file of its own sentiment class (positive first, then negative).
happy_wp_albert_positive.train("/content/drive/MyDrive/Deep Learning/Positive_Reviews.txt")
happy_wp_albert_negative.train("/content/drive/MyDrive/Deep Learning/Negative_Reviews.txt")

https://huggingface.co/albert-base-v2/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp63vkl0g1


Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

storing https://huggingface.co/albert-base-v2/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88
creating metadata file for /root/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88
loading configuration file https://huggingface.co/albert-base-v2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88
Model config AlbertConfig {
  "_name_or_path": "albert-base-v2",
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

storing https://huggingface.co/albert-base-v2/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/bf1986d976e9a8320cbd3a0597e610bf299d639ce31b7ca581cbf54be3aaa6d3.d6d54047dfe6ae844e3bf6e7a7d0aff71cb598d3df019361e076ba7639b1da9b
creating metadata file for /root/.cache/huggingface/transformers/bf1986d976e9a8320cbd3a0597e610bf299d639ce31b7ca581cbf54be3aaa6d3.d6d54047dfe6ae844e3bf6e7a7d0aff71cb598d3df019361e076ba7639b1da9b
loading weights file https://huggingface.co/albert-base-v2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/bf1986d976e9a8320cbd3a0597e610bf299d639ce31b7ca581cbf54be3aaa6d3.d6d54047dfe6ae844e3bf6e7a7d0aff71cb598d3df019361e076ba7639b1da9b
All model checkpoint weights were used when initializing AlbertForMaskedLM.

All the weights of AlbertForMaskedLM were initialized from the model checkpoint at albert-base-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use 

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

storing https://huggingface.co/albert-base-v2/resolve/main/spiece.model in cache at /root/.cache/huggingface/transformers/10be6ce6d3508f1fdce98a57a574283b47c055228c1235f8686f039287ff8174.d6110e25022b713452eb83d5bfa8ae64530995a93d8e694fe52e05aa85dd3a7d
creating metadata file for /root/.cache/huggingface/transformers/10be6ce6d3508f1fdce98a57a574283b47c055228c1235f8686f039287ff8174.d6110e25022b713452eb83d5bfa8ae64530995a93d8e694fe52e05aa85dd3a7d
https://huggingface.co/albert-base-v2/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmptz133j74


Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

storing https://huggingface.co/albert-base-v2/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/828a43aa4b9d07e2b7d3be7c6bc10a3ae6e16e8d9c3a0c557783639de9eaeb1b.670e237d152dd53ef77575d4f4a6cd34158db03128fe4f63437ce0d5992bac74
creating metadata file for /root/.cache/huggingface/transformers/828a43aa4b9d07e2b7d3be7c6bc10a3ae6e16e8d9c3a0c557783639de9eaeb1b.670e237d152dd53ef77575d4f4a6cd34158db03128fe4f63437ce0d5992bac74
loading file https://huggingface.co/albert-base-v2/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/10be6ce6d3508f1fdce98a57a574283b47c055228c1235f8686f039287ff8174.d6110e25022b713452eb83d5bfa8ae64530995a93d8e694fe52e05aa85dd3a7d
loading file https://huggingface.co/albert-base-v2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/828a43aa4b9d07e2b7d3be7c6bc10a3ae6e16e8d9c3a0c557783639de9eaeb1b.670e237d152dd53ef77575d4f4a6cd34158db03128fe4f63437ce0d5992bac74
loading file https://huggingfac

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (111420 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:24:16 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 217
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 651


Step,Training Loss
500,7.9614


Saving model checkpoint to /tmp/tmp1i4of1go/checkpoint-500
Configuration saved in /tmp/tmp1i4of1go/checkpoint-500/config.json
Model weights saved in /tmp/tmp1i4of1go/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmp1i4of1go/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmp1i4of1go/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


12/09/2021 20:26:48 - INFO - happytransformer.happy_transformer -   Preprocessing dataset...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (99680 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:26:49 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 194
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 582


Step,Training Loss
500,7.9477


Saving model checkpoint to /tmp/tmp9t5yklla/checkpoint-500
Configuration saved in /tmp/tmp9t5yklla/checkpoint-500/config.json
Model weights saved in /tmp/tmp9t5yklla/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmp9t5yklla/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmp9t5yklla/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




#Training Roberta Transformer

In [30]:
# Two word-prediction models of type "ROBERTA" loaded from the roberta-base checkpoint, one per sentiment class.
happy_wp_roberta_positive = HappyWordPrediction("ROBERTA", "roberta-base")
happy_wp_roberta_negative = HappyWordPrediction("ROBERTA", "roberta-base")
# Fine-tune each model on the text file of its own sentiment class (positive first, then negative).
happy_wp_roberta_positive.train("/content/drive/MyDrive/Deep Learning/Positive_Reviews.txt")
happy_wp_roberta_negative.train("/content/drive/MyDrive/Deep Learning/Negative_Reviews.txt")

https://huggingface.co/roberta-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpwooyzqbt


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
creating metadata file for /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hid

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
creating metadata file for /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Robe

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
creating metadata file for /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
https://huggingface.co/roberta-base/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpgqylo0ts


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/roberta-base/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp04isornt


Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
creating metadata file for /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (108365 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:29:34 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 211
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 633


Step,Training Loss
500,4.853


Saving model checkpoint to /tmp/tmp6vc521vq/checkpoint-500
Configuration saved in /tmp/tmp6vc521vq/checkpoint-500/config.json
Model weights saved in /tmp/tmp6vc521vq/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmp6vc521vq/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmp6vc521vq/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


12/09/2021 20:33:20 - INFO - happytransformer.happy_transformer -   Preprocessing dataset...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (96902 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

12/09/2021 20:33:21 - INFO - happytransformer.happy_transformer -   Training...
PyTorch: setting up devices
***** Running training *****
  Num examples = 189
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 567


Step,Training Loss
500,4.873


Saving model checkpoint to /tmp/tmptdtyxjy5/checkpoint-500
Configuration saved in /tmp/tmptdtyxjy5/checkpoint-500/config.json
Model weights saved in /tmp/tmptdtyxjy5/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /tmp/tmptdtyxjy5/checkpoint-500/tokenizer_config.json
Special tokens file saved in /tmp/tmptdtyxjy5/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [34]:
def Processing_SearchQuery():
  """Prompt for a rating and a seed text and return both as strings.

  Returns:
    tuple: ``(rating, text)`` exactly as typed by the user, with ``'[MASK]'``
    appended to ``text``. If the user types the sentinel ``'0'`` as the text,
    a completion message is printed and ``'0'`` is returned without a mask.
  """
  rating = input('Enter the rating:')
  query = input('Enter the input:')
  if query == '0':
    # Sentinel: announce completion and hand the raw '0' back unmasked.
    print('Execution Completed....')
    return rating, query
  return rating, query + '[MASK]'
# Collect the (rating, masked text) pair interactively; the following cells read it as `x`.
x = Processing_SearchQuery()

Enter the rating:4
Enter the input:chocolate


In [35]:
# Echo the captured (rating, masked text) tuple to check what was entered.
x

('4', 'chocolate[MASK]')

In [36]:
# Route the masked query to the model family that matches the rating
# (above 3 -> positive-review models, below 3 -> negative-review models) and
# print the top_k=1 prediction of each model.
rating = float(x[0])
masked_query = x[1]

if rating > 3:
  positive_result = happy_wp_positive.predict_mask(masked_query, top_k=1)
  positive_result_bert = happy_wp_bert_positive.predict_mask(masked_query, top_k=1)
  positive_result_albert = happy_wp_albert_positive.predict_mask(masked_query, top_k=1)
  positive_result_roberta = happy_wp_roberta_positive.predict_mask(masked_query, top_k=1)
  labelled_results = [
    ('Happy transformer predicted words for positive review:', positive_result),
    ('Bert transformer predicted words for positive review:', positive_result_bert),
    ('Albert transformer predicted words for positive review:', positive_result_albert),
    ('Roberta transformer predicted words for positive review:', positive_result_roberta),
  ]
elif rating < 3:
  negative_result = happy_wp_negative.predict_mask(masked_query, top_k=1)
  negative_result_bert = happy_wp_bert_negative.predict_mask(masked_query, top_k=1)
  negative_result_albert = happy_wp_albert_negative.predict_mask(masked_query, top_k=1)
  negative_result_roberta = happy_wp_roberta_negative.predict_mask(masked_query, top_k=1)
  labelled_results = [
    ('Happy transformer predicted words for negative review:', negative_result),
    ('Bert transformer predicted words for negative review:', negative_result_bert),
    ('Albert transformer predicted words for negative review:', negative_result_albert),
    ('Roberta transformer predicted words for negative review:', negative_result_roberta),
  ]
else:
  # A rating of exactly 3 (or NaN) matches neither model family; say so
  # explicitly instead of finishing the cell with no output.
  labelled_results = []
  print('No prediction made: the rating must be above 3 (positive) or below 3 (negative).')

# All models are queried before anything is printed, so a failing model call
# never leaves a partially printed report.
for heading, predictions in labelled_results:
  print(heading)
  for item in predictions:
    print(item.token)

Happy transformer predicted words for positive review:
.
Bert transformer predicted words for positive review:
.
Albert transformer predicted words for positive review:
s
Roberta transformer predicted words for positive review:
 cake


#Results

#Calculating Accuracy for Positive Review Models

In [49]:
# A single, already pre-processed positive review; every positive-review model
# is scored on this same text so that their results are comparable.
positive = (
  "soup melody not found jingle accompany book get audio version "
  "immediately not sooner sing whoopy whoopy twice whoopy chicken soup "
  "rice correct oomph missing take chicken soup rice readings seriously"
)

In [50]:
# Split the sample positive review on whitespace into the word list walked by the evaluation cells.
list_p = positive.split()

In [59]:
# Evaluate happy_wp_positive on the sample positive review. For each adjacent
# word pair the model is asked to fill '<word>[MASK]'; the first returned
# candidate that is not punctuation is compared with the real next word via
# spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_p, list_p[1:]):
  try:
    candidates = happy_wp_positive.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 35.491142752714


In [63]:
# Evaluate happy_wp_bert_positive on the sample positive review. For each
# adjacent word pair the model is asked to fill '<word>[MASK]'; the first
# returned candidate that is not punctuation is compared with the real next word
# via spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_p, list_p[1:]):
  try:
    candidates = happy_wp_bert_positive.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 40.15090637882293


In [64]:
# Evaluate happy_wp_albert_positive on the sample positive review. For each
# adjacent word pair the model is asked to fill '<word>[MASK]'; the first
# returned candidate that is not punctuation is compared with the real next word
# via spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_p, list_p[1:]):
  try:
    candidates = happy_wp_albert_positive.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 29.009279201011555


In [81]:
# Evaluate happy_wp_roberta_positive on the sample positive review. For each
# adjacent word pair the model is asked to fill '<word>[MASK]'; the first
# returned candidate that is not punctuation is compared with the real next word
# via spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
# NOTE(review): the comparison is exact, so a token carrying surrounding
# whitespace would not match an entry here - verify against this model's output.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_p, list_p[1:]):
  try:
    candidates = happy_wp_roberta_positive.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 36.93070266035354


#Calculating Accuracy for Negative Review Models

In [82]:
# A single, already pre-processed negative review; every negative-review model
# is scored on this same text so that their results are comparable.
negative = (
  'still nothing get stuck actually saw second fly land watched '
  'flapped wings frantically within secs unstuck product total waste '
  'money could rate fly trap lower one star would think flies come miles'
)
# Whitespace-separated words of the review, walked pairwise by the evaluation cells.
list_n = negative.split()

In [109]:
# Evaluate happy_wp_negative on the sample negative review. For each adjacent
# word pair the model is asked to fill '<word>[MASK]'; the first returned
# candidate that is not punctuation is compared with the real next word via
# spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_n, list_n[1:]):
  try:
    candidates = happy_wp_negative.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 29.459740678026176


In [94]:
# Evaluate happy_wp_bert_negative on the sample negative review. For each
# adjacent word pair the model is asked to fill '<word>[MASK]'; the first
# returned candidate that is not punctuation is compared with the real next word
# via spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_n, list_n[1:]):
  try:
    candidates = happy_wp_bert_negative.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 31.831651918761533


In [98]:
# Evaluate happy_wp_albert_negative on the sample negative review. For each
# adjacent word pair the model is asked to fill '<word>[MASK]'; the first
# returned candidate that is not punctuation is compared with the real next word
# via spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
# This cell additionally filters '+' and '='.
ignored_tokens = {':', '"', '॥', '।', '|', '+', '=', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_n, list_n[1:]):
  try:
    candidates = happy_wp_albert_negative.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 23.091679408347098


In [107]:
# Evaluate happy_wp_roberta_negative on the sample negative review. For each
# adjacent word pair the model is asked to fill '<word>[MASK]'; the first
# returned candidate that is not punctuation is compared with the real next word
# via spaCy similarity. The figure printed as "accuracy" is the mean similarity x 100.
import spacy
# NOTE(review): 'en' is a shortcut model name; spaCy 3.x is understood to have
# dropped shortcut names - confirm the installed version before re-running.
nlp = spacy.load('en')

# Punctuation / sub-word marker tokens that must never be taken as the predicted word.
# NOTE(review): the comparison is exact, so a token carrying surrounding
# whitespace would not match an entry here - verify against this model's output.
ignored_tokens = {':', '"', '॥', '।', '|', "'", '-', ',', '##', '!', '.', '[', ']', ';', '..', '...', '?'}

sum_distance = 0
flag = 0     # word pairs that were scored
skipped = 0  # word pairs dropped (no usable candidate, or the model/spaCy call failed)
for current_word, next_word in zip(list_n, list_n[1:]):
  try:
    candidates = happy_wp_roberta_negative.predict_mask(current_word + '[MASK]', top_k=20)
    # First candidate, in returned order, that is an actual word; None if all were filtered.
    best_token = next((c.token for c in candidates if c.token not in ignored_tokens), None)
    if best_token is None:
      skipped += 1
      continue
    sum_distance += nlp(best_token).similarity(nlp(next_word))
    flag += 1
  except Exception:
    # Best-effort scoring: skip a failing pair, but (unlike a bare `except:`)
    # let KeyboardInterrupt/SystemExit propagate so the cell can be stopped.
    skipped += 1

if skipped:
  print("skipped pairs", skipped)
if flag:
  print("accuracy", (sum_distance/flag)*100)
else:
  # Avoid ZeroDivisionError when not a single pair could be scored.
  print("accuracy", "n/a (no word pair could be scored)")

accuracy 32.82194168140801
