# Roberta Model Test on Paraphrased Articles using GPT-3

In [1]:
!pip install ipython-autotime
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 35.6 MB/s 
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.18.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 11.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 60.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB

In [2]:
%load_ext autotime

time: 270 µs (started: 2023-01-03 16:11:45 +00:00)


In [4]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive
time: 39.4 s (started: 2023-01-03 16:11:49 +00:00)


In [3]:
filepath = "/content/drive/MyDrive/Success of AI Writers/github"

time: 357 µs (started: 2023-01-03 16:11:47 +00:00)


In [5]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
import transformers
import re
import gensim
from gensim.matutils import softcossim
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

fasttext_model300 = api.load("fasttext-wiki-news-subwords-300")

time: 7min 52s (started: 2023-01-03 16:12:29 +00:00)


#RoBERTa

In [6]:
model = keras.models.load_model(f"{filepath}/success-of-ai-writers/models/roberta-base")

time: 32.9 s (started: 2023-01-03 16:20:22 +00:00)


In [16]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x7fc6b91973d0>,
 <keras.engine.input_layer.InputLayer at 0x7fc6b9bfcb50>,
 <keras.engine.input_layer.InputLayer at 0x7fc6b9bfce20>,
 <keras.saving.saved_model.load.Custom>TFRobertaMainLayer at 0x7fc6b9bfcfa0>,
 <keras.layers.rnn.bidirectional.Bidirectional at 0x7fc74b182b20>,
 <keras.layers.pooling.global_average_pooling1d.GlobalAveragePooling1D at 0x7fc6b04c7ac0>,
 <keras.layers.pooling.global_max_pooling1d.GlobalMaxPooling1D at 0x7fc6b04e0280>,
 <keras.layers.merging.concatenate.Concatenate at 0x7fc6b04e0460>,
 <keras.layers.regularization.dropout.Dropout at 0x7fc6b04e0640>,
 <keras.layers.core.dense.Dense at 0x7fc6b04e0850>]

time: 3.77 ms (started: 2023-01-03 16:21:00 +00:00)


In [7]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 4
labels = ["contradiction", "entailment", "neutral"]

time: 606 µs (started: 2023-01-03 16:20:55 +00:00)


In [8]:
# train_df.head()
class RoBertaSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.
    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.
    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.RobertaTokenizer.from_pretrained(
            "roberta-base", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

time: 4.53 ms (started: 2023-01-03 16:20:55 +00:00)


In [9]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = RoBertaSemanticDataGenerator(
        sentence_pairs,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # print("booo model")
    proba = model.predict(test_data[0], verbose=0)[0]
    # print(labels[0],"->",proba[0],"\n",labels[1],"->",proba[1],"\n",labels[2],"->",proba[2])
    idx = np.argmax(proba)
    # proba = f"{proba[idx]: .2f}%"
    proba = proba[idx]
    pred = labels[idx]

    return pred, proba

time: 899 µs (started: 2023-01-03 16:20:55 +00:00)


In [10]:
def soft_cosine_sim(text1, text2):
    dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in [text1, text2]])
    similarity_matrix = fasttext_model300.similarity_matrix(
        dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100
    )
    sent_1 = dictionary.doc2bow(simple_preprocess(text1))
    sent_2 = dictionary.doc2bow(simple_preprocess(text2))
    sentences = [sent_1, sent_2]
    return softcossim(sent_1, sent_2, similarity_matrix)

time: 1.14 ms (started: 2023-01-03 16:20:55 +00:00)


In [11]:
def paragraph_checker(paragraph_1, paragraph_2):
    total_similarity_score = 0
    similarity_list = []

    for id_o, o_line in enumerate(paragraph_1):
        max_similarity = 0

        if len(o_line) < 10:
            break

        for id_c, c_line in enumerate(paragraph_2):
            or_list = list()
            # print(o_line, "\n", c_line)

            if len(c_line) < 10:
                break

            if o_line == c_line:
                max_similarity = 1
                or_list.append((id_c, 100))
                similarity_list.append((id_o, or_list))
                break

            # Find a better similarit func or set threshold
            if soft_cosine_sim(o_line, c_line) > 0.764:
                plg_type, sm_ratio = check_similarity(o_line, c_line)
                # print(sm_ratio)
                if plg_type == "entailment":
                    # print("yey entailment")
                    or_list.append((id_c, sm_ratio))
                    if sm_ratio > max_similarity:
                        max_similarity = sm_ratio

            # print(or_list)
            # This method here cause of the chance to existence of more than one similar senctence
            # Stop thinking about it, you already DID!
            if len(or_list) != 0:
                similarity_list.append((id_o, or_list))

        total_similarity_score += max_similarity
    return (total_similarity_score * 100) / len(paragraph_1), similarity_list

time: 1.61 ms (started: 2023-01-03 16:20:55 +00:00)


In [12]:
def semantic_checker(original_text, suspected_text):
    sentence_enders = re.compile("[.!?][\s]{1,2}(?=[A-Z0-9])")
    text_1 = sentence_enders.split(original_text)
    text_2 = sentence_enders.split(suspected_text)
    # print(len(text_1))
    # print(len(text_2))
    avg_sim_score, results = paragraph_checker(text_1, text_2)
    # print(avg_sim_score)
    # print(results)

    # return avg_sim_score,results
    return avg_sim_score

time: 1.12 ms (started: 2023-01-03 16:20:55 +00:00)


In [13]:
data_path = "/content/drive/MyDrive/Success of AI Writers/github/success-of-ai-writers/data/external/paraphrased_articles.csv"

time: 434 µs (started: 2023-01-03 16:20:55 +00:00)


In [14]:
df_article = pd.read_csv(data_path)

time: 642 ms (started: 2023-01-03 16:20:55 +00:00)


In [38]:
title_similarities = []
abstract_similarities = []
introduction_similarities = []
for row in range(20, 30):
    title_similarities.append(
        semantic_checker(
            df_article.iloc[row].Title, df_article.iloc[row].ParaphrasedTitle
        )
    )
    abstract_similarities.append(
        semantic_checker(
            df_article.iloc[row].Abstract, df_article.iloc[row].ParaphrasedAbstract
        )
    )
    introduction_similarities.append(
        semantic_checker(
            df_article.iloc[row].Introduction,
            df_article.iloc[row].ParaphraseIntroduction,
        )
    )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

time: 7min 26s (started: 2023-01-03 17:41:11 +00:00)


In [40]:
result_similarities = [
    title_similarities,
    abstract_similarities,
    introduction_similarities,
]
df = pd.DataFrame(result_similarities).transpose()
df

Unnamed: 0,0,1,2
0,85.10828,92.58523,66.356087
1,94.518346,84.812021,74.738876
2,93.051451,63.503592,43.846064
3,95.468861,73.130045,86.442584
4,91.540426,82.858285,63.163817
5,95.806181,79.320188,89.248628
6,98.119801,40.159011,68.372733
7,83.398646,66.068461,57.915738
8,0.0,83.66111,3.328791
9,0.0,90.118303,36.853301


time: 9.09 ms (started: 2023-01-03 17:54:51 +00:00)


In [41]:
df.columns = [
    "title_similarities",
    "abstract_similarities",
    "introduction_similarities",
]
df.to_csv("/roberta_article_results.csv")

time: 10 ms (started: 2023-01-03 17:54:54 +00:00)
