# Albert Model Test on Paraphrased Articles using GPT-3

In [1]:
!pip install ipython-autotime
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 6.7 MB/s 
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.18.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 6.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 51.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K   

In [2]:
%load_ext autotime

time: 449 µs (started: 2023-01-03 11:22:10 +00:00)


In [4]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive
time: 17.3 s (started: 2023-01-03 11:22:20 +00:00)


In [5]:
filepath="/content/drive/MyDrive/Success of AI Writers/github"

time: 804 µs (started: 2023-01-03 11:22:37 +00:00)


In [6]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
import transformers
import re
import gensim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

time: 8min 36s (started: 2023-01-03 11:22:37 +00:00)


In [7]:
model = keras.models.load_model(f"{filepath}/success-of-ai-writers/models/albert-base")

time: 21.6 s (started: 2023-01-03 11:31:13 +00:00)


In [8]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 8
labels = ["contradiction", "entailment", "neutral"]

time: 719 µs (started: 2023-01-03 11:31:35 +00:00)


In [9]:
# train_df.head()
class AlbertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.
    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.
    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.AlbertTokenizer.from_pretrained("albert-base-v2")
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

time: 4.87 ms (started: 2023-01-03 11:31:35 +00:00)


In [10]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = AlbertSemanticDataGenerator(
        sentence_pairs,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # print("booo model")
    proba = model.predict(test_data[0], verbose=0)[0]
    # print(labels[0],"->",proba[0],"\n",labels[1],"->",proba[1],"\n",labels[2],"->",proba[2])
    idx = np.argmax(proba)
    # proba = f"{proba[idx]: .2f}%"
    proba = proba[idx]
    pred = labels[idx]

    return pred, proba

time: 1.37 ms (started: 2023-01-03 11:31:35 +00:00)


In [11]:
def soft_cosine_sim(text1,text2):
  dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in [text1,text2]])
  similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)
  sent_1 = dictionary.doc2bow(simple_preprocess(text1))
  sent_2 = dictionary.doc2bow(simple_preprocess(text2))
  sentences = [sent_1, sent_2]
  return softcossim(sent_1, sent_2, similarity_matrix)

time: 1.22 ms (started: 2023-01-03 11:31:35 +00:00)


In [12]:
def paragraph_checker(paragraph_1, paragraph_2):
    total_similarity_score = 0
    similarity_list = []

    for id_o, o_line in enumerate(paragraph_1):
      max_similarity = 0

      if len(o_line) < 10:
          break
      
      for id_c, c_line in enumerate(paragraph_2):
        or_list = list()
        #print(o_line, "\n", c_line)

        if len(c_line) < 10:
          break

        if o_line == c_line:
          max_similarity = 1
          or_list.append((id_c, 100))
          similarity_list.append((id_o,or_list))
          break

        # Find a better similarit func or set threshold 
        if soft_cosine_sim(o_line,c_line)>0.764:
          plg_type, sm_ratio = check_similarity(o_line, c_line)
          # print(sm_ratio)
          if plg_type == "entailment":
            # print("yey entailment")
            or_list.append((id_c, sm_ratio))
            if sm_ratio > max_similarity:
              max_similarity = sm_ratio

        # print(or_list)
        # This method here cause of the chance to existence of more than one similar senctence 
        # Stop thinking about it, you already DID!
        if len(or_list) != 0:
          similarity_list.append((id_o,or_list))

      total_similarity_score += max_similarity
    return (total_similarity_score*100)/len(paragraph_1) , similarity_list

time: 5.77 ms (started: 2023-01-03 11:31:35 +00:00)


In [13]:
def semantic_checker(original_text, suspected_text):
  sentence_enders = re.compile("[.!?][\s]{1,2}(?=[A-Z0-9])")
  text_1 = sentence_enders.split(original_text)
  text_2 = sentence_enders.split(suspected_text)
  # print(len(text_1))
  # print(len(text_2))
  avg_sim_score, results = paragraph_checker(text_1, text_2)
  # print(avg_sim_score)
  # print(results)

  # return avg_sim_score,results
  return avg_sim_score

time: 1.51 ms (started: 2023-01-03 11:31:35 +00:00)


In [14]:
data_path="/content/drive/MyDrive/Success of AI Writers/github/success-of-ai-writers/data/external/paraphrased_articles.csv"

time: 673 µs (started: 2023-01-03 11:31:35 +00:00)


In [15]:
df_article = pd.read_csv(data_path)

time: 547 ms (started: 2023-01-03 11:31:35 +00:00)


In [16]:
df_article

Unnamed: 0,Title,Abstract,Introduction,ParaphrasedTitle,ParaphrasedAbstract,ParaphraseIntroduction,URL
0,Detecting Euphemisms with Literal Descriptions...,This paper describes our two-stage system1 for...,Recent advances in large pretrained language m...,Finding Euphemisms with Literal Interpretation...,Our two-stage system for the Euphemism Detecti...,Recent advances in large pretrained language m...,https://arxiv.org/pdf/2211.04576.pdf
1,Disentangling Content and Motion for Text-Base...,Giving machines the ability to imagine possibl...,Making desired edits on an image or video usin...,Separating Content and Motion for Neural Video...,Developing machines that can generate realisti...,Creating desired edits on images or videos usi...,https://arxiv.org/pdf/2211.02980.pdf
2,PERCEPTION-DISTORTION TRADE-OFF IN THE SR SPAC...,Flow-based generative super-resolution (SR) mo...,Deep-learning based super-resolution (SR) meth...,The Trade-off between Perception and Distortio...,Flow-based generative super-resolution (SR) mo...,Deep-learning based super-resolution (SR) meth...,https://arxiv.org/pdf/2209.08564.pdf
3,"""BE Y O N D T H E I M I TAT I O N G A M E : QU...",Language models demonstrate both quantitative ...,Generative language models have as their core ...,Language Models' Capabilities Quantified and E...,Language models show improvement and new capab...,Generative language models have the capability...,https://arxiv.org/pdf/2206.04615.pdf
4,Multi-Contrast MRI Synthesis with Channel-Exch...,Magnetic resonance imaging (MRI) is used in ma...,Magnetic resonance imaging (MRI) is used in ma...,Synthesizing MRI Images with Channel-Exchangin...,MRI has high soft-tissue contrast and is a non...,MRI is a non-invasive medical imaging method w...,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...
...,...,...,...,...,...,...,...
64,Articulation Prior in an Axial Representation,Local symmetry axis based schemes have been us...,Generic shape recognition demands representati...,Axial Representation's Precedence,"Axial representations, while providing articul...",Generic shape recognition requires representat...,https://web.cs.hacettepe.edu.tr/~erkut/publica...
65,Edge Strength Functions as Shape Priors in Ima...,Many applications of computer vision requires ...,"In many vision applications, one searches an o...",Shape Priors in Image Segmentation Utilizing E...,Motivated by the unlevel-sets formulation of R...,"In vision applications, the challenge of findi...",https://aykuterdem.github.io/papers/eet05.pdf
66,Vision-based continuous Graffit-like text entr...,"It is now possible to design real-time, low-co...",We address the problem of entering ASCII text ...,Continuous text input system based on vision s...,Recent advancements in electronics and the com...,We tackle the issue of entering ASCII text int...,http://repository.bilkent.edu.tr/bitstream/han...
67,COMPUTER VISlON BASED UNISTROKE KEYBOARD SYSTE...,"In this paper, a unistroke keyboard based on c...","In this paper, a unistroke keyboard and a mous...",System to Aid Handicapped with Computer Vision...,A unistroke keyboard is proposed for the handi...,A unistroke keyboard and mouse-like system bas...,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...


time: 32.6 ms (started: 2023-01-03 11:31:36 +00:00)


In [23]:
title_similarities=[]
abstract_similarities=[]
introduction_similarities=[]
for row in range(len(df_article)):
    title_similarities.append(semantic_checker(df_article.iloc[row].Title,df_article.iloc[row].ParaphrasedTitle))
    abstract_similarities.append(semantic_checker(df_article.iloc[row].Abstract,df_article.iloc[row].ParaphrasedAbstract))
    introduction_similarities.append(semantic_checker(df_article.iloc[row].Introduction,df_article.iloc[row].ParaphraseIntroduction))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

time: 53min 43s (started: 2023-01-03 14:48:03 +00:00)


In [24]:
result_similarities = [title_similarities,abstract_similarities,introduction_similarities]
df = pd.DataFrame (result_similarities).transpose()
df

Unnamed: 0,0,1,2
0,74.168676,90.878666,73.880762
1,82.629758,60.427921,79.406373
2,91.705072,87.436888,81.715534
3,86.557388,78.633422,17.44842
4,83.763075,73.821208,85.095527
5,86.038679,72.204836,44.445167
6,0.0,77.508845,33.756112
7,0.0,79.833903,6.220556
8,93.343014,57.533308,25.830829
9,0.0,69.261954,70.108865


time: 19.7 ms (started: 2023-01-03 15:41:50 +00:00)


In [25]:
df.columns = ['title_similarities', 'abstract_similarities',"introduction_similarities"]
df.to_csv("/albert_article_results.csv")

time: 15.5 ms (started: 2023-01-03 15:42:00 +00:00)
