### TPU Initialization

In [2]:
import tensorflow as tf

### Import of utilities

In [3]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [4]:
import os
import random
import re
import json  
import spacy
import joblib
import numpy as np
import pandas as pd
from time import time
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
#### Random seed
def reset_random_seeds(CUR_SEED=42):
    """Seed every random number generator used by the notebook.

    Args:
      CUR_SEED: integer seed written to `PYTHONHASHSEED` and passed to the
        TensorFlow, NumPy and stdlib `random` seeders.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this assignment may not affect the running kernel — confirm.
    os.environ.update(PYTHONHASHSEED=str(CUR_SEED))
    for seed_rng in (tf.random.set_seed, np.random.seed, random.seed):
        seed_rng(CUR_SEED)


reset_random_seeds()

In [6]:
from typing import Union, Callable, List
# Pick the module that provides `KerasTensor` (only used in the `TensorLike`
# alias below). The version is compared numerically: slicing the string
# (`tf.__version__[:3]`) turns "2.10" into "2.1", which sorts below "2.5".
_tf_major_minor = tuple(int(part) for part in tf.__version__.split(".")[:2])
if _tf_major_minor > (2, 5):
    try:
        from keras.engine import keras_tensor
    except ImportError:
        # The standalone package does not expose `keras.engine` here; fall
        # back to the copy bundled inside TensorFlow.
        from tensorflow.python.keras.engine import keras_tensor
else:
    from tensorflow.python.keras.engine import keras_tensor
    
# Scalar numeric types: Python built-ins plus NumPy floats, signed and
# unsigned integers.
Number = Union[
    float, int,
    np.float16, np.float32, np.float64,
    np.int8, np.int16, np.int32, np.int64,
    np.uint8, np.uint16, np.uint32, np.uint64,
]

# Anything the loss helpers accept as tensor input: nested Python
# containers, scalars, NumPy arrays, TensorFlow tensors/variables and
# symbolic Keras tensors.
TensorLike = Union[
    List[Union[Number, list]], tuple, Number,
    np.ndarray,
    tf.Tensor, tf.SparseTensor, tf.Variable,
    keras_tensor.KerasTensor,
]

In [7]:
def npairs_loss(y_true: TensorLike, y_pred: TensorLike) -> tf.Tensor:
    """Computes the npairs loss between `y_true` and `y_pred`.

    Npairs loss expects paired data where a pair is composed of samples from
    the same label and each pair in the minibatch has a different label.
    The loss takes each row of the pair-wise similarity matrix, `y_pred`,
    as logits and the remapped multi-class labels, `y_true`, as labels.

    The similarity matrix `y_pred` between two embedding matrices `a` and `b`
    with shape `[batch_size, hidden_size]` can be computed as follows:

    >>> a = tf.constant([[1, 2],
    ...                 [3, 4],
    ...                 [5, 6]], dtype=tf.float16)
    >>> b = tf.constant([[5, 9],
    ...                 [3, 6],
    ...                 [1, 8]], dtype=tf.float16)
    >>> y_pred = tf.matmul(a, b, transpose_a=False, transpose_b=True)
    >>> y_pred
    <tf.Tensor: shape=(3, 3), dtype=float16, numpy=
    array([[23., 15., 17.],
       [51., 33., 35.],
       [79., 51., 53.]], dtype=float16)>

    Note: the constants `a` and `b` are arbitrary and only illustrate the
    computation.

    See: http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

    Args:
      y_true: integer `Tensor` of multi-class labels with shape
        `[batch_size]` or `[batch_size, 1]`.
      y_pred: 2-D float `Tensor` with shape `[batch_size, batch_size]` of
        similarity matrix between embedding matrices.

    Returns:
      npairs_loss: float scalar.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)

    # Force a [batch_size, 1] column so both label layouts are accepted: a
    # 1-D vector would make the transpose below a no-op and break the
    # axis-1 reduction.
    y_true = tf.reshape(y_true, [-1, 1])
    # Entry (i, j) is 1 when samples i and j share a label, 0 otherwise.
    y_true = tf.cast(tf.equal(y_true, tf.transpose(y_true)), y_pred.dtype)
    # Normalize each row into a distribution over the matching samples.
    y_true /= tf.math.reduce_sum(y_true, 1, keepdims=True)

    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true)

    return tf.math.reduce_mean(loss)

In [8]:
# Load the essays to score; the rest of the notebook runs inference on every row.
complete_dataset = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv',sep=",")
# Bare expression in the middle of the cell: evaluated, but not displayed.
complete_dataset
# Only the raw text feeds the models; `id` is read again when the submission is built.
dataset = complete_dataset['text']

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [10]:
# NOTE(review): presumably raises when no GPU is available to spaCy — confirm
# that the execution environment always provides one.
spacy.require_gpu()
# English pipeline used by compute_docs_from_dataset to parse the essays.
english_model = spacy.load("en_core_web_sm")

#1: Lengths of sentences and paragraphs:
def calculate_average_sentence_length(list_of_doctexts):
    """Return the average sentence length of each document.

    The length of a sentence is `len(sent)` for every item of `doc.sents`.
    The average is computed per document; earlier revisions accumulated the
    totals across documents, which made each value depend on the documents
    that came before it.

    NOTE(review): a model fitted on the old cumulative feature should be
    retrained (or the training pipeline updated) to match this definition.

    Args:
      list_of_doctexts: iterable of parsed documents exposing `.sents`.

    Returns:
      A list with one float per document; 0.0 for a document without
      sentences.
    """
    list_of_lengths = []
    for doc in list_of_doctexts:
        sentence_lengths = [len(sent) for sent in doc.sents]
        if sentence_lengths:
            list_of_lengths.append(sum(sentence_lengths) / len(sentence_lengths))
        else:
            # Avoid ZeroDivisionError on documents with no sentences.
            list_of_lengths.append(0.0)
    return list_of_lengths

def calculate_average_paragraph_lengths(list_of_doctexts):
    """Return the average paragraph length, in tokens, of each document.

    A paragraph ends at every whitespace token whose text is exactly "\n";
    every other token (including other whitespace tokens) counts towards the
    current paragraph.

    NOTE(review): a whitespace token such as "\n\n" does not match the test
    above — confirm that this is the intended paragraph delimiter.

    Args:
      list_of_doctexts: iterable of token sequences whose tokens expose
        `.is_space` and `.text`.

    Returns:
      A list with one float per document.
    """
    averages = []
    for doc in list_of_doctexts:
        # The last entry is the paragraph currently being counted.
        tokens_per_paragraph = [0]
        for token in doc:
            if token.is_space and token.text == "\n":
                tokens_per_paragraph.append(0)
            else:
                tokens_per_paragraph[-1] += 1
        averages.append(sum(tokens_per_paragraph) / len(tokens_per_paragraph))
    return averages
    
#2. Use of Punctuation:
def calculate_punctuation_frequency(list_of_doctexts):
    list_of_frequencys = []
    for doc in list_of_doctexts:
        punctuation_frequency = len([token for token in doc if token.is_punct])
        list_of_frequencys.append(punctuation_frequency)
    return list_of_frequencys

#3. Stop-word ratio:
def calculate_stopword_ratio(list_of_doctexts):
    """Return, per document, stop-word tokens divided by alphabetic tokens.

    Args:
      list_of_doctexts: iterable of token sequences whose tokens expose
        `.is_alpha` and `.is_stop`.

    Returns:
      A list with one ratio per document; 0 when a document has no
      alphabetic token.
    """
    ratios = []
    for doc in list_of_doctexts:
        alpha_count = 0
        stop_count = 0
        for token in doc:
            if token.is_alpha:
                alpha_count += 1
            if token.is_stop:
                stop_count += 1
        if alpha_count > 0:
            ratios.append(stop_count / alpha_count)
        else:
            ratios.append(0)
    return ratios


In [11]:
def vectorize_dataset(dataset, vocabulary = None):
    """Fit a TF-IDF vectorizer on `dataset` and return its vocabulary and matrix.

    The vectorizer is fitted on `dataset` itself on every call, so the term
    weights are derived from the texts passed in.

    NOTE(review): scikit-learn presumably ignores `max_df`, `min_df` and
    `max_features` when a fixed `vocabulary` is supplied — confirm.

    Args:
      dataset: iterable of raw text documents.
      vocabulary: optional fixed vocabulary forwarded to `TfidfVectorizer`.

    Returns:
      A tuple `(vocabulary_, matrix)` with the fitted vocabulary and the
      TF-IDF document-term matrix.
    """
    started_at = time()
    tfidf = TfidfVectorizer(
        vocabulary=vocabulary,
        max_features=10000,
        stop_words="english",
        min_df=5,
        max_df=0.5,
        sublinear_tf=True,
    )
    doc_term_matrix = tfidf.fit_transform(dataset)

    # Elapsed seconds, then the matrix shape and its contents.
    print(time() - started_at)
    print(doc_term_matrix.shape)
    print(doc_term_matrix)

    return tfidf.vocabulary_, doc_term_matrix

In [12]:
# Vocabulary later passed to vectorize_dataset as the fixed TF-IDF vocabulary.
# The context manager closes the file even when json.load raises.
with open('/kaggle/input/svm-model/vocabulary.json') as f:
    data = json.load(f)

In [13]:
# Vectorize the essays against the vocabulary loaded from vocabulary.json.
vocabulary, csr_matrix = vectorize_dataset(dataset, data)

0.020565032958984375
(3, 10000)
  (0, 171)	1.0


In [14]:
# Wrap the TF-IDF matrix in a sparse-backed DataFrame; the statistics columns are appended to it later.
features_dataset = pd.DataFrame.sparse.from_spmatrix(csr_matrix)

In [15]:
def compute_docs_from_dataset(dataset):
    """Parse every text with the module-level spaCy pipeline.

    Prints the elapsed time in seconds.

    Args:
      dataset: iterable of raw texts.

    Returns:
      A list with one parsed document per input text.
    """
    started_at = time()
    parsed_docs = [doc for doc in english_model.pipe(dataset, batch_size=64)]
    print(time() - started_at)
    return parsed_docs

In [16]:
# Parse all essays once; the parsed documents feed every statistics function.
docs_from_dataset = compute_docs_from_dataset(dataset)

14.43764352798462


In [17]:
# dataset = dataset.reset_index().drop(["index"], axis=1)

In [18]:
def add_statistics_columns(features_dataset, dataset, docs_from_dataset):
    """Append the hand-crafted text statistics to `features_dataset` in place.

    Adds the columns `len_text`, `sentence_lengths`, `paragraph_lengths`,
    `punctuation_frequency` and `stopword_ratio`, converts all column labels
    to strings and prints the resulting frame.

    Args:
      features_dataset: DataFrame to extend; modified in place.
      dataset: pandas Series of raw texts, in the same row order as
        `features_dataset`.
      docs_from_dataset: parsed documents, in the same row order.

    Returns:
      None.
    """
    # Assign by position, like the list-valued columns below: a Series would
    # be aligned on its index and silently yield NaN when `dataset` does not
    # carry the same index as `features_dataset`.
    features_dataset['len_text'] = dataset.apply(len).to_numpy()
    features_dataset['sentence_lengths'] = calculate_average_sentence_length(docs_from_dataset)
    features_dataset['paragraph_lengths'] = calculate_average_paragraph_lengths(docs_from_dataset)
    features_dataset['punctuation_frequency'] = calculate_punctuation_frequency(docs_from_dataset)
    features_dataset['stopword_ratio'] = calculate_stopword_ratio(docs_from_dataset)

    # The TF-IDF columns are labelled with integers; make every label a string.
    features_dataset.columns = features_dataset.columns.astype(str)

    print(features_dataset)

In [19]:
# Extends features_dataset in place with the five statistics columns.
add_statistics_columns(features_dataset, dataset, docs_from_dataset)

     0    1    2    3    4    5    6    7    8    9  ...  9995  9996  9997  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   9998  9999  len_text  sentence_lengths  paragraph_lengths  \
0   0.0   0.0        12               4.0                4.0   
1   0.0   0.0        12               4.0                4.0   
2   0.0   0.0        12               4.0                4.0   

   punctuation_frequency  stopword_ratio  
0                      1             0.0  
1                      1             0.0  
2                      1             0.0  

[3 rows x 10005 columns]


In [20]:
import joblib
# NOTE(review): joblib.load unpickles the file, so it must only be pointed at trusted artifacts.
loaded_svm_model = joblib.load('/kaggle/input/svm-model/svm_model.joblib')
# Predict on the TF-IDF columns plus the statistics columns added above.
predictions_svm = loaded_svm_model.predict(features_dataset)



In [21]:
from transformers import TFBertModel, TFGPT2Model, AutoTokenizer
 
# Tokenizers are loaded from local Kaggle input folders rather than by hub name.
bert_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/pretrained-tokenizers/bert-base-cased')
gpt_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/pretrained-tokenizers/gpt2')

In [22]:
# Number of token ids per essay: tokenizer max_length and width of the model inputs.
tokens_length = 512
# Output width of the Dense layer created by add_projection_head.
projection_units = 64
# Adam learning rate used when the classifier and the final model are compiled.
learning_rate = 0.001
# Not referenced anywhere else in the visible notebook.
temperature = 0.05
# Dropout rate applied in create_classifier.
dropout_rate = 0.5
# Not referenced anywhere else in the visible notebook.
num_epochs = 10

In [23]:
# tokenize_dataset pads to max_length, which needs a pad token; when the
# tokenizer defines none, the literal string '0' is registered as pad token.
# NOTE(review): the checkpoints were presumably produced with this same pad token — keep it in sync.
if gpt_tokenizer.pad_token is None:
    gpt_tokenizer.add_special_tokens({'pad_token': '0'})

In [24]:
def tokenize_dataset(tokenizer, dataset):
    """Encode every text as a fixed-length sequence of token ids.

    Each text is padded or truncated to the module-level `tokens_length`
    and the result is returned as TensorFlow tensors.

    Args:
      tokenizer: tokenizer exposing `batch_encode_plus`.
      dataset: object exposing `to_list()` that yields the raw texts.

    Returns:
      Whatever `tokenizer.batch_encode_plus` returns for these options.
    """
    texts = dataset.to_list()
    encode_options = dict(
        add_special_tokens=True,
        max_length=tokens_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    return tokenizer.batch_encode_plus(texts, **encode_options)

In [25]:
# Tokenize the essays once per tokenizer.
bert_tokenization = tokenize_dataset(bert_tokenizer, dataset)
gpt_tokenization = tokenize_dataset(gpt_tokenizer, dataset)

# The keys match the Input layer names of the models built below. Only the
# token ids are forwarded; the attention masks requested by tokenize_dataset
# are not used.
dataset_input = {"bert_input_word_ids": bert_tokenization['input_ids'], 
               "gpt_input_word_ids": gpt_tokenization['input_ids']}

In [26]:
def build_model():
    """Build the joint BERT + GPT-2 encoder.

    Loads both pretrained transformers from local Kaggle input folders,
    feeds each one its own `tokens_length`-wide int32 token-id input and
    concatenates the first output of each model.

    Returns:
      A compiled `tf.keras.Model` taking
      `[bert_input_word_ids, gpt_input_word_ids]` and returning the
      concatenated encoder outputs.
    """
    bert_model = TFBertModel.from_pretrained('/kaggle/input/pretrained-models/bert-base-cased')
    gpt_model = TFGPT2Model.from_pretrained('/kaggle/input/pretrained-models/gpt2')
    # Renamed through the private `_name` attribute.
    # NOTE(review): presumably so both sub-models get stable, readable names — confirm.
    bert_model._name = "bert_patterns_recognizer"
    gpt_model._name = "gpt_patterns_recognizer"
    
    bert_input_word_ids = tf.keras.Input(shape=(tokens_length,), dtype=tf.int32, name="bert_input_word_ids")
    gpt_input_word_ids = tf.keras.Input(shape=(tokens_length,), dtype=tf.int32, name="gpt_input_word_ids")
    
    # Only token ids are passed (no attention mask); `[0]` keeps the first
    # output of each transformer.
    # NOTE(review): presumably the per-token hidden states — confirm.
    bert_embedding = bert_model([bert_input_word_ids])[0]
    gpt_embedding = gpt_model([gpt_input_word_ids])[0]

    # Concatenate with the layer's default axis.
    output = tf.keras.layers.Concatenate()([bert_embedding, gpt_embedding])
       
    model = tf.keras.Model(inputs=[bert_input_word_ids, gpt_input_word_ids], outputs=output)
    # Compiled without optimizer or loss.
    model.compile()  
    return model 

In [27]:
class SupervisedContrastiveLoss(tf.keras.losses.Loss):
    """Supervised contrastive loss built on `npairs_loss`.

    The feature vectors are L2-normalized, their pair-wise dot products are
    divided by `temperature`, and the resulting matrix is passed as logits
    to `npairs_loss` together with the labels.

    Args:
      temperature: divisor applied to the similarity matrix.
      name: optional name forwarded to `tf.keras.losses.Loss`.
    """

    def __init__(self, temperature=1, name=None):
        super().__init__(name=name)
        self.temperature = temperature

    def __call__(self, labels, feature_vectors, sample_weight=None):
        """Return the scalar loss for one batch.

        Args:
          labels: class labels of the batch, as accepted by `npairs_loss`.
          feature_vectors: 2-D tensor with one embedding per row.
          sample_weight: accepted so the loss can be called with or without
            weights; it is ignored.

        Returns:
          The scalar returned by `npairs_loss`.
        """
        normalized_feature_vectors = tf.math.l2_normalize(feature_vectors, axis=1)
        # Pair-wise similarities of the normalized embeddings, scaled by the temperature.
        logits = tf.divide(
            tf.matmul(normalized_feature_vectors, tf.transpose(normalized_feature_vectors)),
            self.temperature
        )

        return npairs_loss(labels, logits)

def add_projection_head(encoder):
    """Stack a ReLU projection layer on top of `encoder`.

    The encoder output is flattened and projected to `projection_units`
    dimensions. The encoder's `trainable` flag is not changed here.

    Args:
      encoder: model called on `[bert_input_word_ids, gpt_input_word_ids]`.

    Returns:
      A `tf.keras.Model` named "encoder-with-projection".
    """
    token_inputs = [
        tf.keras.Input(shape=(tokens_length,), dtype=tf.int32, name="bert_input_word_ids"),
        tf.keras.Input(shape=(tokens_length,), dtype=tf.int32, name="gpt_input_word_ids"),
    ]
    flat_features = layers.Flatten()(encoder(token_inputs))
    projection = tf.keras.layers.Dense(projection_units, activation="relu")(flat_features)
    return tf.keras.Model(token_inputs, projection, name="encoder-with-projection")

In [28]:
# Rebuild the encoder architecture, then restore the weights saved under training_1.
encoder = build_model()
encoder.load_weights('/kaggle/input/bert-gpt-model-checkpoint/training_1/cp.ckpt')

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /kaggle/input/pretrained-models/bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at /kaggle/input/pretrained-models/gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ae598177bb0>

In [29]:
def create_classifier(encoder):
    """Build the binary classifier on top of a frozen `encoder`.

    Every layer of `encoder` is marked non-trainable (the passed object is
    modified in place). The encoder output is flattened, passed through
    dropout with rate `dropout_rate` and mapped to one sigmoid unit.

    Args:
      encoder: model called on `[bert_input_word_ids, gpt_input_word_ids]`.

    Returns:
      A compiled `keras.Model` named "classifier-gpt-bert-representations".
    """
    # Freeze the encoder layers.
    for layer in encoder.layers:
        layer.trainable = False
     
    bert_input_word_ids = tf.keras.Input(shape=(tokens_length,), dtype=tf.int32, name="bert_input_word_ids")
    gpt_input_word_ids = tf.keras.Input(shape=(tokens_length,), dtype=tf.int32, name="gpt_input_word_ids")
    features = encoder([bert_input_word_ids, gpt_input_word_ids])
    features = layers.Flatten()(features)
    features = layers.Dropout(dropout_rate)(features)
    # Single sigmoid unit: one score in [0, 1] per essay.
    outputs = layers.Dense(1, activation="sigmoid")(features)

    model = keras.Model(inputs=[bert_input_word_ids, gpt_input_word_ids], outputs=outputs, name="classifier-gpt-bert-representations")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.AUC(), 
                 keras.metrics.Precision(), keras.metrics.Recall()],
    )
    return model

In [30]:
# Rebuild the classifier around the restored encoder, then restore the weights saved under training_2.
classifier = create_classifier(encoder)
classifier.load_weights('/kaggle/input/bert-gpt-model-checkpoint/training_2/cp.ckpt')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ae5806e3400>

In [31]:
# Sigmoid scores of the BERT + GPT-2 classifier, one row per essay.
predictions_classifier = classifier.predict(dataset_input, batch_size=64)
# Threshold at 0.5 to get hard 0/1 labels; the final model declares its inputs as int32.
predictions_classifier = [1 if pred > 0.5 else 0 for pred in predictions_classifier]



In [32]:
# Each upstream model contributes a single value per essay.
input_length = 1

def build_final_model():
    """Build the stacking model that combines the two upstream predictions.

    Takes the SVM prediction and the BERT + GPT-2 classifier prediction as
    two int32 inputs of width `input_length`, concatenates them and maps
    them to one sigmoid unit.

    Returns:
      A compiled `keras.Model` named "final-classifier".
    """
    input_svm = tf.keras.Input(shape=(input_length,), dtype=tf.int32, name="input_svm")
    input_pretrained = tf.keras.Input(shape=(input_length,), dtype=tf.int32, name="input_pretrained")
    concatenated_svm_pretrained = tf.keras.layers.Concatenate()([input_svm, input_pretrained])

    # Single sigmoid unit over the two concatenated predictions.
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(concatenated_svm_pretrained)
    
    model = keras.Model(inputs=[input_svm, input_pretrained], outputs=outputs, name="final-classifier")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.AUC(),keras.metrics.Precision(), keras.metrics.Recall()],
    )
    return model

In [33]:
# Rebuild the stacking model, then restore the weights saved under final_training.
final_model = build_final_model()
final_model.load_weights('/kaggle/input/final-model-training/final_training/cp.ckpt')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ae5802d6cb0>

In [34]:
# Wrap both prediction sequences in single-column DataFrames, one row per essay.
# Both names are rebound here, so re-running only this cell wraps them a second time.
predictions_svm = pd.DataFrame(predictions_svm)
predictions_classifier = pd.DataFrame(predictions_classifier)

In [35]:
# The keys match the Input layer names declared in build_final_model.
final_input = {
    "input_svm" : predictions_svm,
    "input_pretrained" : predictions_classifier
}

In [36]:
# Sigmoid output of the stacking model, one score per essay.
predictions_final = final_model.predict(final_input, batch_size=64)



In [37]:
# Assemble the submission: essay ids next to the final model's scores.
submission = pd.DataFrame(columns=['id', 'generated'])
submission['id'] = complete_dataset['id']
# `generated` holds the raw sigmoid scores from final_model.predict (not thresholded).
submission['generated'] = predictions_final

In [38]:
# Write the submission file without the DataFrame index.
submission.to_csv('/kaggle/working/submission.csv', index=False)