In [5]:
import gc
import sys

import numpy as np
import pandas as pd
import tqdm as notebook_tqdm
from datasets import Dataset
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast

In [7]:
# Load the full training corpus.
train = pd.read_csv("data/train_v2_drcat_02.csv")

# Extra label == 1 essays, sampled from the WHOLE corpus, so some of them may
# also be RDizzl3_seven rows (exact-text duplicates are removed in a later cell).
# - random_state makes the sample, and everything trained on it, reproducible.
# - min(...) keeps the cell from raising if fewer than 8000 such rows exist.
# NOTE(review): the notebook used to assign the RDizzl3_seven == False rows to
# `train1` and then immediately overwrite it; that dead assignment is removed.
# If the intent was to sample only from non-RDizzl3 rows, add that filter here.
generated = train[train["label"] == 1]
train1 = generated.sample(n=min(8000, len(generated)), random_state=42)

# Keep only the essays flagged RDizzl3_seven as the base training set.
train = train[train.RDizzl3_seven == True].reset_index(drop=True)

# Base set followed by the sampled rows (the sampled rows keep their original index).
df = pd.concat([train, train1])

In [9]:
# Competition files: the essays to score and the submission template.
# The notebook previously carried disabled reads of the Kaggle-hosted
# train_essays.csv and train_v2_drcat_02.csv under /kaggle/input/; neither is
# executed here.
test, sub = map(pd.read_csv, ('data/test_essays.csv', 'data/sample_submission.csv'))

In [10]:
# Inspect the combined training frame (RDizzl3_seven rows followed by the sampled label == 1 rows).
df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Cars have been around for awhile and they have...,0,Car-free cities,persuade_corpus,True
1,Have you ever thought what it would be like no...,0,Car-free cities,persuade_corpus,True
2,What you are about to read is going to give yo...,0,Car-free cities,persuade_corpus,True
3,cars have many flaws nd and in this day and ag...,0,Car-free cities,persuade_corpus,True
4,There are many advantages of limiting car usag...,0,Car-free cities,persuade_corpus,True
...,...,...,...,...,...
34775,A local workplace that I would like to pursue...,1,Distance learning,mistral7binstruct_v2,False
40920,Limiting car usage is essential for the well...,1,Car-free cities,NousResearch/Llama-2-7b-chat-hf,True
39664,"Sure, here's my attempt at writing an essay re...",1,Seeking multiple opinions,llama2_chat,False
30242,The idea of implementing a four-day work week...,1,Mandatory extracurricular activities,mistral7binstruct_v2,False


In [11]:
# Drop essays whose text appears more than once (first occurrence wins) and
# renumber the rows 0..n-1 in a single chained expression.
train = df.drop_duplicates(subset=['text']).reset_index(drop=True)

In [12]:
# Peek at the raw test essays as a NumPy array.
test["text"].to_numpy()

array(['Aaa bbb ccc.', 'Bbb ccc ddd.', 'CCC ddd eee.'], dtype=object)

In [13]:
# Tokenizer configuration.
VOCAB_SIZE = 50_000  # vocab_size handed to the BPE trainer
LOWERCASE = False    # when True, a Lowercase normalizer is added to the tokenizer

In [14]:
# Byte-Pair Encoding tokenizer whose vocabulary is learned from scratch.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization: always NFC, plus lower-casing when LOWERCASE is set.
# The conditional is parenthesized on purpose. Without the parentheses the
# expression parses as `([NFC] + [Lowercase]) if LOWERCASE else []`, which
# silently dropped NFC whenever LOWERCASE was False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens handed to the trainer together with the target vocabulary size.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Hugging Face dataset holding only the `text` column of the test essays;
# it is the corpus the tokenizer is trained on.
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter():
    """
    Yield the ``text`` column of the module-level ``dataset`` batch by batch.

    Each yielded item is ``dataset[start : start + 1000]["text"]``, i.e. up to
    1000 consecutive rows; the final batch may be shorter.
    """
    batch_size = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        yield dataset[start : start + batch_size]["text"]
        start += batch_size

# Learn the BPE vocabulary. REMEMBER: the tokenizer is trained on the test set,
# not on the training essays.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# registering the same special tokens that were given to the trainer.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize the test set with the new tokenizer (one list of tokens per essay).
tokenized_texts_test = [
    tokenizer.tokenize(text) for text in tqdm(test['text'].tolist())
]

# Tokenize the train set with the same tokenizer.
tokenized_texts_train = [
    tokenizer.tokenize(text) for text in tqdm(train['text'].tolist())
]

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 754.10it/s]
100%|███████████████████████████████████████████████████████████████████████████| 25624/25624 [02:25<00:00, 175.76it/s]


In [15]:
# Spot-check the tokens produced for the second test essay.
tokenized_texts_test[1]

['ĠBbb', 'Ġccc', 'Ġddd', '.']

In [16]:
def dummy(text):
    """
    Identity function: hand back ``text`` untouched.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` because
    the essays are already tokenized, so neither stage should alter them.
    """
    return text

In [18]:
# Settings shared by both vectorizers below: word-level 3- to 5-grams over the
# pre-tokenized essays, with `dummy` standing in for the tokenizer and the
# preprocessor because tokenization has already been done.
tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the test set only, to collect the n-grams that occur in it.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)

# Mapping of n-gram -> column index learned from the test set.
vocab = vectorizer.vocabulary_

print(vocab)

# Pass 2: a fresh vectorizer restricted to the test vocabulary, fitted on the
# train set and then used to transform both train and test.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is not needed once both matrices exist; release it.
del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


36

In [19]:
# Training targets as a NumPy array, in the same row order as `train`.
y_train = train['label'].to_numpy()

In [20]:
# Two classifiers over the TF-IDF features, combined by weighted soft voting.
bayes_model = MultinomialNB(alpha=0.02)

# modified_huber is one of the SGDClassifier losses that provides predict_proba,
# which soft voting requires. random_state pins the data shuffling so that
# re-running the notebook produces the same model.
sgd_model = SGDClassifier(
    max_iter=8000,
    tol=1e-4,
    loss="modified_huber",
    random_state=42,
)

# The SGD model carries 90% of the vote and naive Bayes the remaining 10%.
ensemble = VotingClassifier(
    estimators=[('sgd', sgd_model), ('nb', bayes_model)],
    weights=[0.9, 0.1],
    voting='soft',
    n_jobs=-1,
)
ensemble.fit(tf_train, y_train)


gc.collect()


26

In [24]:
# Soft-voting class probabilities for the test essays; the result has one
# column per class (two columns in the output displayed in the next cell).
final_preds = ensemble.predict_proba(tf_test)

In [25]:
# Inspect the predicted probabilities.
# NOTE(review): every row in the displayed output is identical. Presumably none
# of the placeholder test n-grams occur in the training essays, so every test
# vector is all zeros and the model returns its baseline score. Confirm.
final_preds

array([[0.55400209, 0.44599791],
       [0.55400209, 0.44599791],
       [0.55400209, 0.44599791]])

In [23]:
# `final_preds` holds one probability column per class, so assigning the whole
# 2-D array to a single DataFrame column fails. The submission needs one score
# per essay: the probability of label 1, which is the second column because
# scikit-learn orders the columns by sorted class label (0, 1).
sub['generated'] = final_preds[:, 1]
sub.to_csv('submission.csv', index=False)