In [1]:
import regex as re
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the training table and the test essays from the Kaggle input directory.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/train-v2-drcat-02-csv/train_v2_drcat_02.csv',
        '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv',
    ),
)

In [3]:
# Build the training set: every row flagged RDizzl3_seven, plus 8000 extra
# label-1 rows taken from the remaining (non-flagged) rows.
train_ = train[train.RDizzl3_seven == False].reset_index(drop=True)
# Sample from `train_`, not from the full `train`: re-filtering `train` here
# would discard the subset built on the previous line and could pick flagged
# rows that the concat below then adds a second time (duplicated essays).
# Note: `.sample` raises ValueError if fewer than 8000 label-1 rows exist in
# the non-flagged pool.
train_ = train_[train_["label"] == 1].sample(8000, random_state=8)
train = train[train.RDizzl3_seven == True].reset_index(drop=True)
# Row order matters later: flagged rows first, sampled rows after.
train = pd.concat([train, train_])
# Show the resulting class balance.
train['label'].value_counts()

label
0    14250
1    14200
Name: count, dtype: int64

# Data Preprocessing

In [4]:
%%time
def normalize(text):
    """Return a cleaned copy of ``text`` for vectorization.

    The literal two-character sequences backslash-n and backslash-r are
    replaced by a space, every Unicode punctuation character is replaced by
    a space, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is removed.
    """
    # The patterns are raw strings, so these match a backslash followed by
    # 'n' / 'r' — not the newline / carriage-return control characters (those
    # are whitespace and are collapsed by the \s+ step below).
    for escaped in (r"\n", r"\r"):
        text = text.replace(escaped, r" ")
    # Drop punctuation. \p{P} is a Unicode property class; it is available
    # because `re` in this notebook is the third-party `regex` module.
    without_punct = re.sub(r"\p{P}", " ", text)
    # Squeeze whitespace runs to one space, then trim both ends.
    return re.sub(r"\s+", r" ", without_punct).strip()

# Clean the essay text of both frames with the same normalizer
# (the function is passed directly; no wrapper lambda is needed).
train['text'] = train['text'].apply(normalize)
test['text'] = test['text'].apply(normalize)

CPU times: user 3.88 s, sys: 9.55 ms, total: 3.89 s
Wall time: 3.9 s


In [5]:
%%time
# Stack train texts followed by test texts so one transform covers both;
# the first len(train) rows of the resulting matrix belong to train.
df = pd.concat([train['text'], test['text']])

# TF-IDF over 3- and 4-grams of tokens produced by the custom tokenizer
# (token_pattern=None because the tokenizer callable replaces it).
# NOTE(review): the vocabulary is learned from the test texts only, so the
# feature count depends entirely on the test file — confirm this is intended.
vectorizer = TfidfVectorizer(
    ngram_range=(3, 4),
    tokenizer=lambda x: re.findall(r'[^\W]+', x),
    token_pattern=None,
    strip_accents='unicode',
).fit(test['text'])

X = vectorizer.transform(df)
X

CPU times: user 16.6 s, sys: 5.03 ms, total: 16.6 s
Wall time: 16.6 s


<28453x3 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

# Model Training

In [6]:
%%time
# Base learners: a default logistic regression and an SGD classifier with
# the modified-Huber loss.
sgd_model = SGDClassifier(loss="modified_huber", max_iter=5000, random_state=42)
lr_model = LogisticRegression()

# Soft-voting ensemble; the weights put 0.99 on the SGD model and 0.01 on
# the logistic regression.
ensemble = VotingClassifier(
    estimators=[('lr', lr_model), ('sgd', sgd_model)],
    voting='soft',
    weights=[0.01, 0.99],
)

# The first len(train) rows of X are the training essays (train was stacked
# ahead of test when the matrix was built).
ensemble.fit(X[:len(train)], train['label'])

CPU times: user 20.3 ms, sys: 1.98 ms, total: 22.3 ms
Wall time: 30.3 ms


In [7]:
# Rows of X after the training rows are the test essays; keep the
# probability of the second class (column 1) for each of them.
preds_test = ensemble.predict_proba(X[len(train):])[:, 1]

# Write the submission file: one row per test id with its predicted score.
pd.DataFrame(
    {'id': test["id"], 'generated': preds_test}
).to_csv('submission.csv', index=False)