# AML NLP project

## Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
nltk.download('punkt_tab')
from nltk import tokenize
from matplotlib import pyplot as plt
import regex as re
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model, metrics
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Read data

Data can be found here : https://www.kaggle.com/datasets/thedrcat/daigt-v2-train-dataset/data

In [2]:
text_data = pd.read_csv("/datasets/data/train_v2_drcat_02.csv")

Let's keep the original prompts

In [3]:
seven_text_data = text_data[text_data['RDizzl3_seven'] == True]

In [4]:
# Remove the " " " in the prompt names
seven_text_data['prompt_name'] = seven_text_data['prompt_name'].str.replace('''"''','')
# seven_text_data['prompt_name'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seven_text_data['prompt_name'] = seven_text_data['prompt_name'].str.replace('''"''','')


## Data mining and Feature engineering

### Visualization

Class imbalance

In [5]:
seven_text_data.groupby('label').count()

Unnamed: 0_level_0,text,prompt_name,source,RDizzl3_seven
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,14250,14250,14250,14250
1,6200,6200,6200,6200


We notice a huge class imbalance : there are more handwritten text than AI generated

Add essay length column

In [6]:
# seven_text_data['length'] = seven_text_data['text'].str.len()
# seven_text_data.head()

In [7]:
# mapping = dict(zip(seven_text_data['prompt_name'].unique(), range(7)))
# seven_text_data['prompt_id'] = seven_text_data.replace(mapping)['prompt_name']


In [6]:
mapping_source = dict(zip(seven_text_data['source'].unique(), range(len(seven_text_data['source'].unique()))))
seven_text_data['source_id'] = seven_text_data.replace(mapping_source)['source']
sorted_seven_text_data = seven_text_data.groupby('source').count().sort_values("text", ascending=False)
# sns.barplot(sorted_seven_text_data,x = 'text', y = 'source', orient = 'h')
# plt.title("Number of essays by source")

  seven_text_data['source_id'] = seven_text_data.replace(mapping_source)['source']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seven_text_data['source_id'] = seven_text_data.replace(mapping_source)['source']


The sources are highly skewed so me might need to balance this later

In [7]:
def word_count(text):
    text_list = text.split()
    return len(text_list)


In [8]:
seven_text_data['word_count'] = seven_text_data['text'].apply(word_count)
# seven_text_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seven_text_data['word_count'] = seven_text_data['text'].apply(word_count)


In [9]:
seven_text_data['mean_word_length'] = seven_text_data['text'].apply(
    lambda x: np.mean([len(word) for word in x.split()])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seven_text_data['mean_word_length'] = seven_text_data['text'].apply(


In [12]:
seven_text_data['mean_sent_length'] = seven_text_data['text'].apply(
    lambda x: np.mean([len(sent) for sent in tokenize.sent_tokenize(x)])
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seven_text_data['mean_sent_length'] = seven_text_data['text'].apply(


In [11]:
def normalize(text):
    # Replace with whitespace to separate '😃\n\nFor'
    text = text.replace(r"\n", r" ")
    text = text.replace(r"\r", r" ")
    # Drop punctuation
    text = re.sub(r"\p{P}", " ", text)
    # Remove extra spaces from '😃  For' to '😃 For'
    text = re.sub(r"\s+", r" ", text)
    # Remove leading and trailing whitespace
    text = text.strip()
    return text

normalized_seven_text_data = seven_text_data.copy()
normalized_seven_text_data['text'] = seven_text_data['text'].apply(lambda x: normalize(x))


In [14]:
eval_data['text'] = eval_data['text'].apply(lambda x: normalize(x))

### Handling class imbalance 

#### Sub-sampling

In [15]:
normalized_seven_text_data.sum(numeric_only=True)['label']/normalized_seven_text_data.shape[0]

0.30317848410757947

Only 30% of the data is AI generated, so we remove instances of persuade corpus  to reach 50/50

In [16]:
dropped_persuade_index = normalized_seven_text_data[
    normalized_seven_text_data['source']=='persuade_corpus'
                                                    ].sample(n=8080, random_state=1).index

In [17]:
sub_sampled_text_data = normalized_seven_text_data.drop(dropped_persuade_index).reset_index(drop=True)

In [18]:
sub_sampled_text_data.sum(numeric_only=True)['label']/sub_sampled_text_data.shape[0]

0.5012126111560227

### TF-IDF

After going through the corpus we see \n to jump lines and  \ before ' that we have to manage for text processing.

In [19]:
corpus = sub_sampled_text_data['text']
vectorizer = TfidfVectorizer(max_features = 10000,
                            stop_words = 'english',
                            )

vectorizer2 = TfidfVectorizer(ngram_range=(1, 4),
                             tokenizer=lambda x: re.findall(r'[^\W]+', x),
                             stop_words='english',
                             token_pattern=None,
                             strip_accents='unicode',
                             sublinear_tf=True,
                             max_features=50000
                             )

X = vectorizer2.fit_transform(corpus,
                            )

In [20]:
X_eval = vectorizer2.transform(eval_data['text'])

In [21]:
features = vectorizer2.get_feature_names_out()

In [22]:
features

array(['0', '00', '000', ..., '\u200d ️', '️', '️ s'], dtype=object)

In [23]:
print("vectorized corpus dimensions : ", X.shape)
print("corpus dataset dimensions : ", seven_text_data.shape)

vectorized corpus dimensions :  (12370, 50000)
corpus dataset dimensions :  (20450, 9)


Each line of X represents an essay in the corpus, and each column represents a word

In [24]:
# X.mean(axis = 0)

## Machine learning task

In [25]:
y = sub_sampled_text_data['label'].values
y.shape

(12370,)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

### Logistic regression

In [27]:
reg = linear_model.LogisticRegression()

In [28]:
reg.fit(X_train,y_train)

In [29]:
def get_scores(classifier, X_test, y_test):
    metrics_dict = {}
    y_pred = classifier.predict(X_test)
    metrics_dict['f1'] = metrics.f1_score(y_test,y_pred)
    metrics_dict['auc'] = roc_auc_score(y_test,classifier.predict_proba(X_test)[:, 1])
    return metrics_dict

In [30]:
y_pred = reg.predict(X_test)

In [31]:
metrics.f1_score(y_test,y_pred)

0.9941596365996107

In [32]:
roc_auc_score(y_test,reg.predict_proba(X_test)[:, 1])

0.9999142854753602

### SVM

This method is extremely slow with a high number of features

In [33]:
# svc = SVC(gamma='auto',max_iter=1000, tol=1e-3, probability = True)
# svc.fit(X_train,y_train)

In [34]:
# svc.predict(X_test)

### Random Forest

In [35]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [36]:
get_scores(rf, X_test, y_test)

{'f1': 0.9908913467794405, 'auc': 0.9995760266927574}

### Gradient Boosting

In [37]:
# boost = XGBClassifier()
# boost.fit(X_train, y_train)


In [38]:
# get_scores(boost, X_test, y_test)

### Voting classifier

## Classifier test for Kaggle

In [39]:
rf = RandomForestClassifier()
rf.fit(X,y) # taking all data

In [40]:
print(X_eval)




In [41]:
y_eval_pred = rf.predict(X_eval)
y_eval_pred_proba = rf.predict_proba(X_eval)

In [42]:
y_eval_pred_proba[:,1]

array([0.21, 0.21, 0.21])

In [43]:
eval_data['id']

0    0000aaaa
1    1111bbbb
2    2222cccc
Name: id, dtype: object

In [44]:

d = {'id' : eval_data['id'], 'generated': y_eval_pred_proba[:,1]}
df_submission = pd.DataFrame(data=d)
df_submission.head()



Unnamed: 0,id,generated
0,0000aaaa,0.21
1,1111bbbb,0.21
2,2222cccc,0.21


In [45]:
df_submission.to_csv("submission.csv", index=False)