In [1]:
import numpy as np
import nltk
import pandas as pd
from datasets import load_dataset
import re
import string
from bs4 import BeautifulSoup
import sklearn

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import scipy.stats as stats
#import spacy

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\color\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\color\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\color\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
dataset = load_dataset('artem9k/ai-text-detection-pile')
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})

In [3]:
df = pd.DataFrame.from_dict(dataset['train'])
df.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


In [4]:
df.source.value_counts()

source
human    1028146
ai        364376
Name: count, dtype: int64

## Preprocessing

In [5]:
# functions for preprocessing
def remove_urls(text):
    return re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", " ", text) # regex taken from https://www.geeksforgeeks.org/python-check-url-string/

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_extra_whitespace(text):
    text = text.strip()
    text = " ".join(text.split())
    return text

def remove_stop_words(text):
    tokens = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

def lemmatizer(text):
    tokens = nltk.word_tokenize(text)
    l = nltk.stem.WordNetLemmatizer()
    tokens = [l.lemmatize(token) for token in tokens]
    return " ".join(tokens)

def tokenize_pre_process(text): # for preprocessing using this link: https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/
    # tokenize
    tokens = nltk.word_tokenize(text)

    # remove stop words
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]

    # remove top 10% most frequent words 
    fdist = nltk.FreqDist(tokens)
    tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]

    # stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # eliminate punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    return tokens

In [6]:
def preprocess_text(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)

    return text

In [7]:
def preprocess_text2(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)

    # lemmatize words
    text = lemmatizer(text)
    
    return text

## Feature Engineering

### Count Vectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
vec = CountVectorizer(max_df=0.9,min_df=0.1)
X = vec.fit_transform(df.text[:1000])

In [24]:
X.toarray().shape

(1000, 615)

In [9]:
vec.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'about', 'above', 'access',
       'according', 'achieve', 'across', 'act', 'action', 'actions',
       'activities', 'activity', 'addition', 'additional', 'additionally',
       'address', 'affect', 'affected', 'affects', 'after', 'against',
       'age', 'al', 'all', 'allow', 'allowed', 'allows', 'almost',
       'already', 'also', 'although', 'always', 'america', 'american',
       'americans', 'among', 'an', 'analysis', 'another', 'any',
       'approach', 'approaches', 'appropriate', 'are', 'area', 'areas',
       'around', 'article', 'aspect', 'aspects', 'associated', 'at',
       'attention', 'author', 'available', 'avoid', 'back', 'based',
       'basis', 'be', 'became', 'because', 'become', 'becomes', 'been',
       'before', 'behavior', 'being', 'believe', 'benefits', 'best',
       'better', 'between', 'black', 'body', 'both', 'business', 'but',
       'by', 'can', 

In [10]:
vec2 = CountVectorizer(preprocessor=preprocess_text,max_df=0.9,min_df=0.1)
X2 = vec2.fit_transform(df.text[:1000])

In [11]:
vec2.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'access', 'according',
       'achieve', 'across', 'act', 'action', 'actions', 'activities',
       'activity', 'addition', 'additional', 'additionally', 'address',
       'affect', 'affected', 'affects', 'age', 'al', 'allow', 'allowed',
       'allows', 'almost', 'already', 'also', 'although', 'always',
       'america', 'american', 'americans', 'among', 'analysis', 'another',
       'approach', 'approaches', 'appropriate', 'area', 'areas', 'around',
       'article', 'aspect', 'aspects', 'associated', 'attention',
       'author', 'authors', 'available', 'avoid', 'back', 'based',
       'basis', 'became', 'become', 'becomes', 'behavior', 'believe',
       'benefits', 'best', 'better', 'body', 'business', 'care', 'case',
       'cases', 'cause', 'caused', 'causes', 'central', 'century',
       'certain', 'challenges', 'change', 'changes', 'characteristics',
       'children',

In [12]:
vec3 = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
X3 = vec3.fit_transform(df.text[:1000])

In [13]:
vec3.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'access', 'according',
       'account', 'achieve', 'across', 'act', 'action', 'activity',
       'addition', 'additional', 'additionally', 'address', 'advantage',
       'affect', 'affected', 'age', 'aim', 'al', 'allow', 'allowed',
       'allows', 'almost', 'already', 'also', 'although', 'always',
       'america', 'american', 'among', 'amount', 'analysis', 'another',
       'application', 'approach', 'appropriate', 'area', 'around',
       'article', 'aspect', 'assessment', 'associated', 'attention',
       'attitude', 'author', 'authority', 'available', 'avoid', 'back',
       'background', 'based', 'basis', 'became', 'become', 'becomes',
       'behavior', 'being', 'belief', 'believe', 'benefit', 'best',
       'better', 'black', 'body', 'book', 'business', 'care', 'case',
       'cause', 'caused', 'center', 'central', 'century', 'certain',
       'challenge', 'chance', '

In [14]:
vec4 = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1, ngram_range=(2,3))
X4 = vec4.fit_transform(df.text[:1000])
vec4.get_feature_names_out()

array(['al 2020', 'content introduction', 'covid 19', 'essay table',
       'essay table content', 'et al', 'et al 2020', 'research paper',
       'table content', 'table content introduction', 'united state',
       'work cited'], dtype=object)

### TFIDF

In [10]:
def tfidf(data_train, data_test):
    tfidf = TfidfVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
    train = tfidf.fit_transform(data_train)
    test = tfidf.transform(data_test)
    return train, test 

### Doc2Vec

In [11]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [12]:
def doc2vec(data):
    '''
    https://www.geeksforgeeks.org/doc2vec-in-nlp/
    '''
    
    tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,doc in enumerate(data)]
    # train the Doc2vec model
    model = Doc2Vec(vector_size=20,
                    min_count=2, epochs=50)
    model.build_vocab(tagged_data)
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    return model

def get_vectors(doc2vec_model, data):
    # get the document vectors
    return [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in data]

### Experimentation with doc2vec

In [171]:
m=doc2vec(data.text)
m

<gensim.models.doc2vec.Doc2Vec at 0x22782e567f0>

In [174]:
v = get_vectors(m, data.text)

In [183]:
X_train, X_test, t_train, t_test = train_test_split(np.array(data.text), t, test_size=0.2, random_state=0)


In [188]:
X_train.shape

(1113,)

In [189]:
tex = []
for i in range(len(X_train)):
    tex += [preprocess_text2(X_train[i])]

  soup = BeautifulSoup(text, "html.parser")


In [191]:
m = doc2vec(tex)
X_train = np.array(get_vectors(m, tex))
X_train.shape

(1113, 20)

In [192]:
X_test = np.array(get_vectors(m, X_test))
X_test.shape

(279, 20)

## Sampling from the dataset

### Preserving the original proportions of humans and ai

In [13]:
# Reporting the proportion of samples that are ai generated
print('Percent ai:', round(df[df['source'] == 'ai'].shape[0]/df[df['source'] == 'human'].shape[0]*100, 3))

# Taking a stratified sample of 0.1% of the data
# maintaining same proportions of human and ai samples
data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))

Percent ai: 35.44


  data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))


### Randomly sampling 2000 samples each of human and ai sources

In [14]:
# Randomly sampling 2000 human and 2000 ai samples to create dataset with equal proportions
human = df[df.source == "human"]
human = human.sample(2000, random_state=0)
ai = df[df.source == "ai"].sample(2000, random_state=0)
equal = pd.concat([human,ai], ignore_index=True)
equal

Unnamed: 0,source,id,text
0,human,198049,Overview\n\nBatman and Psychology: A Dark and ...
1,human,12919,The Use of Psychedelic Drugs in Treating Depre...
2,human,979845,member the day like it was yesterday. \n My mo...
3,human,73499,"Legislative Branch Power, Its Limits and Expan..."
4,human,44380,Growth and Fall of Vader Corporation Report\n\...
...,...,...,...
3995,ai,1052501,"""Learn Python the Hard Way"" by Zed Shaw\n\t\t..."
3996,ai,1277472,Cape Town - The South African Humanist Associa...
3997,ai,1086015,"We are only days away from the 2016 NFL Draft,..."
3998,ai,1338694,The number of people infected with Zika virus ...


# Run Models

In [16]:
def CI(metric, confidence):
    a,b = stats.t.interval(confidence, 
                         len(metric)-1, 
                         loc=metric.mean(), 
                         scale=metric.std(ddof=1)/np.sqrt(len(metric)))
    return a,b

In [17]:
def evaluation(X_train, X_test, t_train, t_test, model, model_name, confidence=0.95, scoring='accuracy'):

    y_train = model.predict(X_train)
    y_test = model.predict(X_test)
    
    scores = cross_val_score(model,
                             X_train, 
                             t_train, 
                             scoring=scoring, 
                             cv=KFold(10, shuffle=True, random_state=0))
    
    a,b = CI(scores, confidence)
    
    print(f'==================={model_name} Performance=====================')
    print('95% CI = [', a, b, ']')
    print('Train: ', classification_report(t_train, y_train))
    print('Test: ', classification_report(t_test, y_test))

In [18]:
def run_models(data, features):
    # Convert labels into numeric
    t = data.source
    d = {'human' : 0, 'ai' : 1}
    t = t.map(d, na_action='ignore')

    # Split the data into training and test sets
    X_train, X_test, t_train, t_test = train_test_split(np.array(data.text), t, test_size=0.2, random_state=0)
    
    if features == 'tfidf':
        X_train, X_test = tfidf(X_train,X_test)
        X_train = X_train.toarray()
        X_test = X_test.toarray()
    elif features == 'doc2vec':
        train_prep = []
        test_prep = []
        for i in range(len(X_train)):
            train_prep += [preprocess_text2(X_train[i])]
        for i in range(len(X_test)):
            test_prep += [preprocess_text2(X_test[i])]
        
        m = doc2vec(train_prep)
        X_train = np.array(get_vectors(m, train_prep))
        X_test = np.array(get_vectors(m, test_prep))

    # Naive Bayes
    gnb = GaussianNB()
    gnb.fit(X_train, t_train)
    evaluation(X_train, X_test, t_train, t_test, gnb, 'Naive Bayes')

    # Logistic Regression
    lr_pipe = Pipeline([('log_reg', LogisticRegression(solver='saga'))])

    param_grid1 = {'log_reg__C': [0.0001, 0.01, 0.1], 
                   'log_reg__penalty':[None,'l1','l2']}
    
    gs1 = GridSearchCV(lr_pipe, 
                       param_grid=param_grid1,
                       cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                       scoring='accuracy',
                       verbose=1, 
                       n_jobs=-1, 
                       refit=True)
    
    gs1.fit(X_train, t_train)
    lr = gs1.best_estimator_
    evaluation(X_train, X_test, t_train, t_test, lr, 'Logistic Regression')

    # KNN with PCA
    knn_pipe = Pipeline([('pca', PCA()),
                     ('knn', KNeighborsClassifier())])

    param_grid2 = {'pca__n_components': [0.7, 0.8, 0.9], 
                   'knn__n_neighbors': [5,7,9]}
    
    gs2 = GridSearchCV(knn_pipe, 
                       param_grid=param_grid2,
                       cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                       scoring='accuracy',
                       verbose=1, 
                       n_jobs=-1, 
                       refit=True)
    
    gs2.fit(X_train, t_train)
    knn = gs2.best_estimator_
    evaluation(X_train, X_test, t_train, t_test, knn, 'KNN with PCA')

    # KNN without PCA
    knn_pipe = Pipeline([('knn', KNeighborsClassifier())])

    param_grid3 = {'knn__n_neighbors': [5,7,9]}
    
    gs3 = GridSearchCV(knn_pipe, 
                       param_grid=param_grid3,
                       cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                       scoring='accuracy',
                       verbose=1, 
                       n_jobs=-1, 
                       refit=True)
    
    gs3.fit(X_train, t_train)
    knn = gs3.best_estimator_
    evaluation(X_train, X_test, t_train, t_test, knn, 'KNN without PCA')

In [19]:
run_models(data, 'doc2vec')

  soup = BeautifulSoup(text, "html.parser")


95% CI = [ 0.7385418077168382 0.7889356697606396 ]
Train:                precision    recall  f1-score   support

           0       0.82      0.91      0.86       827
           1       0.61      0.41      0.49       286

    accuracy                           0.78      1113
   macro avg       0.71      0.66      0.68      1113
weighted avg       0.76      0.78      0.77      1113

Test:                precision    recall  f1-score   support

           0       0.76      0.97      0.85       201
           1       0.70      0.21      0.32        78

    accuracy                           0.75       279
   macro avg       0.73      0.59      0.58       279
weighted avg       0.74      0.75      0.70       279

Fitting 5 folds for each of 9 candidates, totalling 45 fits




95% CI = [ 0.7556454175284176 0.7987947369117369 ]
Train:                precision    recall  f1-score   support

           0       0.81      0.93      0.87       827
           1       0.65      0.37      0.47       286

    accuracy                           0.79      1113
   macro avg       0.73      0.65      0.67      1113
weighted avg       0.77      0.79      0.77      1113

Test:                precision    recall  f1-score   support

           0       0.77      0.94      0.85       201
           1       0.65      0.28      0.39        78

    accuracy                           0.76       279
   macro avg       0.71      0.61      0.62       279
weighted avg       0.74      0.76      0.72       279

Fitting 5 folds for each of 9 candidates, totalling 45 fits
95% CI = [ 0.7449804547782732 0.8166347318369134 ]
Train:                precision    recall  f1-score   support

           0       0.90      0.89      0.90       827
           1       0.70      0.72      0.71       28

In [20]:
run_models(data, 'tfidf')

  soup = BeautifulSoup(text, "html.parser")


95% CI = [ 0.6712399440273545 0.7373829645955542 ]
Train:                precision    recall  f1-score   support

           0       0.91      0.74      0.82       827
           1       0.51      0.80      0.62       286

    accuracy                           0.75      1113
   macro avg       0.71      0.77      0.72      1113
weighted avg       0.81      0.75      0.77      1113

Test:                precision    recall  f1-score   support

           0       0.85      0.73      0.78       201
           1       0.49      0.68      0.57        78

    accuracy                           0.71       279
   macro avg       0.67      0.70      0.68       279
weighted avg       0.75      0.71      0.72       279

Fitting 5 folds for each of 9 candidates, totalling 45 fits




95% CI = [ 0.7388230386006595 0.7903597155820948 ]
Train:                precision    recall  f1-score   support

           0       0.87      0.92      0.90       827
           1       0.73      0.60      0.66       286

    accuracy                           0.84      1113
   macro avg       0.80      0.76      0.78      1113
weighted avg       0.83      0.84      0.83      1113

Test:                precision    recall  f1-score   support

           0       0.79      0.84      0.81       201
           1       0.50      0.42      0.46        78

    accuracy                           0.72       279
   macro avg       0.64      0.63      0.63       279
weighted avg       0.71      0.72      0.71       279

Fitting 5 folds for each of 9 candidates, totalling 45 fits
95% CI = [ 0.6714867926303487 0.7031431944996385 ]
Train:                precision    recall  f1-score   support

           0       0.85      0.92      0.88       827
           1       0.69      0.55      0.61       28

In [22]:
run_models(equal, 'doc2vec')

  soup = BeautifulSoup(text, "html.parser")


95% CI = [ 0.6811504950323417 0.7350995049676583 ]
Train:                precision    recall  f1-score   support

           0       0.69      0.79      0.73      1593
           1       0.75      0.64      0.69      1607

    accuracy                           0.71      3200
   macro avg       0.72      0.71      0.71      3200
weighted avg       0.72      0.71      0.71      3200

Test:                precision    recall  f1-score   support

           0       0.67      0.85      0.75       407
           1       0.79      0.57      0.66       393

    accuracy                           0.71       800
   macro avg       0.73      0.71      0.70       800
weighted avg       0.73      0.71      0.71       800

Fitting 5 folds for each of 9 candidates, totalling 45 fits
95% CI = [ 0.7576572297165232 0.7898427702834767 ]
Train:                precision    recall  f1-score   support

           0       0.77      0.77      0.77      1593
           1       0.77      0.78      0.78      160

In [21]:
run_models(equal, 'tfidf')

  soup = BeautifulSoup(text, "html.parser")


95% CI = [ 0.7102528541931672 0.7359971458068328 ]
Train:                precision    recall  f1-score   support

           0       0.76      0.70      0.73      1593
           1       0.72      0.78      0.75      1607

    accuracy                           0.74      3200
   macro avg       0.74      0.74      0.74      3200
weighted avg       0.74      0.74      0.74      3200

Test:                precision    recall  f1-score   support

           0       0.74      0.68      0.71       407
           1       0.69      0.75      0.72       393

    accuracy                           0.71       800
   macro avg       0.72      0.71      0.71       800
weighted avg       0.72      0.71      0.71       800

Fitting 5 folds for each of 9 candidates, totalling 45 fits
95% CI = [ 0.7328868279134311 0.7596131720865691 ]
Train:                precision    recall  f1-score   support

           0       0.79      0.72      0.75      1593
           1       0.74      0.82      0.78      160