In [1]:
import numpy as np
import nltk
import pandas as pd
from datasets import load_dataset
import re
import string
from bs4 import BeautifulSoup
import sklearn
#import spacy

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\color\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\color\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\color\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
dataset = load_dataset('artem9k/ai-text-detection-pile')
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})

In [3]:
df = pd.DataFrame.from_dict(dataset['train'])
df.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Preprocessing

In [4]:
# functions for preprocessing
def remove_urls(text):
    return re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", " ", text) # regex taken from https://www.geeksforgeeks.org/python-check-url-string/

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_extra_whitespace(text):
    text = text.strip()
    text = " ".join(text.split())
    return text

def remove_stop_words(text):
    tokens = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

def lemmatizer(text):
    tokens = nltk.word_tokenize(text)
    l = nltk.stem.WordNetLemmatizer()
    tokens = [l.lemmatize(token) for token in tokens]
    return " ".join(tokens)

def tokenize_pre_process(text): # for preprocessing using this link: https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/
    # tokenize
    tokens = nltk.word_tokenize(text)

    # remove stop words
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]

    # remove top 10% most frequent words 
    fdist = nltk.FreqDist(tokens)
    tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]

    # stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # eliminate punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    return tokens

In [5]:
def preprocess_text(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)

    return text

In [6]:
def preprocess_text2(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)

    # lemmatize words
    text = lemmatizer(text)
    
    return text

## Feature Engineering

### TFIDF and Count Vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vec = CountVectorizer(max_df=0.9,min_df=0.1)
X = vec.fit_transform(df.text[:1000])

In [9]:
vec.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'about', 'above', 'access',
       'according', 'achieve', 'across', 'act', 'action', 'actions',
       'activities', 'activity', 'addition', 'additional', 'additionally',
       'address', 'affect', 'affected', 'affects', 'after', 'against',
       'age', 'al', 'all', 'allow', 'allowed', 'allows', 'almost',
       'already', 'also', 'although', 'always', 'america', 'american',
       'americans', 'among', 'an', 'analysis', 'another', 'any',
       'approach', 'approaches', 'appropriate', 'are', 'area', 'areas',
       'around', 'article', 'aspect', 'aspects', 'associated', 'at',
       'attention', 'author', 'available', 'avoid', 'back', 'based',
       'basis', 'be', 'became', 'because', 'become', 'becomes', 'been',
       'before', 'behavior', 'being', 'believe', 'benefits', 'best',
       'better', 'between', 'black', 'body', 'both', 'business', 'but',
       'by', 'can', 

In [10]:
vec2 = CountVectorizer(preprocessor=preprocess_text,max_df=0.9,min_df=0.1)
X2 = vec2.fit_transform(df.text[:1000])

In [11]:
vec2.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'access', 'according',
       'achieve', 'across', 'act', 'action', 'actions', 'activities',
       'activity', 'addition', 'additional', 'additionally', 'address',
       'affect', 'affected', 'affects', 'age', 'al', 'allow', 'allowed',
       'allows', 'almost', 'already', 'also', 'although', 'always',
       'america', 'american', 'americans', 'among', 'analysis', 'another',
       'approach', 'approaches', 'appropriate', 'area', 'areas', 'around',
       'article', 'aspect', 'aspects', 'associated', 'attention',
       'author', 'authors', 'available', 'avoid', 'back', 'based',
       'basis', 'became', 'become', 'becomes', 'behavior', 'believe',
       'benefits', 'best', 'better', 'body', 'business', 'care', 'case',
       'cases', 'cause', 'caused', 'causes', 'central', 'century',
       'certain', 'challenges', 'change', 'changes', 'characteristics',
       'children',

In [12]:
vec3 = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
X3 = vec3.fit_transform(df.text[:1000])

In [13]:
vec3.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'access', 'according',
       'account', 'achieve', 'across', 'act', 'action', 'activity',
       'addition', 'additional', 'additionally', 'address', 'advantage',
       'affect', 'affected', 'age', 'aim', 'al', 'allow', 'allowed',
       'allows', 'almost', 'already', 'also', 'although', 'always',
       'america', 'american', 'among', 'amount', 'analysis', 'another',
       'application', 'approach', 'appropriate', 'area', 'around',
       'article', 'aspect', 'assessment', 'associated', 'attention',
       'attitude', 'author', 'authority', 'available', 'avoid', 'back',
       'background', 'based', 'basis', 'became', 'become', 'becomes',
       'behavior', 'being', 'belief', 'believe', 'benefit', 'best',
       'better', 'black', 'body', 'book', 'business', 'care', 'case',
       'cause', 'caused', 'center', 'central', 'century', 'certain',
       'challenge', 'chance', '

In [14]:
vec4 = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1, ngram_range=(2,3))
X4 = vec4.fit_transform(df.text[:1000])
vec4.get_feature_names_out()

array(['al 2020', 'content introduction', 'covid 19', 'essay table',
       'essay table content', 'et al', 'et al 2020', 'research paper',
       'table content', 'table content introduction', 'united state',
       'work cited'], dtype=object)

## Doc2Vec

In [15]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [16]:
def doc2vec(data):
    '''
    https://www.geeksforgeeks.org/doc2vec-in-nlp/
    '''
    
    tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,doc in enumerate(data)]
    # train the Doc2vec model
    model = Doc2Vec(vector_size=20,
                    min_count=2, epochs=50)
    model.build_vocab(tagged_data)
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
     
    # get the document vectors
    document_vectors = [model.infer_vector(
        word_tokenize(doc.lower())) for doc in data]

    return document_vectors

In [32]:
# Reporting the proportion of samples that are ai generated
print('Percent ai:', round(df[df['source'] == 'ai'].shape[0]/df[df['source'] == 'human'].shape[0]*100, 3))

# Taking a stratified sample of 0.1% of the data
# maintaining same proportions of human and ai samples
data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))

Percent ai: 35.44


  data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))


In [20]:
data.text

1349157    First, go to your laptop browser, type “Contro...
1316016    The National Security Agency's vast spying net...
1358288                          And also why they’re India?
1263158    The Washington Post reports that "President Ob...
1053132     1. Create a balanced, nutritious meal plan.\n...
                                 ...                        
579497     I held down the level, the machine whirring as...
912361     Aric Veers wasn't a betting man, but if he was...
212068     Summit County\n\nPublic Shooting Range\n\nThe\...
894946     n He turned a brilliant shade of scarlet, grab...
Name: text, Length: 1392, dtype: object

In [24]:
preprocess_text2(data.text.iloc[0])

'first , go laptop browser , type control panel search box left , right hand corner , hit program search box right , right hand corner . next , type program search box right , right hand corner , hit uninstall program . , type program search box , right hand corner , hit uninstall . , type program search box , right hand corner , hit uninstall . , type program search box , right hand corner , hit uninstall search box . uninstall program .'

In [27]:
tex = []
for i in range(len(data.text)):
    tex += [preprocess_text2(data.text.iloc[i])]

  soup = BeautifulSoup(text, "html.parser")


In [29]:
len(tex)

1392

In [33]:
data.insert(len(data.columns),'prep_text', tex)
data

Unnamed: 0,source,id,text,prep_text
1349157,ai,1369119,"First, go to your laptop browser, type “Contro...","first , go laptop browser , type control panel..."
1316016,ai,1335978,The National Security Agency's vast spying net...,national security agency 's vast spying networ...
1358288,ai,1378250,And also why they’re India?,also theyre india ?
1263158,ai,1283120,"The Washington Post reports that ""President Ob...",washington post report `` president obama cons...
1053132,ai,1073094,"1. Create a balanced, nutritious meal plan.\n...","1. create balanced , nutritious meal plan . 2...."
...,...,...,...,...
579497,human,594841,"I held down the level, the machine whirring as...","held level , machine whirring coffee ground fe..."
912361,human,931073,"Aric Veers wasn't a betting man, but if he was...","aric veers n't betting man , , 'd laid ugly ba..."
212068,human,212068,Summit County\n\nPublic Shooting Range\n\nThe\...,summit county public shooting range summit cou...
786214,human,803606,nly and without warning - four months ago duri...,nly without warning - four month ago middle th...


## Naive Bayes

In [58]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import scipy.stats as stats

In [36]:
def CI(metric, confidence):
    a,b = stats.t.interval(confidence, 
                         len(metric)-1, 
                         loc=metric.mean(), 
                         scale=metric.std(ddof=1)/np.sqrt(len(metric)))
    return a,b

In [63]:
def evaluation(X_train, X_test, t_train, t_test, model, model_name, confidence=0.95, scoring='accuracy'):

    y_train = gnb.predict(X_train)
    y_test = gnb.predict(X_test)
    
    scores = cross_val_score(model,
                             X_train, 
                             t_train, 
                             scoring=scoring, 
                             cv=KFold(10, shuffle=True, random_state=0))
    
    a,b = CI(scores, confidence)
    
    print(f'==================={model_name} Performance=====================')
    print('95% CI = [', a, b, ']')
    print('Train: ', classification_report(t_train, y_train))
    print('Test: ', classification_report(t_test, y_test))

In [38]:
# Vectorize the documents (using preprocessed data)
v = doc2vec(data.prep_text)

# Set up the data and labels
X = np.array(v)
t = data.source
d = {'human' : 0, 'ai' : 1}
t = t.map(d, na_action='ignore')
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, random_state=0)

# Naive Bayes Classifier
gnb = GaussianNB()
gnb.fit(X_train, t_train)

In [64]:
evaluation(X_train, X_test, t_train, t_test, gnb, 'Naive Bayes')

95% CI = [ 0.7225758672572828 0.7832317260503108 ]
Train:                precision    recall  f1-score   support

           0       0.82      0.88      0.85       827
           1       0.56      0.43      0.49       286

    accuracy                           0.77      1113
   macro avg       0.69      0.66      0.67      1113
weighted avg       0.75      0.77      0.76      1113

Test:                precision    recall  f1-score   support

           0       0.80      0.90      0.85       201
           1       0.63      0.44      0.52        78

    accuracy                           0.77       279
   macro avg       0.72      0.67      0.68       279
weighted avg       0.76      0.77      0.76       279



In [40]:
X.shape

(1392, 20)

## Logistic Regression

In [48]:
lr_pipe = Pipeline([('log_reg', LogisticRegression(solver='saga'))])

param_grid1 = {'log_reg__C': [0.0001, 0.01, 0.1], 
               'log_reg__penalty':[None,'l1','l2']}

gs1 = GridSearchCV(lr_pipe, 
                   param_grid=param_grid1,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

In [49]:
gs1.fit(X_train, t_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




In [50]:
lr = gs1.best_estimator_

In [65]:
evaluation(X_train, X_test, t_train, t_test, lr, 'Logistic Regression')



95% CI = [ 0.7460033027651087 0.7975777783159725 ]
Train:                precision    recall  f1-score   support

           0       0.82      0.88      0.85       827
           1       0.56      0.43      0.49       286

    accuracy                           0.77      1113
   macro avg       0.69      0.66      0.67      1113
weighted avg       0.75      0.77      0.76      1113

Test:                precision    recall  f1-score   support

           0       0.80      0.90      0.85       201
           1       0.63      0.44      0.52        78

    accuracy                           0.77       279
   macro avg       0.72      0.67      0.68       279
weighted avg       0.76      0.77      0.76       279





## K-Nearest Neighbors

In [73]:
knn_pipe = Pipeline([('pca', PCA()),
                     ('knn', KNeighborsClassifier())])

param_grid2 = {'pca__n_components': [0.7, 0.8, 0.9], 
               'knn__n_neighbors': [5,7,9]}

gs2 = GridSearchCV(knn_pipe, 
                   param_grid=param_grid2,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

In [74]:
gs2.fit(X_train, t_train)
knn = gs2.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [75]:
evaluation(X_train, X_test, t_train, t_test, knn, 'KNN')

95% CI = [ 0.7669835543885813 0.8052654803604532 ]
Train:                precision    recall  f1-score   support

           0       0.82      0.88      0.85       827
           1       0.56      0.43      0.49       286

    accuracy                           0.77      1113
   macro avg       0.69      0.66      0.67      1113
weighted avg       0.75      0.77      0.76      1113

Test:                precision    recall  f1-score   support

           0       0.80      0.90      0.85       201
           1       0.63      0.44      0.52        78

    accuracy                           0.77       279
   macro avg       0.72      0.67      0.68       279
weighted avg       0.76      0.77      0.76       279



In [80]:
knn_pipe = Pipeline([('knn', KNeighborsClassifier())])

param_grid3 = {'knn__n_neighbors': [5,7,9]}

gs3 = GridSearchCV(knn_pipe, 
                   param_grid=param_grid3,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train, t_train)
knn = gs3.best_estimator_
evaluation(X_train, X_test, t_train, t_test, knn, 'KNN without PCA')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
95% CI = [ 0.7706235642519221 0.8053899492615915 ]
Train:                precision    recall  f1-score   support

           0       0.82      0.88      0.85       827
           1       0.56      0.43      0.49       286

    accuracy                           0.77      1113
   macro avg       0.69      0.66      0.67      1113
weighted avg       0.75      0.77      0.76      1113

Test:                precision    recall  f1-score   support

           0       0.80      0.90      0.85       201
           1       0.63      0.44      0.52        78

    accuracy                           0.77       279
   macro avg       0.72      0.67      0.68       279
weighted avg       0.76      0.77      0.76       279

