In [1]:
import numpy as np
import nltk
import pandas as pd
from datasets import load_dataset
import re
import string
from bs4 import BeautifulSoup
import sklearn
#import spacy

# Fetch the NLTK resources used by the preprocessing helpers in this notebook.
# NOTE(review): 'punkt_tab' is presumably the tokenizer data behind
# nltk.word_tokenize - confirm against the installed NLTK version.
nltk.download('punkt_tab')
# English stop-word list read through nltk.corpus.stopwords.words("english").
nltk.download('stopwords')
# WordNet data, presumably required by nltk.stem.WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexacole/nltk_data...


True

In [2]:
# Load the 'artem9k/ai-text-detection-pile' dataset with datasets.load_dataset.
# The last expression displays the returned object so its splits/features can be inspected.
dataset = load_dataset('artem9k/ai-text-detection-pile')
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})

In [3]:
# Convert the 'train' split to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion; the previous
# pd.DataFrame.from_dict(dataset['train']) handed the Dataset object to the
# DataFrame constructor, which builds the frame from per-row Python objects and
# is needlessly slow and memory-hungry for a split of this size.
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Preprocessing

In [7]:
# functions for preprocessing
def remove_urls(text):
    """Replace every URL-like substring of ``text`` with a single space.

    The pattern (case-insensitive) recognises strings that start with
    ``http://``/``https://``, with ``www.`` (optionally followed by up to three
    digits), or with a ``domain.tld/`` prefix.
    Regex taken from https://www.geeksforgeeks.org/python-check-url-string/

    Args:
        text: String to scan.

    Returns:
        str: ``text`` with each match replaced by ``" "``. Surrounding
        whitespace is left alone, so runs of spaces may remain.
    """
    url_pattern = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    return url_pattern.sub(" ", text)

def remove_html(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The input is parsed by BeautifulSoup using the "html.parser" backend and
    the result of ``get_text()`` on the parsed document is returned.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The string produced by ``BeautifulSoup.get_text()``.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_extra_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends.

    Args:
        text: String to normalise.

    Returns:
        str: The whitespace-separated words of ``text`` joined by single spaces
        (an empty string when ``text`` has no non-whitespace characters).
    """
    # str.split() with no separator splits on whitespace runs and already
    # discards leading/trailing whitespace, so no separate strip is needed.
    words = text.split()
    return " ".join(words)

def remove_stop_words(text):
    """Remove English stop words from ``text``.

    The text is tokenized with ``nltk.word_tokenize``; every token found in
    NLTK's English stop-word list is discarded and the remaining tokens are
    joined back together with single spaces.

    Tokens are compared exactly as tokenized (no case folding happens here),
    so callers that want case-insensitive filtering should lower-case first.

    Args:
        text: String to filter.

    Returns:
        str: Space-joined tokens that are not stop words.
    """
    tokens = nltk.word_tokenize(text)
    # Build a set once per call: membership tests are O(1) instead of a linear
    # scan of the stop-word list for every token. The local name also no longer
    # shadows the ``stopwords`` corpus name.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    kept = [token for token in tokens if token not in stop_words]
    return " ".join(kept)

def lemmatizer(text):
    """Lemmatize every token of ``text`` with NLTK's WordNet lemmatizer.

    The text is tokenized with ``nltk.word_tokenize``, each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the results
    are joined with single spaces.

    Args:
        text: String to lemmatize.

    Returns:
        str: Space-joined lemmatized tokens.
    """
    words = nltk.word_tokenize(text)
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    return " ".join(map(wordnet_lemmatizer.lemmatize, words))

def tokenize_pre_process(text, freq_ratio=0.1):
    """Tokenize ``text`` and return a filtered, stemmed token list.

    Based on https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/

    Steps, in order: tokenize, drop English stop words, drop over-frequent
    tokens, Porter-stem, drop punctuation-only tokens.

    Args:
        text: String to process.
        freq_ratio: A token is treated as over-frequent (and dropped) when its
            count reaches this share of all tokens left after stop-word
            removal. Defaults to 0.1.

    Returns:
        list[str]: The surviving stemmed tokens, in their original order.
    """
    # tokenize
    tokens = nltk.word_tokenize(text)

    # remove stop words (set lookup instead of scanning a list per token)
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # remove tokens that account for at least `freq_ratio` of the document.
    # The cutoff is floored at 2 occurrences: without the floor, any text with
    # ten or fewer tokens has a cutoff <= 1, so every token (count >= 1) was
    # dropped and the function returned an empty list. For longer texts the
    # floor never changes which tokens are removed.
    # NOTE(review): punctuation tokens are still counted here because they are
    # only filtered at the end; kept in this order to preserve existing results.
    fdist = nltk.FreqDist(tokens)
    cutoff = max(fdist.N() * freq_ratio, 2)
    tokens = [token for token in tokens if fdist[token] < cutoff]

    # stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # eliminate punctuation: drop tokens made up solely of punctuation
    # characters. The previous `token not in string.punctuation` was a
    # substring test against one long string, so multi-character punctuation
    # tokens such as "``", "''", "..." or "--" slipped through.
    punctuation = set(string.punctuation)
    tokens = [
        token for token in tokens
        if not all(char in punctuation for char in token)
    ]

    return tokens

In [8]:
def preprocess_text(text):
    """Clean raw text for vectorization.

    Applies, in order: dropping non-ASCII characters, lower-casing, HTML tag
    removal, URL removal, whitespace normalisation and English stop-word
    removal.

    Args:
        text: Raw string to clean.

    Returns:
        str: The cleaned text.
    """
    # Characters that cannot be encoded as ASCII are silently discarded.
    cleaned = text.encode('ascii', 'ignore').decode('ascii').lower()

    # Remaining steps are plain str -> str helpers applied one after another.
    pipeline = (
        remove_html,
        remove_urls,
        remove_extra_whitespace,
        remove_stop_words,
    )
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned

In [9]:
def preprocess_text2(text):
    """Clean ``text`` like ``preprocess_text`` and then lemmatize the result.

    The cleaning stage (ASCII-only, lower-case, HTML/URL removal, whitespace
    normalisation, stop-word removal) is delegated to ``preprocess_text``; the
    cleaned string is then passed through ``lemmatizer``.

    Args:
        text: Raw string to clean.

    Returns:
        str: The cleaned, lemmatized text.
    """
    # Reuse the shared cleaning pipeline instead of repeating its steps here,
    # so the two preprocessors cannot drift apart when a step changes.
    return lemmatizer(preprocess_text(text))

## Feature Engineering

### TFIDF and Count Vectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Baseline bag-of-words counts using CountVectorizer's default preprocessing.
# Float max_df/min_df are document-frequency proportions: terms appearing in more
# than 90% or in fewer than 10% of the documents are left out of the vocabulary.
# Only the first 1000 texts of df are used.
vec = CountVectorizer(max_df=0.9,min_df=0.1)
X = vec.fit_transform(df.text[:1000])

In [14]:
# Display the vocabulary terms kept by the baseline vectorizer.
vec.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'about', 'above', 'access',
       'according', 'achieve', 'across', 'act', 'action', 'actions',
       'activities', 'activity', 'addition', 'additional', 'additionally',
       'address', 'affect', 'affected', 'affects', 'after', 'against',
       'age', 'al', 'all', 'allow', 'allowed', 'allows', 'almost',
       'already', 'also', 'although', 'always', 'america', 'american',
       'americans', 'among', 'an', 'analysis', 'another', 'any',
       'approach', 'approaches', 'appropriate', 'are', 'area', 'areas',
       'around', 'article', 'aspect', 'aspects', 'associated', 'at',
       'attention', 'author', 'available', 'avoid', 'back', 'based',
       'basis', 'be', 'became', 'because', 'become', 'becomes', 'been',
       'before', 'behavior', 'being', 'believe', 'benefits', 'best',
       'better', 'between', 'black', 'body', 'both', 'business', 'but',
       'by', 'can', 

In [15]:
# Same thresholds and 1000-text sample as `vec`, but preprocess_text is passed as
# the preprocessor, which replaces CountVectorizer's built-in preprocessing stage
# (lower-casing / accent stripping); tokenization is still done by the vectorizer.
vec2 = CountVectorizer(preprocessor=preprocess_text,max_df=0.9,min_df=0.1)
X2 = vec2.fit_transform(df.text[:1000])

In [16]:
# Display the vocabulary obtained with preprocess_text as the preprocessor.
vec2.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'access', 'according',
       'achieve', 'across', 'act', 'action', 'actions', 'activities',
       'activity', 'addition', 'additional', 'additionally', 'address',
       'affect', 'affected', 'affects', 'age', 'al', 'allow', 'allowed',
       'allows', 'almost', 'already', 'also', 'although', 'always',
       'america', 'american', 'americans', 'among', 'analysis', 'another',
       'approach', 'approaches', 'appropriate', 'area', 'areas', 'around',
       'article', 'aspect', 'aspects', 'associated', 'attention',
       'author', 'authors', 'available', 'avoid', 'back', 'based',
       'basis', 'became', 'become', 'becomes', 'behavior', 'believe',
       'benefits', 'best', 'better', 'body', 'business', 'care', 'case',
       'cases', 'cause', 'caused', 'causes', 'central', 'century',
       'certain', 'challenges', 'change', 'changes', 'characteristics',
       'children',

In [17]:
# Same thresholds and 1000-text sample as `vec`, with preprocess_text2 (the
# variant that also lemmatizes) supplied as the preprocessor.
vec3 = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
X3 = vec3.fit_transform(df.text[:1000])

In [18]:
# Display the vocabulary obtained with preprocess_text2 as the preprocessor.
vec3.get_feature_names_out()

array(['10', '11', '12', '15', '19', '2017', '2018', '2019', '2020',
       '2021', '2022', 'ability', 'able', 'access', 'according',
       'account', 'achieve', 'across', 'act', 'action', 'activity',
       'addition', 'additional', 'additionally', 'address', 'advantage',
       'affect', 'affected', 'age', 'aim', 'al', 'allow', 'allowed',
       'allows', 'almost', 'already', 'also', 'although', 'always',
       'america', 'american', 'among', 'amount', 'analysis', 'another',
       'application', 'approach', 'appropriate', 'area', 'around',
       'article', 'aspect', 'assessment', 'associated', 'attention',
       'attitude', 'author', 'authority', 'available', 'avoid', 'back',
       'background', 'based', 'basis', 'became', 'become', 'becomes',
       'behavior', 'being', 'belief', 'believe', 'benefit', 'best',
       'better', 'black', 'body', 'book', 'business', 'care', 'case',
       'cause', 'caused', 'center', 'central', 'century', 'certain',
       'challenge', 'chance', '