In [9]:
import numpy as np
import nltk
import pandas as pd
from datasets import load_dataset
import re
import string
from bs4 import BeautifulSoup
#import spacy

In [11]:
dataset = load_dataset('artem9k/ai-text-detection-pile')
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})

In [12]:
df = pd.DataFrame.from_dict(dataset['train'])
df.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Reformat Dataset

In [15]:
df['source'].unique()

array(['human', 'ai'], dtype=object)

In [16]:
df['source'] = [1 if x == 'ai' else 0 for x in df['source']]

In [17]:
df.head()

Unnamed: 0,source,id,text
0,0,0,12 Years a Slave: An Analysis of the Film Essa...
1,0,1,20+ Social Media Post Ideas to Radically Simpl...
2,0,2,2022 Russian Invasion of Ukraine in Global Med...
3,0,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,0,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Data Exploration

In [19]:
df.groupby(['source']).size()

source
0    1028146
1     364376
dtype: int64

We have unequal samples for each class. We will most likely have to resample based on the methods we intend to do that may require equal class sizes. We can figure that out later.

#### TBD: Work on more data exploration focusing on the content of the text if time permits

In [14]:
# checking existance of any html tags. Reg expression does not detect just html tags so will not remove these tags for fear of losing valuable text within
# using beautiful soup to remove tags
count = 0
for idx, row in df.iterrows():
    if re.findall('<.*?>', row['text']) != []:
        print(re.findall('<[^>]+>', row['text']))
        count = count + 1
        if count == 20:
            break

['<LongWritable,Text,Text,IntWritable>', '<Text,IntWritable,Text,IntWritable>']
['<…>', '<…>']
['<10 w, 10-1kw, >', '<80°C, 80°- 500°C, >']
['< anArray [mid]). Base cases for binary search could be both first >']
['<95th) percentiles (“Georgia,” n.d.). Consequently, Georgia is ranked 14 out of 50 states with high obesity (“Georgia,” n.d.). Moreover, approximately 77% of children in Georgia have High BMI rates (Helland & Nordbotten, 2021). Nutrition assistance programs are considered healthy as they minimize the risk of increased body weight, overweight, or obesity. Reducing calorie-dense foods among African American and Hispanic communities will minimize excessive caloric intake.\n\nHealth Disparities and Inequalities in Georgia\n\nDisparities in access to healthy foods follow ethnic, racial, socio-economic status, and geographic location. The inexpensive nature of calorie-rich foods contributes to poor eating habits in urban areas. For example, in 2019, Georgian children living below 

In [25]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


for idx, row in df.iterrows():
    if re.findall('<.*?>', row['text']) != []:
        print(re.findall('<[^>]+>', row['text']))
        soup = BeautifulSoup(row['text'], "html.parser")
        print(soup.get_text() == row['text'])
        
        if count == 20:
            break

['<LongWritable,Text,Text,IntWritable>', '<Text,IntWritable,Text,IntWritable>']
False
['<…>', '<…>']
True
['<10 w, 10-1kw, >', '<80°C, 80°- 500°C, >']
True
['< anArray [mid]). Base cases for binary search could be both first >']
True
['<95th) percentiles (“Georgia,” n.d.). Consequently, Georgia is ranked 14 out of 50 states with high obesity (“Georgia,” n.d.). Moreover, approximately 77% of children in Georgia have High BMI rates (Helland & Nordbotten, 2021). Nutrition assistance programs are considered healthy as they minimize the risk of increased body weight, overweight, or obesity. Reducing calorie-dense foods among African American and Hispanic communities will minimize excessive caloric intake.\n\nHealth Disparities and Inequalities in Georgia\n\nDisparities in access to healthy foods follow ethnic, racial, socio-economic status, and geographic location. The inexpensive nature of calorie-rich foods contributes to poor eating habits in urban areas. For example, in 2019, Georgian c

In [15]:
# checking existance of any urls
count = 0
for idx, row in df.iterrows():
    if re.findall('https?://\S+|www\.\S+', row['text']) != []:
        print(re.findall('https?://\S+|www\.\S+', row['text']))
        count = count + 1
        if count == 20:
            break

  if re.findall('https?://\S+|www\.\S+', row['text']) != []:
  print(re.findall('https?://\S+|www\.\S+', row['text']))


['https://www.youtube.com/watch?v=j71Kmxv7smk', 'https://www.icas.com/students/learning-blog/test-of-competence/financial-accounting-whats-the-dealclip-with-debits-and-credits']
['www.intechopen.com.']
['https://doi.org/10.1108/JFC-04-2020-0055']
['https://www.nike.com/experiences/details/140585', 'https://www.facebook.com/nike/videos/353688522272944/', 'https://www.launchmetrics.com/resources/blog/nike-data-analysis']
['www.youth.gov,']
['www.aplaceformom.com.', 'www.hhs.gov']
['https://www.fireengineering.com/firefighting/a-guide-to-selecting-the-attack-line/#gref']
['https://www.youtube.com/watch?v=0jltioeaEyY']
['https://www.facebook.com/profile.php?id=100074386628222', 'https://twitter.com/account/access?did_not_receive=true']
['https://www.youtube.com/watch?v=Yqkt54B-JIc']
['www.redoliveculture.com.', 'www.americansforthearts.org.', 'www.indiegogo.com.']
['https://adoptioncouncil.org/', 'https://chsfl.org/']
['www.uschamber.com']
['https://www.cnbc.com/2021/01/22/countries-look-t

## Resample Data (current size of data causing pre-processing to take too long to execute)

In [40]:
# FIX THIS WITH ACCURATE RESAMPLING SIZE

'''from sklearn.utils import resample
df_downsample = resample(df,
             replace=True,
             n_samples=364376, # number of ai samples in the dataset
             random_state=42)

print(df_downsample.shape)'''

(364376, 3)


## Pre-processing

In [21]:
nltk.download('punkt_tab')
nltk.download('stopwords')

# functions for preprocessing
def remove_urls(text):
    return re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", " ", text) # regex taken from https://www.geeksforgeeks.org/python-check-url-string/

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_extra_whitespace(text):
    text = text.strip()
    text = " ".join(text.split())
    return text

def tokenize_pre_process(text): # for preprocessing using this link: https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/
    # tokenize
    tokens = nltk.word_tokenize(text)

    # remove stop words
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]

    # remove top 10% most frequent words 
    fdist = nltk.FreqDist(tokens)
    tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]

    # stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # eliminate punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def preprocess_text(df):
    # encoding to ascii
    df['text'] = df['text'].str.encode('ascii', 'ignore').str.decode('ascii')
    
    # convert text to lower case
    df['text'] = df['text'].str.lower()

    # remove html tags 
    df['text'] = df['text'].apply(remove_html)

    # remove urls 
    df['text'] = df['text'].apply(remove_urls)

    # remove extra whitespace
    df['text'] = df['text'].apply(remove_extra_whitespace)

    # tokenization and further normalization (removing punctuation, frequent words, stop words, and stemming
    df['text'] = df['text'].apply(tokenize_pre_process)

    return df

In [23]:
normalized_df = preprocess_text(df[:10000])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.encode('ascii', 'ignore').str.decode('ascii')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(remove_html)
A value is trying to be set on a copy of a slice fro

In [24]:
normalized_df

Unnamed: 0,source,id,text
0,0,0,"[12, year, slave, analysi, film, essay, 2013, ..."
1,0,1,"[20+, social, media, post, idea, radic, simpli..."
