In [1]:
import numpy as np
import nltk
import pandas as pd
from datasets import load_dataset
import re
import string
from bs4 import BeautifulSoup
import sklearn
#import spacy

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
dataset = load_dataset('artem9k/ai-text-detection-pile')
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})

In [3]:
df = pd.DataFrame.from_dict(dataset['train'])
df.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Reformat Dataset

In [4]:
df['source'].unique()

array(['human', 'ai'], dtype=object)

In [5]:
df['source'] = [1 if x == 'ai' else 0 for x in df['source']]

In [6]:
df.head()

Unnamed: 0,source,id,text
0,0,0,12 Years a Slave: An Analysis of the Film Essa...
1,0,1,20+ Social Media Post Ideas to Radically Simpl...
2,0,2,2022 Russian Invasion of Ukraine in Global Med...
3,0,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,0,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Preprocessing

In [7]:
# functions for preprocessing
def remove_urls(text):
    return re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", " ", text) # regex taken from https://www.geeksforgeeks.org/python-check-url-string/

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_extra_whitespace(text):
    text = text.strip()
    text = " ".join(text.split())
    return text

def remove_stop_words(text):
    tokens = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

def lemmatizer(text):
    tokens = nltk.word_tokenize(text)
    l = nltk.stem.WordNetLemmatizer()
    tokens = [l.lemmatize(token) for token in tokens]
    return " ".join(tokens)

# eliminate punctuation
def remove_punctuation(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return " ".join(tokens)

def tokenize_pre_process(text): # for preprocessing using this link: https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/
    # tokenize
    tokens = nltk.word_tokenize(text)

    # remove stop words
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]

    # remove top 10% most frequent words 
    fdist = nltk.FreqDist(tokens)
    tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]

    # stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # eliminate punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    return tokens

In [8]:
def preprocess_text(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)
    
    # remove punctuation
    text = remove_punctuation(text)

    return text

In [9]:
def preprocess_text2(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)
    
    # remove punctuation
    text = remove_punctuation(text)

    # lemmatize words
    text = lemmatizer(text)
    
    return text

## Feature Engineering

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from gensim.models import Word2Vec
from gensim.models import FastText
import gensim.downloader
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [11]:
# Calculate document embeddings
# Extracted on Google
def get_doc_embedding(doc, model):
    vectors = [model.wv[word.lower()] for word in doc.split() if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [12]:
def get_doc_embedding_glove(doc, model):
    vectors = [model[word.lower()] for word in doc.split() if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [40]:
# functions for features

def count_vectorizer(data_train, data_test):
    vec = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
    train = vec.fit_transform(data_train)
    test = vec.transform(data_test)
    return train, test

def tfidf(data_train, data_test):
    tfidf = TfidfVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
    train = tfidf.fit_transform(data_train)
    test = tfidf.transform(data_test)
    return train, test 

def word2vec_skipgram(data_train, data_test):
    # preprocess and tokenize samples
    train_samples = data_train.text.apply(preprocess_text2)
    train_samples2 = train_samples.apply(word_tokenize)
    test_samples = data_test.text.apply(preprocess_text2)
    
    # skipgram word2vec model
    model1 = Word2Vec(sentences=train_samples2, min_count=1,vector_size=200, window=5,sg = 1)
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding(texts, model1) for texts in train_samples])
    test = np.array([get_doc_embedding(texts, model1) for texts in test_samples])
    
    return train, test 

def word2vec_cbow(data_train, data_test):
    # preprocess and tokenize samples
    train_samples = data_train.text.apply(preprocess_text2)
    train_samples2 = train_samples.apply(word_tokenize)
    test_samples = data_test.text.apply(preprocess_text2)
    
    # cbow word2vec model
    model1 = Word2Vec(sentences=train_samples2, min_count=1,vector_size=200, window=5,sg=0)
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding(texts, model1) for texts in train_samples])
    test = np.array([get_doc_embedding(texts, model1) for texts in test_samples])
    
    return train, test 

def fast_text(data_train, data_test):
    # preprocess and tokenize samples
    train_samples = data_train.text.apply(preprocess_text2)
    train_samples2 = train_samples.apply(word_tokenize)
    test_samples = data_test.text.apply(preprocess_text2)
    
    # FastText Model
    model_fasttext = FastText(sentences=train_samples2, min_count=1,vector_size=200, window=5)
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding(texts, model_fasttext) for texts in train_samples])
    test = np.array([get_doc_embedding(texts, model_fasttext) for texts in test_samples])
    
    return train, test 
    
def glove_twitter_200(data_train, data_test):
    # preprocess samples
    train_samples = data_train.text.apply(preprocess_text2)
    test_samples = data_test.text.apply(preprocess_text2)
    
    # pretrained glove vectors from twitter-200
    glove_vectors = gensim.downloader.load('glove-twitter-200')
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding_glove(texts, glove_vectors) for texts in train_samples])
    test = np.array([get_doc_embedding_glove(texts, glove_vectors) for texts in test_samples])
    
    return train, test 



In [14]:
def doc2vec(data, data_test):
    '''
    https://www.geeksforgeeks.org/doc2vec-in-nlp/
    '''
    
    tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,doc in enumerate(data)]
    # train the Doc2vec model
    model = Doc2Vec(vector_size=20,
                    min_count=2, epochs=50)
    model.build_vocab(tagged_data)
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
     
    # get the document vectors
    document_vectors = [model.infer_vector(
        word_tokenize(doc.lower())) for doc in data]
    document_vectors_test = [model.infer_vector(
        word_tokenize(doc.lower())) for doc in data_test]

    return document_vectors, document_vectors_test

## Models

In [15]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [16]:
def CI(metric, confidence):
    a,b = stats.t.interval(confidence, 
                         len(metric)-1, 
                         loc=metric.mean(), 
                         scale=metric.std(ddof=1)/np.sqrt(len(metric)))
    return a,b

In [17]:
def evaluation(X_train, X_test, t_train, t_test, model, model_name, confidence=0.95, scoring='accuracy'):
    y_train = model.predict(X_train)
    y_test = model.predict(X_test)
    
    scores = cross_val_score(model,
                             X_train, 
                             t_train, 
                             scoring=scoring, 
                             cv=KFold(10, shuffle=True, random_state=0))
    
    a,b = CI(scores, confidence)
    
    print(f'==================={model_name} Performance=====================')
    print('95% CI = [', a, b, ']')
    print('Train: ', classification_report(t_train, y_train))
    print('Test: ', classification_report(t_test, y_test))

In [18]:
def generate_train_test_with_selected_features(df, selected_features):
    # Vectorize the documents (using non-preprocessed data)
    v = selected_features(df.text)

    # Set up the data and labels
    X = np.array(v)
    t = data.source
    X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, random_state=0)
    return X_train, X_test, t_train, t_test

In [19]:
# Reporting the proportion of samples that are ai generated
print('Percent ai:', round(df[df['source'] == 1].shape[0]/df[df['source'] == 0].shape[0]*100, 3))

# Taking a stratified sample of 0.1% of the data
# maintaining same proportions of human and ai samples
data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))

Percent ai: 35.44


  data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))


In [20]:
human = df[df.source == 0]
human = human.sample(2000, random_state=0)
ai = df[df.source == 1].sample(2000, random_state=0)
equal = pd.concat([human,ai], ignore_index=True)
equal

Unnamed: 0,source,id,text
0,0,198049,Overview\n\nBatman and Psychology: A Dark and ...
1,0,12919,The Use of Psychedelic Drugs in Treating Depre...
2,0,979845,member the day like it was yesterday. \n My mo...
3,0,73499,"Legislative Branch Power, Its Limits and Expan..."
4,0,44380,Growth and Fall of Vader Corporation Report\n\...
...,...,...,...
3995,1,1052501,"""Learn Python the Hard Way"" by Zed Shaw\n\t\t..."
3996,1,1277472,Cape Town - The South African Humanist Associa...
3997,1,1086015,"We are only days away from the 2016 NFL Draft,..."
3998,1,1338694,The number of people infected with Zika virus ...


In [46]:
def prep_data(data):
    tex = []
    for i in range(len(data.text)):
        tex += [preprocess_text2(data.text.iloc[i])]
    prepped = data.insert(len(data.columns),'prep_text', tex)
    return prepped

### Bayes Classifier

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
X_train, X_test, t_train, t_test = train_test_split(np.array(equal.text), np.array(equal.source), test_size=0.2, random_state=0)

In [23]:
mnb_pipe = Pipeline([('mnb', MultinomialNB())])
mnb_param = {'mnb__alpha': [0.2, 0.4, 0.6, 0.8, 1], 'mnb__force_alpha': [True, False]}

In [24]:
# Apply Count Vectorizer
X_train_cv, X_test_cv = count_vectorizer(X_train, X_test)
gs3 = GridSearchCV(mnb_pipe, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_cv, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_cv, X_test_cv, t_train, t_test, mnb, 'MNB with Unigrams')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.6976949069861126 0.7373050930138875 ]
Train:                precision    recall  f1-score   support

           0       0.74      0.71      0.73      1593
           1       0.72      0.75      0.74      1607

    accuracy                           0.73      3200
   macro avg       0.73      0.73      0.73      3200
weighted avg       0.73      0.73      0.73      3200

Test:                precision    recall  f1-score   support

           0       0.72      0.69      0.71       407
           1       0.69      0.73      0.71       393

    accuracy                           0.71       800
   macro avg       0.71      0.71      0.71       800
weighted avg       0.71      0.71      0.71       800



In [25]:
# Apply TFIDF
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

gs3 = GridSearchCV(mnb_pipe, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_tfidf, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_tfidf, X_test_tfidf, t_train, t_test, mnb, 'MNB with TF-IDF')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.7239073216747745 0.7460926783252257 ]
Train:                precision    recall  f1-score   support

           0       0.78      0.68      0.73      1593
           1       0.72      0.81      0.76      1607

    accuracy                           0.74      3200
   macro avg       0.75      0.74      0.74      3200
weighted avg       0.75      0.74      0.74      3200

Test:                precision    recall  f1-score   support

           0       0.76      0.69      0.72       407
           1       0.71      0.78      0.74       393

    accuracy                           0.73       800
   macro avg       0.73      0.73      0.73       800
weighted avg       0.74      0.73      0.73       800



In [26]:
mnb_pipe2 = Pipeline([('scaler', MinMaxScaler()),('mnb', MultinomialNB())])

In [39]:
data_train = pd.DataFrame(X_train, columns=['text']) 
data_test = pd.DataFrame(X_test, columns=['text'])

# Apply word2vec skipgram
X_train_skipgram, X_test_skipgram = word2vec_skipgram(data_train, data_test)

gs3 = GridSearchCV(mnb_pipe2, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_skipgram, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_skipgram, X_test_skipgram, t_train, t_test, mnb, 'MNB with word2vec skip-gram')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.7042118449338021 0.7257881550661979 ]
Train:                precision    recall  f1-score   support

           0       0.72      0.71      0.72      1593
           1       0.72      0.72      0.72      1607

    accuracy                           0.72      3200
   macro avg       0.72      0.72      0.72      3200
weighted avg       0.72      0.72      0.72      3200

Test:                precision    recall  f1-score   support

           0       0.74      0.70      0.72       407
           1       0.71      0.75      0.73       393

    accuracy                           0.72       800
   macro avg       0.73      0.73      0.72       800
weighted avg       0.73      0.72      0.72       800



In [41]:
# Apply word2vec cbow
X_train_cbow, X_test_cbow = word2vec_cbow(data_train, data_test)

gs3 = GridSearchCV(mnb_pipe2, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_cbow, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_cbow, X_test_cbow, t_train, t_test, mnb, 'MNB with word2vec CBOW')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.6821208962125406 0.7022541037874596 ]
Train:                precision    recall  f1-score   support

           0       0.68      0.73      0.70      1593
           1       0.71      0.65      0.68      1607

    accuracy                           0.69      3200
   macro avg       0.70      0.69      0.69      3200
weighted avg       0.70      0.69      0.69      3200

Test:                precision    recall  f1-score   support

           0       0.71      0.72      0.72       407
           1       0.71      0.69      0.70       393

    accuracy                           0.71       800
   macro avg       0.71      0.71      0.71       800
weighted avg       0.71      0.71      0.71       800



In [42]:
# Apply fast text
X_train_fast, X_test_fast = fast_text(data_train, data_test)

gs3 = GridSearchCV(mnb_pipe2, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_fast, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_fast, X_test_fast, t_train, t_test, mnb, 'MNB with Fast Text')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.6526113959810843 0.6811386040189155 ]
Train:                precision    recall  f1-score   support

           0       0.66      0.67      0.67      1593
           1       0.67      0.66      0.67      1607

    accuracy                           0.67      3200
   macro avg       0.67      0.67      0.67      3200
weighted avg       0.67      0.67      0.67      3200

Test:                precision    recall  f1-score   support

           0       0.68      0.70      0.69       407
           1       0.68      0.67      0.67       393

    accuracy                           0.68       800
   macro avg       0.68      0.68      0.68       800
weighted avg       0.68      0.68      0.68       800



In [43]:
# Apply glove
X_train_glove, X_test_glove = glove_twitter_200(data_train, data_test)

gs3 = GridSearchCV(mnb_pipe2, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_glove, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_glove, X_test_glove, t_train, t_test, mnb, 'MNB with Glove Twitter 200')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.6955902531535414 0.7112847468464585 ]
Train:                precision    recall  f1-score   support

           0       0.71      0.69      0.70      1593
           1       0.70      0.72      0.71      1607

    accuracy                           0.71      3200
   macro avg       0.71      0.71      0.71      3200
weighted avg       0.71      0.71      0.71      3200

Test:                precision    recall  f1-score   support

           0       0.72      0.70      0.71       407
           1       0.69      0.72      0.70       393

    accuracy                           0.70       800
   macro avg       0.71      0.71      0.70       800
weighted avg       0.71      0.70      0.71       800



In [62]:
data_train = pd.DataFrame(X_train, columns=['text']) 
data_test = pd.DataFrame(X_test, columns=['text'])

In [63]:
# Doc2Vec -- Preprocessed
prep_data(data_train)
prep_data(data_test)

X_train_Doc2Vec, X_test_Doc2Vec = doc2vec(data_train.prep_text, data_test.prep_text)

gs3 = GridSearchCV(mnb_pipe2, 
                   param_grid=mnb_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_Doc2Vec, t_train)
mnb = gs3.best_estimator_
evaluation(X_train_Doc2Vec, X_test_Doc2Vec, t_train, t_test, mnb, 'MNB with doc2vec')

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
95% CI = [ 0.673384419906898 0.754115580093102 ]
Train:                precision    recall  f1-score   support

           0       0.80      0.71      0.75      1593
           1       0.74      0.82      0.78      1607

    accuracy                           0.76      3200
   macro avg       0.77      0.76      0.76      3200
weighted avg       0.77      0.76      0.76      3200

Test:                precision    recall  f1-score   support

           0       0.78      0.73      0.75       407
           1       0.74      0.78      0.76       393

    accuracy                           0.76       800
   macro avg       0.76      0.76      0.76       800
weighted avg       0.76      0.76      0.76       800



### Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier

In [67]:
dt_pipe = Pipeline([('dt', DecisionTreeClassifier())])
# hyperparameters selected taken from https://www.geeksforgeeks.org/how-to-tune-a-decision-tree-in-hyperparameter-tuning/
dt_param = {'dt__max_depth': [10, 20, 30, None],'dt__min_samples_split': [2, 5, 10],'dt__min_samples_leaf': [1, 2, 4]}

In [69]:
# Apply Count Vectorizer
X_train_cv, X_test_cv = count_vectorizer(X_train, X_test)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_cv, t_train)
dt = gs3.best_estimator_
evaluation(X_train_cv, X_test_cv, t_train, t_test, dt, 'DT with Unigrams')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.6657484797758448 0.7017515202241553 ]
Train:                precision    recall  f1-score   support

           0       0.87      0.75      0.81      1593
           1       0.78      0.89      0.83      1607

    accuracy                           0.82      3200
   macro avg       0.83      0.82      0.82      3200
weighted avg       0.83      0.82      0.82      3200

Test:                precision    recall  f1-score   support

           0       0.74      0.67      0.71       407
           1       0.69      0.76      0.72       393

    accuracy                           0.71       800
   macro avg       0.72      0.72      0.71       800
weighted avg       0.72      0.71      0.71       800



In [70]:
# Apply TFIDF
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_tfidf, t_train)
dt = gs3.best_estimator_
evaluation(X_train_tfidf, X_test_tfidf, t_train, t_test, dt, 'DT with TF-IDF')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.6648479725757676 0.7032770274242323 ]
Train:                precision    recall  f1-score   support

           0       0.96      0.87      0.91      1593
           1       0.88      0.96      0.92      1607

    accuracy                           0.92      3200
   macro avg       0.92      0.92      0.92      3200
weighted avg       0.92      0.92      0.92      3200

Test:                precision    recall  f1-score   support

           0       0.71      0.63      0.67       407
           1       0.66      0.73      0.69       393

    accuracy                           0.68       800
   macro avg       0.68      0.68      0.68       800
weighted avg       0.68      0.68      0.68       800



In [74]:
# Apply word2vec skipgram
X_train_skipgram, X_test_skipgram = word2vec_skipgram(data_train, data_test)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_skipgram, t_train)
dt = gs3.best_estimator_
evaluation(X_train_skipgram, X_test_skipgram, t_train, t_test, dt, 'DT with word2vec skip-gram')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.7490946438275777 0.7809053561724223 ]
Train:                precision    recall  f1-score   support

           0       0.94      0.96      0.95      1593
           1       0.96      0.94      0.95      1607

    accuracy                           0.95      3200
   macro avg       0.95      0.95      0.95      3200
weighted avg       0.95      0.95      0.95      3200

Test:                precision    recall  f1-score   support

           0       0.81      0.77      0.79       407
           1       0.77      0.81      0.79       393

    accuracy                           0.79       800
   macro avg       0.79      0.79      0.79       800
weighted avg       0.79      0.79      0.79       800



In [75]:
# Apply word2vec cbow
X_train_cbow, X_test_cbow = word2vec_cbow(data_train, data_test)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_cbow, t_train)
dt = gs3.best_estimator_
evaluation(X_train_cbow, X_test_cbow, t_train, t_test, dt, 'DT with word2vec CBOW')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.7492424934901367 0.7795075065098633 ]
Train:                precision    recall  f1-score   support

           0       0.94      0.93      0.93      1593
           1       0.93      0.94      0.93      1607

    accuracy                           0.93      3200
   macro avg       0.93      0.93      0.93      3200
weighted avg       0.93      0.93      0.93      3200

Test:                precision    recall  f1-score   support

           0       0.81      0.67      0.74       407
           1       0.71      0.84      0.77       393

    accuracy                           0.76       800
   macro avg       0.76      0.76      0.75       800
weighted avg       0.76      0.76      0.75       800



In [76]:
# Apply fast text
X_train_fast, X_test_fast = fast_text(data_train, data_test)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_fast, t_train)
dt = gs3.best_estimator_
evaluation(X_train_fast, X_test_fast, t_train, t_test, dt, 'DT with Fast Text')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.7107364476471649 0.7355135523528349 ]
Train:                precision    recall  f1-score   support

           0       0.82      0.90      0.86      1593
           1       0.89      0.81      0.85      1607

    accuracy                           0.85      3200
   macro avg       0.85      0.85      0.85      3200
weighted avg       0.85      0.85      0.85      3200

Test:                precision    recall  f1-score   support

           0       0.72      0.78      0.75       407
           1       0.75      0.68      0.71       393

    accuracy                           0.73       800
   macro avg       0.73      0.73      0.73       800
weighted avg       0.73      0.73      0.73       800



In [77]:
# Apply glove
X_train_glove, X_test_glove = glove_twitter_200(data_train, data_test)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_glove, t_train)
dt = gs3.best_estimator_
evaluation(X_train_glove, X_test_glove, t_train, t_test, dt, 'DT with Glove Twitter 200')

  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.7136171852921172 0.7420078147078828 ]
Train:                precision    recall  f1-score   support

           0       0.95      0.97      0.96      1593
           1       0.97      0.95      0.96      1607

    accuracy                           0.96      3200
   macro avg       0.96      0.96      0.96      3200
weighted avg       0.96      0.96      0.96      3200

Test:                precision    recall  f1-score   support

           0       0.77      0.79      0.78       407
           1       0.77      0.75      0.76       393

    accuracy                           0.77       800
   macro avg       0.77      0.77      0.77       800
weighted avg       0.77      0.77      0.77       800



In [85]:
data_train = pd.DataFrame(X_train, columns=['text']) 
data_test = pd.DataFrame(X_test, columns=['text'])

In [86]:
# Doc2Vec -- Preprocessed
prep_data(data_train)
prep_data(data_test)

X_train_Doc2Vec, X_test_Doc2Vec = doc2vec(data_train.prep_text, data_test.prep_text)

gs3 = GridSearchCV(dt_pipe, 
                   param_grid=dt_param,
                   cv=KFold(n_splits=5, shuffle=True, random_state=0), 
                   scoring='accuracy',
                   verbose=1, 
                   n_jobs=-1, 
                   refit=True)

gs3.fit(X_train_Doc2Vec, t_train)
dt = gs3.best_estimator_
evaluation(X_train_Doc2Vec, X_test_Doc2Vec, t_train, t_test, dt, 'DT with doc2vec')

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
95% CI = [ 0.7509168289237373 0.7722081710762625 ]
Train:                precision    recall  f1-score   support

           0       0.94      0.90      0.92      1593
           1       0.90      0.94      0.92      1607

    accuracy                           0.92      3200
   macro avg       0.92      0.92      0.92      3200
weighted avg       0.92      0.92      0.92      3200

Test:                precision    recall  f1-score   support

           0       0.74      0.76      0.75       407
           1       0.74      0.72      0.73       393

    accuracy                           0.74       800
   macro avg       0.74      0.74      0.74       800
weighted avg       0.74      0.74      0.74       800

