In [1]:
import numpy as np
import nltk
import pandas as pd
from datasets import load_dataset
import re
import string
from bs4 import BeautifulSoup
import sklearn
#import spacy

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexacole/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
dataset = load_dataset('artem9k/ai-text-detection-pile')
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})

In [3]:
df = pd.DataFrame.from_dict(dataset['train'])
df.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Reformat Dataset

In [4]:
df['source'].unique()

array(['human', 'ai'], dtype=object)

In [5]:
df['source'] = [1 if x == 'ai' else 0 for x in df['source']]

In [6]:
df.head()

Unnamed: 0,source,id,text
0,0,0,12 Years a Slave: An Analysis of the Film Essa...
1,0,1,20+ Social Media Post Ideas to Radically Simpl...
2,0,2,2022 Russian Invasion of Ukraine in Global Med...
3,0,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,0,4,A Charles Schwab Corporation Case Essay\n\nCha...


## Preprocessing

In [7]:
# functions for preprocessing
def remove_urls(text):
    return re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", " ", text) # regex taken from https://www.geeksforgeeks.org/python-check-url-string/

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_extra_whitespace(text):
    text = text.strip()
    text = " ".join(text.split())
    return text

def remove_stop_words(text):
    tokens = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

def lemmatizer(text):
    tokens = nltk.word_tokenize(text)
    l = nltk.stem.WordNetLemmatizer()
    tokens = [l.lemmatize(token) for token in tokens]
    return " ".join(tokens)

# eliminate punctuation
def remove_punctuation(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return " ".join(tokens)

def tokenize_pre_process(text): # for preprocessing using this link: https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/
    # tokenize
    tokens = nltk.word_tokenize(text)

    # remove stop words
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]

    # remove top 10% most frequent words 
    fdist = nltk.FreqDist(tokens)
    tokens = [token for token in tokens if fdist[token] < fdist.N() * 0.1]

    # stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # eliminate punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    return tokens

In [8]:
def preprocess_text(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)
    
    # remove punctuation
    text = remove_punctuation(text)

    return text

In [9]:
def preprocess_text2(text):
    # encoding to ascii
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # convert text to lower case
    text = text.lower()

    # remove html tags 
    text = remove_html(text)

    # remove urls 
    text = remove_urls(text)

    # remove extra whitespace
    text = remove_extra_whitespace(text)

    # remove stop words
    text = remove_stop_words(text)
    
    # remove punctuation
    text = remove_punctuation(text)

    # lemmatize words
    text = lemmatizer(text)
    
    return text

## Feature Engineering

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from gensim.models import Word2Vec
from gensim.models import FastText
import gensim.downloader
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [11]:
# Calculate document embeddings
# Extracted on Google
def get_doc_embedding(doc, model):
    vectors = [model.wv[word.lower()] for word in doc.split() if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [12]:
def get_doc_embedding_glove(doc, model):
    vectors = [model[word.lower()] for word in doc.split() if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [13]:
# functions for features

def count_vectorizer(data_train, data_test):
    vec = CountVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
    train = vec.fit_transform(data_train)
    test = vec.transform(data_test)
    return train, test

def tfidf(data_train, data_test):
    tfidf = TfidfVectorizer(preprocessor=preprocess_text2,max_df=0.9,min_df=0.1)
    train = tfidf.fit_transform(data_train)
    test = tfidf.transform(data_test)
    return train, test 

def word2vec_skipgram(data_train, data_test):
    # preprocess and tokenize samples
    train_samples = data_train.text.apply(preprocess_text2)
    train_samples = train_samples.apply(word_tokenize)
    test_samples = data_test.text.apply(preprocess_text2)
    test_samples = test_samples.apply(word_tokenize)
    
    # skipgram word2vec model
    model1 = Word2Vec(sentences=train_samples, min_count=1,vector_size=200, window=5,sg = 1)
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding(texts, model1) for texts in data_train.text])
    test = np.array([get_doc_embedding(texts, model1) for texts in data_test.text])
    
    return train, test 

def word2vec_cbow(data_train, data_test):
    # preprocess and tokenize samples
    train_samples = data_train.text.apply(preprocess_text2)
    train_samples = train_samples.apply(word_tokenize)
    test_samples = data_test.text.apply(preprocess_text2)
    test_samples = test_samples.apply(word_tokenize)
    
    # cbow word2vec model
    model1 = Word2Vec(sentences=train_samples, min_count=1,vector_size=200, window=5,sg=0)
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding(texts, model1) for texts in data_train.text])
    test = np.array([get_doc_embedding(texts, model1) for texts in data_test.text])
    
    return train, test 

def fast_text(data_train, data_test):
    # preprocess and tokenize samples
    train_samples = data_train.text.apply(preprocess_text2)
    train_samples = train_samples.apply(word_tokenize)
    test_samples = data_test.text.apply(preprocess_text2)
    test_samples = test_samples.apply(word_tokenize)
    
    # FastText Model
    model_fasttext = FastText(sentences=train_samples, min_count=1,vector_size=200, window=5)
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding(texts, model_fasttext) for texts in data_train.text])
    test = np.array([get_doc_embedding(texts, model_fasttext) for texts in data_test.text])
    
    return train, test 
    
def glove_twitter_200(data_train, data_test): # FIX ME -- CHECK HOW THIS WORKS WITH DATA NOT PRE-PROCESSED
    # pretrained glove vectors from twitter-200
    glove_vectors = gensim.downloader.load('glove-twitter-200')
    
    # get averaged word vectors for each sample
    train = np.array([get_doc_embedding_glove(texts, glove_vectors) for texts in data_train.text])
    test = np.array([get_doc_embedding_glove(texts, glove_vectors) for texts in data_test.text])
    
    return train, test 



In [14]:
def doc2vec(data, data_test):
    '''
    https://www.geeksforgeeks.org/doc2vec-in-nlp/
    '''
    
    tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,doc in enumerate(data)]
    # train the Doc2vec model
    model = Doc2Vec(vector_size=20,
                    min_count=2, epochs=50)
    model.build_vocab(tagged_data)
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
     
    # get the document vectors
    document_vectors = [model.infer_vector(
        word_tokenize(doc.lower())) for doc in data]
    document_vectors_test = [model.infer_vector(
        word_tokenize(doc.lower())) for doc in data_test]

    return document_vectors, document_vectors_test

## Models

In [15]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler

In [16]:
def CI(metric, confidence):
    a,b = stats.t.interval(confidence, 
                         len(metric)-1, 
                         loc=metric.mean(), 
                         scale=metric.std(ddof=1)/np.sqrt(len(metric)))
    return a,b

In [17]:
def evaluation(X_train, X_test, t_train, t_test, model, confidence=0.95, scoring='accuracy'):

    y_train = model.predict(X_train)
    y_test = model.predict(X_test)
    
    scores = cross_val_score(model,
                             X_train, 
                             t_train, 
                             scoring=scoring, 
                             cv=KFold(10, shuffle=True, random_state=0))
    
    a,b = CI(scores, confidence)
    
    print('===================Model Performance=====================')
    print('95% CI = [', a, b, ']')
    print('Train: ', classification_report(t_train, y_train))
    print('Test: ', classification_report(t_test, y_test))

In [18]:
# Reporting the proportion of samples that are ai generated
print('Percent ai:', round(df[df['source'] == 1].shape[0]/df[df['source'] == 0].shape[0]*100, 3))

# Taking a stratified sample of 0.1% of the data
# maintaining same proportions of human and ai samples
data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))

Percent ai: 35.44


  data = df.groupby('source', group_keys=False).apply(lambda x: x.sample(frac=0.001, random_state=0))


In [19]:
def generate_train_test_with_selected_features(df, selected_features):
    # Vectorize the documents (using non-preprocessed data)
    v = selected_features(df.text)

    # Set up the data and labels
    X = np.array(v)
    t = data.source
    X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, random_state=0)
    return X_train, X_test, t_train, t_test

### Bayes Classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
X_train, X_test, t_train, t_test = train_test_split(np.array(data.text), np.array(data.source), test_size=0.2, random_state=0)

In [22]:
# Apply Count Vectorizer
X_train_cv, X_test_cv = count_vectorizer(X_train, X_test)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_cv, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [23]:
evaluation(X_train_cv, X_test_cv, t_train, t_test, mnb)

95% CI = [ 0.7332838941344157 0.7941774958269743 ]
Train:                precision    recall  f1-score   support

           0       0.86      0.86      0.86       823
           1       0.60      0.61      0.61       290

    accuracy                           0.79      1113
   macro avg       0.73      0.73      0.73      1113
weighted avg       0.79      0.79      0.79      1113

Test:                precision    recall  f1-score   support

           0       0.82      0.85      0.83       205
           1       0.54      0.49      0.51        74

    accuracy                           0.75       279
   macro avg       0.68      0.67      0.67       279
weighted avg       0.75      0.75      0.75       279



In [24]:
# Apply TFIDF
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [25]:
evaluation(X_train_tfidf, X_test_tfidf, t_train, t_test, mnb)

95% CI = [ 0.7086692187846886 0.7792973191818493 ]
Train:                precision    recall  f1-score   support

           0       0.76      0.98      0.86       823
           1       0.70      0.12      0.21       290

    accuracy                           0.76      1113
   macro avg       0.73      0.55      0.53      1113
weighted avg       0.74      0.76      0.69      1113

Test:                precision    recall  f1-score   support

           0       0.75      0.98      0.85       205
           1       0.60      0.08      0.14        74

    accuracy                           0.74       279
   macro avg       0.67      0.53      0.50       279
weighted avg       0.71      0.74      0.66       279



In [26]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply word2vec skipgram
X_train_skipgram, X_test_skipgram = word2vec_skipgram(data_train, data_test)

# MinMax Scaler on wordvecs to input into mnb
scaler = MinMaxScaler()
X_train_skipgram = scaler.fit_transform(X_train_skipgram)
X_test_skipgram = scaler.transform(X_test_skipgram)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_skipgram, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [27]:
evaluation(X_train_skipgram, X_test_skipgram, t_train, t_test, mnb)

95% CI = [ 0.6562197461810142 0.747271244809977 ]
Train:                precision    recall  f1-score   support

           0       0.77      0.85      0.81       823
           1       0.40      0.29      0.33       290

    accuracy                           0.70      1113
   macro avg       0.59      0.57      0.57      1113
weighted avg       0.68      0.70      0.69      1113

Test:                precision    recall  f1-score   support

           0       0.78      0.90      0.84       205
           1       0.51      0.28      0.37        74

    accuracy                           0.74       279
   macro avg       0.64      0.59      0.60       279
weighted avg       0.71      0.74      0.71       279



In [28]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply word2vec cbow
X_train_cbow, X_test_cbow = word2vec_cbow(data_train, data_test)

# MinMax Scaler on wordvecs to input into mnb
scaler = MinMaxScaler()
X_train_cbow = scaler.fit_transform(X_train_cbow)
X_test_cbow = scaler.transform(X_test_cbow)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_cbow, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [29]:
evaluation(X_train_cbow, X_test_cbow, t_train, t_test, mnb)

95% CI = [ 0.6983136306584501 0.7788581840133648 ]
Train:                precision    recall  f1-score   support

           0       0.75      0.98      0.85       823
           1       0.50      0.06      0.10       290

    accuracy                           0.74      1113
   macro avg       0.62      0.52      0.47      1113
weighted avg       0.68      0.74      0.65      1113

Test:                precision    recall  f1-score   support

           0       0.75      0.99      0.85       205
           1       0.71      0.07      0.12        74

    accuracy                           0.75       279
   macro avg       0.73      0.53      0.49       279
weighted avg       0.74      0.75      0.66       279



In [30]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply fast text
X_train_fast, X_test_fast = fast_text(data_train, data_test)

# MinMax Scaler on wordvecs to input into mnb
scaler = MinMaxScaler()
X_train_fast = scaler.fit_transform(X_train_fast)
X_test_fast = scaler.transform(X_test_fast)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_fast, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [31]:
evaluation(X_train_fast, X_test_fast, t_train, t_test, mnb)

95% CI = [ 0.6761393540161145 0.745289217412457 ]
Train:                precision    recall  f1-score   support

           0       0.81      0.80      0.81       823
           1       0.45      0.46      0.45       290

    accuracy                           0.71      1113
   macro avg       0.63      0.63      0.63      1113
weighted avg       0.71      0.71      0.71      1113

Test:                precision    recall  f1-score   support

           0       0.79      0.79      0.79       205
           1       0.42      0.42      0.42        74

    accuracy                           0.69       279
   macro avg       0.60      0.60      0.60       279
weighted avg       0.69      0.69      0.69       279



In [32]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply glove
X_train_glove, X_test_glove = glove_twitter_200(data_train, data_test)

# MinMax Scaler on wordvecs to input into mnb
scaler = MinMaxScaler()
X_train_glove = scaler.fit_transform(X_train_glove)
X_test_glove = scaler.transform(X_test_glove)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_glove, t_train)

In [33]:
evaluation(X_train_glove, X_test_glove, t_train, t_test, mnb)

95% CI = [ 0.7039818173925388 0.7767453383346171 ]
Train:                precision    recall  f1-score   support

           0       0.74      1.00      0.85       823
           1       0.71      0.02      0.03       290

    accuracy                           0.74      1113
   macro avg       0.73      0.51      0.44      1113
weighted avg       0.74      0.74      0.64      1113

Test:                precision    recall  f1-score   support

           0       0.73      1.00      0.85       205
           1       0.00      0.00      0.00        74

    accuracy                           0.73       279
   macro avg       0.37      0.50      0.42       279
weighted avg       0.54      0.73      0.62       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
# Doc2Vec -- Not Preprocessed
X_train_Doc2Vec, X_test_Doc2Vec = doc2vec(X_train, X_test)

# MinMax Scaler on wordvecs to input into mnb
#scaler = MinMaxScaler()
X_train_Doc2Vec = scaler.fit_transform(X_train_Doc2Vec)
X_test_Doc2Vec = scaler.transform(X_test_Doc2Vec)

# Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train_Doc2Vec, t_train)

In [35]:
evaluation(X_train_Doc2Vec, X_test_Doc2Vec, t_train, t_test, mnb)

95% CI = [ 0.6990861106892029 0.7798553307522386 ]
Train:                precision    recall  f1-score   support

           0       0.74      1.00      0.85       823
           1       0.00      0.00      0.00       290

    accuracy                           0.74      1113
   macro avg       0.37      0.50      0.43      1113
weighted avg       0.55      0.74      0.63      1113

Test:                precision    recall  f1-score   support

           0       0.73      1.00      0.85       205
           1       0.00      0.00      0.00        74

    accuracy                           0.73       279
   macro avg       0.37      0.50      0.42       279
weighted avg       0.54      0.73      0.62       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
# Apply Count Vectorizer
X_train_cv, X_test_cv = count_vectorizer(X_train, X_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_cv, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [38]:
evaluation(X_train_cv, X_test_cv, t_train, t_test, dt)

95% CI = [ 0.6576408579421741 0.7313713685700524 ]
Train:                precision    recall  f1-score   support

           0       1.00      0.99      1.00       823
           1       0.98      1.00      0.99       290

    accuracy                           1.00      1113
   macro avg       0.99      1.00      0.99      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.81      0.84      0.82       205
           1       0.51      0.46      0.48        74

    accuracy                           0.74       279
   macro avg       0.66      0.65      0.65       279
weighted avg       0.73      0.74      0.73       279



In [39]:
# Apply TFIDF
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_tfidf, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [40]:
evaluation(X_train_tfidf, X_test_tfidf, t_train, t_test, dt)

95% CI = [ 0.6793963586654549 0.741855250086154 ]
Train:                precision    recall  f1-score   support

           0       1.00      0.99      1.00       823
           1       0.98      1.00      0.99       290

    accuracy                           1.00      1113
   macro avg       0.99      1.00      0.99      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.81      0.82      0.82       205
           1       0.49      0.47      0.48        74

    accuracy                           0.73       279
   macro avg       0.65      0.65      0.65       279
weighted avg       0.73      0.73      0.73       279



In [41]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply word2vec skipgram
X_train_skipgram, X_test_skipgram = word2vec_skipgram(data_train, data_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_skipgram, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [42]:
evaluation(X_train_skipgram, X_test_skipgram, t_train, t_test, dt)

95% CI = [ 0.7201853848732156 0.7909632637754334 ]
Train:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       823
           1       1.00      1.00      1.00       290

    accuracy                           1.00      1113
   macro avg       1.00      1.00      1.00      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.85      0.83      0.84       205
           1       0.56      0.61      0.58        74

    accuracy                           0.77       279
   macro avg       0.71      0.72      0.71       279
weighted avg       0.78      0.77      0.77       279



In [43]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply word2vec cbow
X_train_cbow, X_test_cbow = word2vec_cbow(data_train, data_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_cbow, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [44]:
evaluation(X_train_cbow, X_test_cbow, t_train, t_test, dt)

95% CI = [ 0.6949657472184051 0.7353270455743878 ]
Train:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       823
           1       1.00      1.00      1.00       290

    accuracy                           1.00      1113
   macro avg       1.00      1.00      1.00      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.83      0.82      0.83       205
           1       0.53      0.54      0.53        74

    accuracy                           0.75       279
   macro avg       0.68      0.68      0.68       279
weighted avg       0.75      0.75      0.75       279



In [45]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply fast text
X_train_fast, X_test_fast = fast_text(data_train, data_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_fast, t_train)

  soup = BeautifulSoup(text, "html.parser")


In [46]:
evaluation(X_train_fast, X_test_fast, t_train, t_test, dt)

95% CI = [ 0.6912654922156596 0.7336058076556402 ]
Train:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       823
           1       1.00      1.00      1.00       290

    accuracy                           1.00      1113
   macro avg       1.00      1.00      1.00      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.80      0.81      0.80       205
           1       0.45      0.43      0.44        74

    accuracy                           0.71       279
   macro avg       0.62      0.62      0.62       279
weighted avg       0.71      0.71      0.71       279



In [47]:
data_train = pd.DataFrame(X_train, columns=['text']) #pd.DataFrame(list(zip(X_train, t_train)), columns=['text', 'labels'])
data_test = pd.DataFrame(X_test, columns=['text']) #pd.DataFrame(list(zip(X_test, t_test)), columns=['text', 'labels'])

# Apply glove
X_train_glove, X_test_glove = glove_twitter_200(data_train, data_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_glove, t_train)

In [48]:
evaluation(X_train_glove, X_test_glove, t_train, t_test, dt)

95% CI = [ 0.7128869626072183 0.7695615573413017 ]
Train:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       823
           1       1.00      1.00      1.00       290

    accuracy                           1.00      1113
   macro avg       1.00      1.00      1.00      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.87      0.85      0.86       205
           1       0.62      0.65      0.63        74

    accuracy                           0.80       279
   macro avg       0.74      0.75      0.75       279
weighted avg       0.80      0.80      0.80       279



In [49]:
# Doc2Vec -- Not Preprocessed
X_train_Doc2Vec, X_test_Doc2Vec = doc2vec(X_train, X_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_Doc2Vec, t_train)

In [50]:
evaluation(X_train_Doc2Vec, X_test_Doc2Vec, t_train, t_test, dt)

95% CI = [ 0.7220547124355128 0.776593936213136 ]
Train:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       823
           1       1.00      1.00      1.00       290

    accuracy                           1.00      1113
   macro avg       1.00      1.00      1.00      1113
weighted avg       1.00      1.00      1.00      1113

Test:                precision    recall  f1-score   support

           0       0.83      0.90      0.86       205
           1       0.63      0.49      0.55        74

    accuracy                           0.79       279
   macro avg       0.73      0.69      0.71       279
weighted avg       0.78      0.79      0.78       279

