<a href="https://colab.research.google.com/github/blackBagel/Sarcasm-Detection-Learn/blob/main/sarcasm_detection_simple_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import os
from google.colab import drive
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import spacy
import re
from sklearn import preprocessing, decomposition, model_selection, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.pipeline import Pipeline
import eli5
import matplotlib.pyplot as plt

# Data preparation

### Load the data into a list of strings

In [None]:
# Mount Google Drive, then make the project folder the working directory so
# that the relative data path used when loading the CSV resolves against it.
drive.mount('/gdrive')

DRIVE_PATH = '/gdrive/My Drive/Data Projects/Reddit reimagined'
os.chdir(DRIVE_PATH)

In [None]:
# Column names, in file order; passed as `names=` to `pd.read_csv` when the
# dataset is loaded.
columns = [
    'label', 'comment',
    'author', 'subreddit',
    'score', 'ups', 'downs',
    'date', 'created_utc',
    'parent_comment',
]

In [None]:
# Read the tab-separated file with explicitly supplied column names and keep
# only the rows that have no missing value in any column.
full_comments_df = pd.read_csv(
    "train-balanced-sarc.csv",
    delimiter='\t',
    names=columns,
).dropna()

# Parallel lists: full_labels[i] is the label of full_comments[i].
full_labels = full_comments_df['label'].tolist()
full_comments = full_comments_df['comment'].tolist()

# Research Process

## We begin our research by trying out a basic logistic regression model with different feature sets



### First, a relatively primitive count vectorizer of the texts

In [None]:
# One fixed, seeded split (20% held out for testing) shared by the raw-text
# experiments in this part of the research.
x_train, x_test, y_train, y_test = train_test_split(
    full_comments,
    full_labels,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

In [None]:
# Bag-of-words features: unigram and bigram counts, restricted to terms that
# appear in at least two documents (min_df=2).
ctv = CountVectorizer(min_df=2, ngram_range=(1, 2))

# Logistic regression fitted with the SAGA solver.
lr_ctv_clf = LogisticRegression(C=1, solver='saga', n_jobs=4, verbose=0)

# Vectorizer and classifier chained so raw text can be passed straight in.
ctv_logit_pipeline = Pipeline(steps=[
    ('counter', ctv),
    ('logitRegressor', lr_ctv_clf),
])

In [None]:
%%time
# Fit the count-vectorizer + logistic-regression pipeline on the training
# split. The %%time cell magic has to stay on the first line of the cell; it
# reports how long the fit takes.
ctv_logit_pipeline.fit(x_train, y_train)

In [None]:
# Evaluate the count-vectorizer pipeline on the held-out split: log-loss is
# computed from the predicted class probabilities, accuracy from score().
predictions = ctv_logit_pipeline.predict_proba(x_test)

ctv_logloss = log_loss(y_test, predictions)
print("logloss: %0.3f " % ctv_logloss)

ctv_accuracy = ctv_logit_pipeline.score(x_test, y_test)
print("accuracy: %0.3f " % ctv_accuracy)

### We also test a basic tf-idf vector

In [None]:
# Baseline model: a logistic regression trained on unigram tf-idf vectors of
# the original comment text (TfidfVectorizer with its default settings).
tfv = TfidfVectorizer()

# The classifier must be created *before* the pipeline that wraps it.
# Defining it afterwards raises NameError on a fresh kernel, or silently
# wraps a stale classifier left over from an earlier run.
lr_tfv_clf = LogisticRegression(solver='saga')

tfidf_logit_pipeline = Pipeline([('tf_idf', tfv),
                                 ('logitRegressor', lr_tfv_clf)])

In [None]:
%%time
# Fit the baseline tf-idf + logistic-regression pipeline on the training
# split. The %%time cell magic has to stay on the first line of the cell; it
# reports how long the fit takes.
tfidf_logit_pipeline.fit(x_train, y_train)

In [None]:
# Evaluate through the pipeline rather than the bare classifier: x_test holds
# raw comment strings, which have to go through the fitted tf-idf step before
# they reach the logistic regression.
predictions = tfidf_logit_pipeline.predict_proba(x_test)

print("logloss: %0.3f " % log_loss(y_test, predictions))
print("accuracy: %0.3f " % tfidf_logit_pipeline.score(x_test, y_test))

### We try a more sophisticated tf-idf vector

In [None]:
# Same model, richer features: this tf-idf vectorizer covers unigrams and
# bigrams and ignores terms found in fewer than two documents (min_df=2).
tfv = TfidfVectorizer(min_df=2, ngram_range=(1, 2))

lr_tfv_clf = LogisticRegression(C=1, solver='saga', n_jobs=4, verbose=0)

tfidf_logit_pipeline = Pipeline(steps=[
    ('tf_idf', tfv),
    ('logitRegressor', lr_tfv_clf),
])

In [None]:
%%time
# Fit the unigram+bigram tf-idf pipeline on the training split. The %%time
# cell magic has to stay on the first line of the cell; it reports how long
# the fit takes.
tfidf_logit_pipeline.fit(x_train, y_train)

In [None]:
# Held-out evaluation of the unigram+bigram tf-idf pipeline.
# This model gave us the best performance for this part of the research.
predictions = tfidf_logit_pipeline.predict_proba(x_test)

tfidf_logloss = log_loss(y_test, predictions)
print("logloss: %0.3f " % tfidf_logloss)

tfidf_accuracy = tfidf_logit_pipeline.score(x_test, y_test)
print("accuracy: %0.3f " % tfidf_accuracy)

In [None]:
# Because it was the best model, we inspect its learned weights to better
# understand what drives its performance.
best_steps = tfidf_logit_pipeline.named_steps
eli5.show_weights(estimator=best_steps['logitRegressor'],
                  vec=best_steps['tf_idf'])

### We theorize that certain part-of-speech (POS) tags could be strong indicators of sarcasm

In [None]:
# The model must be installed once per environment before it can be loaded:
#   !python -m spacy download en_core_web_lg
# The cells below only read token tags, so the components listed here are
# switched off when loading.
disabled_components = ['parser', 'ner', 'textcat']
nlp = spacy.load('en_core_web_lg', disable=disabled_components)

In [None]:
print('Parsing pos tags...')

# Parse the loaded comments (`full_comments`; the previous `comments` name was
# never defined). nlp.pipe streams the texts through spaCy in batches, which
# is the recommended way to process many documents; tqdm needs `total`
# because it is wrapping a generator.
parsed_comments = list(tqdm(nlp.pipe(full_comments), total=len(full_comments)))

# One string per comment: its tokens' fine-grained tags (token.tag_) joined by
# single spaces. Order matches full_comments, and therefore full_labels.
comments_pos_tags = [
    ' '.join(token.tag_ for token in parsed_comment)
    for parsed_comment in parsed_comments
]

In [None]:
# Same model as the best raw-text one (tf-idf over unigrams and bigrams that
# occur in at least two documents), but each "word" is now a POS tag.
#
# TfidfVectorizer's default token_pattern only keeps tokens made of two or
# more word characters, so punctuation tags such as "." "," ":" would be
# dropped and "PRP$" / "WP$" would be merged with "PRP" / "WP". The tag
# strings are already space-separated, so split on whitespace instead, and
# keep the original case so tags stay recognisable in the weights table.
tfv = TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                      token_pattern=r'\S+', lowercase=False)
lr_tfv_clf = LogisticRegression(solver='saga', C=1, n_jobs=4, verbose=0)
tfidf_logit_pipeline = Pipeline([('tf_idf', tfv),
                                 ('logitRegressor', lr_tfv_clf)])

# Re-split with the same seed and test size as the raw-text experiments, this
# time over the POS-tag strings. Note this rebinds x_train / x_test / y_train /
# y_test for every cell that runs afterwards.
x_train, x_test, y_train, y_test = train_test_split(comments_pos_tags,
                                                    full_labels,
                                                    random_state=1,
                                                    test_size=0.2,
                                                    shuffle=True)

In [None]:
%%time
# Fit the tf-idf pipeline on the POS-tag training split (x_train was re-split
# from comments_pos_tags in the previous cell). The %%time cell magic has to
# stay on the first line of the cell; it reports how long the fit takes.
tfidf_logit_pipeline.fit(x_train, y_train)

In [None]:
# Held-out evaluation of the POS-tag tf-idf pipeline.
predictions = tfidf_logit_pipeline.predict_proba(x_test)

pos_logloss = log_loss(y_test, predictions)
print("logloss: %0.3f " % pos_logloss)

pos_accuracy = tfidf_logit_pipeline.score(x_test, y_test)
print("accuracy: %0.3f " % pos_accuracy)

# As it seems, replacing the words with plain POS tags loses some information
# that is vital for the classification; the weights table below shows which
# tag n-grams the model relies on.
pos_steps = tfidf_logit_pipeline.named_steps
eli5.show_weights(estimator=pos_steps['logitRegressor'],
                  vec=pos_steps['tf_idf'])