In [176]:
import pandas as pd
from datascience import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [177]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [249]:
pandas_table = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines = True)
articles = Table.from_df(pandas_table)
headlines = articles.column('headline')
headlines_train = headlines[:int(0.7*len(headlines))]
labels_train = articles.column('is_sarcastic')[:int(0.7*len(headlines))]
headlines_test = headlines[int(0.7*len(headlines)):]
labels_test = articles.column('is_sarcastic')[int(0.7*len(headlines)):]

In [253]:
sw = stopwords.words("english")
stemmer = SnowballStemmer("english")
def clean(textList):
    def split_punctuations(word_list):
        punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
        new_word_list = []
        for word in word_list:
            if word[0] in punctuations:
                new_word_list.append(word[0])
                word = word[1:]
            if word != '':
                if word[-1] in punctuations:
                    punc = word[-1]
                    word = word[:-1]
                    if word != '':
                        new_word_list.append(word)   
                    new_word_list.append(punc)
                else:
                    new_word_list.append(word)
        return new_word_list
    cleaned_textlists = [[stemmer.stem(word) for word in text.split() if word.lower() not in sw] for text in textList]
    cleaned_headlines = [' '.join(split_punctuations(textlist)) for textlist in cleaned_textlists]
    return cleaned_headlines   

def vectorize_fitter(textList):
    cleanTextList = clean(textList)
    vectorizer = CountVectorizer()
    vectorizer.fit(cleanTextList)
    return vectorizer

def vectorize_text(vectorizer, textList):
    return vectorizer.transform(clean(textList)).toarray()

vectorizer = vectorize_fitter(headlines)
features_train = vectorize_text(vectorizer, headlines_train)
features_test = vectorize_text(vectorizer, headlines_test)

In [254]:
clf = GaussianNB()
clf.fit(features_train, labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [255]:
predictions = clf.predict(features_test)
print(accuracy_score(labels_test, predictions))

0.6912415560214302


In [256]:
print(predictions)

[1 0 0 ... 0 1 0]


In [244]:
character_counts = [len(headline) for headline in headlines]
def augmented_predict(clf, features):
    predictions = []
    for i in range(len(features)):
        """
        if character_counts[i] <= 10:
            predictions.append(1)
        elif sum(features[i]) <= 1 or sum(features[i]) >= 30:
            predictions.append(1)
        """
        word_list = get_words(features[i])
        counter = False
        if word_list:
            for word in word_list:
                if word in su:
                    counter = True
        if counter:
            predictions.append(1)
        else:
            predictions.append(clf.predict([features[i]])[0])
    return predictions

In [248]:
words = vectorizer.get_feature_names()
for word in words:
    if len(word) == 1:
        print(word)
word_dict = vectorizer.vocabulary_
def vectorize_string(sentence, vectorizer):
    return vectorizer.transform(clean([sentence])).toarray()[0]
def predict_headline(headline, vectorizer, clf):
    return clf.predict(vectorizer.transform(clean([headline])).toarray())
str_vec = vectorize_string("cat cat cat", vectorizer)
def get_words(str_vec):
    word_list = []
    for i in range(len(str_vec)):
        if str_vec[i]:
            print(words[i])
            word_list.append(words[i])
    return word_list

In [187]:
articles = articles.with_column('word_count', [len(headline.split()) for headline in headlines])
articles = articles.with_column('character_count', [len(headline) for headline in headlines])
word_counts = articles.column('word_count')
character_counts = articles.column('character_count')

In [9]:
new_features_train = [[feature.copy()] for feature in features_train]
new_features_test = [[feature.copy()] for feature in features_test]

In [10]:
for i in range(len(new_features_train)):
    new_features_train[i].pop()
    new_features_train[i] += [word_counts[i], character_counts[i]]
for i in range(len(new_features_test)):
    new_features_test[i].pop()
    new_features_test[i] += [word_counts[22000+i], character_counts[22000+i]]

In [11]:
clf2 = GaussianNB()
clf2.fit(new_features_train, labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
predictions = clf2.predict(new_features_test)
print(accuracy_score(labels_test, predictions))

0.5744070101223749
