In [16]:
import string

import contractions
import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from num2words import num2words

nltk.download("punkt", quiet=True)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 1000)


In [2]:
yahoo_df = pd.read_csv(
    "data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_expert_annotations.tsv",
    sep="\t",
)


In [3]:
yahoo_df.head()


Unnamed: 0,sdid,commentindex,headline,url,guid,commentid,timestamp,thumbs-up,thumbs-down,text,parentid,constructiveclass,sd_agreement,sd_type,sentiment,tone,commentagreement,topic,intendedaudience,persuasiveness
0,53971,2,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,rjrPtwH5oVVuQnEXX3hf,00003n000000000000000000000000-ed2ae6d0-32ac-471a-b8b2-a718607ee376,1459917444,,,"These things happen , Every job has its dangers.",1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b4249,Constructive,,Positive/respectful,negative,,Disagreement with commenter,Off-topic with article,Reply to a specific commenter,Not persuasive
1,53971,0,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,VaW6HEsuOFUAIBqjw1k~,1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b4249,1459879464,1.0,,"Sad to hear such a bad thing. Very dangerous job working on electricity. One questions though, why did they use a picture the Bates house from Psycho, on a Disney story? Or is that what the Paris Haunted Mansion/Phantom Manor looks like?",,Constructive,,Positive/respectful,mixed,,,Off-topic with article,Broadcast message / general audience,Not persuasive
2,53971,1,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,uwQePj970KaMZuW3~9Q9,00002n000000000000000000000000-1c30b878-b717-4e9a-9872-2ce2906ce783,1459881644,,,Yes..because too many houses in EU look like the original Disney Hunted House so it didn't look scary enough. Bates Motel looks more American and that notion alone scares everyone.,1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b4249,Constructive,,Positive/respectful,neutral,Informative,,Off-topic with article,Reply to a specific commenter,Not persuasive
3,135929,0,This Old Navy Ad Featuring an Interracial Family Is Being Attacked By Racist Trolls,http://mic.com/articles/142323/this-old-navy-ad-featuring-an-interracial-family-is-being-attacked-by-racist-trolls,fixyWJivQjEQtPLLVXsu,1462203719963-3eeffb02-faae-4b51-9174-704c57e6de37,1462203719,18.0,3.0,"I am frankly quite SICK of the phrase ""shoved down our throat"" You know what? Back in the newspaper and three network days you could say that...Now with 300 or more TV channels and an endless internet...You can keep your throat relatively clear of things you don't want...All you have to do is change the channel or click a link to something you DO like... So let's stop it with the ""shoved down our throat"" rhetoric.",,Not constructive,Agreement throughout,Off-topic/digression,negative,Mean,,Off-topic with article,Broadcast message / general audience,Persuasive
4,135929,1,This Old Navy Ad Featuring an Interracial Family Is Being Attacked By Racist Trolls,http://mic.com/articles/142323/this-old-navy-ad-featuring-an-interracial-family-is-being-attacked-by-racist-trolls,_TDnK715vO5y0OzZz_n4,00002I000000000000000000000000-7ef2ac58-bd84-4027-88cf-b865bfe2f1f8,1462204643,7.0,2.0,"Ya, I always wonder why the conservatives are on Yahoo!, whining about all the liberals, when they could be hanging out with their own kind, patting each other on the back, over at Faux News. Don't like what you see, switch the channel. No, they're a bit hypocritical about shoving things down peoples throats, and want to force they're way upon others.",1462203719963-3eeffb02-faae-4b51-9174-704c57e6de37,Not constructive,Agreement throughout,Off-topic/digression,neutral,Sarcastic,Agreement with commenter,Off-topic with article,Reply to a specific commenter,Not persuasive


In [4]:
def numbers_to_words(text: str) -> str:
    t = text.split()
    for ind, word in enumerate(t):
        if all(c.isdigit() for c in word):
            t[ind] = num2words(word)
        elif (
            len(word) > 2
            and all(c.isdigit() for c in word[:-2])
            and word[-2:] in ["st", "nd", "rd", "th"]
        ):
            t[ind] = num2words(int(word[:-2]), to="ordinal")

    return " ".join(t)


def get_comment_thread(row: pd.Series) -> str:
    if not row["text"]:
        return ""
    if row["text"][-1] not in string.punctuation:
        row["text"] += "."

    if row["commentindex"] != 0:
        parent_df = yahoo_df[yahoo_df.commentid == row["parentid"]]
        if parent_df.shape[0] == 0:
            return row["text"]
        else:
            return f"{parent_df.iloc[0].thread} {row['text']}"
    else:
        return row["text"]


yahoo_df["text"] = (
    yahoo_df["text"]
    .str.replace(r"[^\w\s]+", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
    .str.lower()
    .apply(lambda x: contractions.fix(x, slang=False))
    .apply(numbers_to_words)
    .astype(str)
)
yahoo_df = yahoo_df.sort_values(by=["commentindex"])
yahoo_df["thread"] = ""
for index, row in yahoo_df.iterrows():
    yahoo_df.at[index, "thread"] = get_comment_thread(row)


In [5]:
LANGUAGE = "english"
NUM_SENTENCES = 3

tokenizer = Tokenizer(LANGUAGE)
stemmer = Stemmer(LANGUAGE)
summarizer = LuhnSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)


def summarize_thread(row: pd.Series) -> str:
    if not row["thread"]:
        return ""
    parser = PlaintextParser.from_string(row["thread"], tokenizer)
    return "".join([x._text for x in summarizer(parser.document, NUM_SENTENCES)])


yahoo_df["summary"] = yahoo_df.apply(summarize_thread, axis=1)


In [6]:
with open(
    "data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_train-ids.txt"
) as f:
    train_ids = [int(x) for x in f.read().splitlines()]

with open(
    "data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_dev-ids.txt"
) as f:
    dev_ids = [int(x) for x in f.read().splitlines()]

with open(
    "data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_test-ids.txt"
) as f:
    test_ids = [int(x) for x in f.read().splitlines()]

train_df = yahoo_df[yahoo_df["sdid"].isin(train_ids)]
dev_df = yahoo_df[yahoo_df["sdid"].isin(dev_ids)]
test_df = yahoo_df[yahoo_df["sdid"].isin(test_ids)]


In [7]:
temp_df = yahoo_df.copy(deep=True)
temp_df["train"] = temp_df["sdid"].isin(train_ids)
temp_df["dev"] = temp_df["sdid"].isin(dev_ids)
temp_df["test"] = temp_df["sdid"].isin(test_ids)

with pd.option_context(
    "display.max_rows",
    None,
):
    print(
        temp_df.groupby(
            [
                "sentiment",
                pd.cut(
                    temp_df.commentindex,
                    [0, 3, 6, 9, 12, 15, 18],
                    include_lowest=True,
                    right=False,
                ),
            ]
        )[["train", "dev", "test"]].sum()
    )


                        train  dev  test
sentiment commentindex                  
mixed     [0, 3)         1340  107    86
          [3, 6)         1006   88    35
          [6, 9)          507   27    24
          [9, 12)         189    4     5
          [12, 15)         67    1     0
          [15, 18)          6    0     0
negative  [0, 3)         4764  388   340
          [3, 6)         3541  277   233
          [6, 9)         1407   53    94
          [9, 12)         544   17    13
          [12, 15)        167    0     0
          [15, 18)         11    0     0
neutral   [0, 3)         2278  208   102
          [3, 6)         1873  155    81
          [6, 9)          724   52    24
          [9, 12)         281   12     2
          [12, 15)         90    0     0
          [15, 18)          5    0     0
positive  [0, 3)          685   47    56
          [3, 6)          468   38    32
          [6, 9)          171   15     6
          [9, 12)          48    2     4
          [12, 1

In [8]:
model = api.load("word2vec-google-news-300")


In [23]:
def get_average_w2v_vector(row):
    words = row["text"].split()
    thread_words = row["thread"].split()

    avg_text_w2v = np.sum([model[w] for w in words if w in model], axis=0) / (
        len(words) if words else 1
    )

    if avg_text_w2v.shape != (300,):
        avg_text_w2v = np.zeros((300,))

    avg_thread_w2v = np.sum([model[w] for w in thread_words if w in model], axis=0) / (
        len(thread_words) if thread_words else 1
    )

    if avg_thread_w2v.shape != (300,):
        avg_thread_w2v = np.zeros((300,))

    embedding = np.concatenate((avg_text_w2v, avg_thread_w2v), axis=None)
    return embedding

f1_scores = pd.DataFrame(columns=['Target Column', 'Dev Dataset F1', 'Test Dataset F1'])

for target_col in [
    "persuasiveness",
    "intendedaudience",
    "topic",
    "tone",
    "constructiveclass",
]:

    train_X = train_df.apply(
        lambda x: get_average_w2v_vector(x), axis=1, result_type="expand"
    )
    dev_X = dev_df.apply(
        lambda x: get_average_w2v_vector(x), axis=1, result_type="expand"
    )
    test_X = test_df.apply(
        lambda x: get_average_w2v_vector(x), axis=1, result_type="expand"
    )

    train_y = train_df[target_col]
    dev_y = dev_df[target_col]
    test_y = test_df[target_col]

    train_X = train_X[train_y.notna()]
    train_y = train_y[train_y.notna()]

    dev_X = dev_X[dev_y.notna()]
    dev_y = dev_y[dev_y.notna()]

    test_X = test_X[test_y.notna()]
    test_y = test_y[test_y.notna()]

    clf = LogisticRegression(max_iter=10000)
    clf.fit(train_X, train_y)

    pred_dev_y = clf.predict(dev_X)
    pred_test_y = clf.predict(test_X)

    print(f"{target_col} : ")
    print(f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0))
    print(f1_score(test_y, pred_test_y, average='weighted', zero_division=0))
    f1_scores.loc[len(f1_scores.index)] = [target_col, f1_score(dev_y, pred_dev_y, average='weighted', zero_division=0), f1_score(test_y, pred_test_y, average='weighted', zero_division=0)] 


persuasiveness : 
0.7257473091130282
0.7521520819431557
intendedaudience : 
0.5646423242721982
0.6526634767180163
topic : 
0.5973614756428787
0.675730583993962
tone : 
0.2562503964198996
0.3662351613881465
constructiveclass : 
0.5726089898169763
0.5843001461585312


In [26]:
print(f1_scores.to_latex(index=False))

ImportError: Missing optional dependency 'Jinja2'. DataFrame.style requires jinja2. Use pip or conda to install Jinja2.