In [None]:
import string

import contractions
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from num2words import num2words

# Fetch the NLTK "punkt" resource up front; quiet=True suppresses the download log.
# NOTE(review): presumably needed by the sumy Tokenizer created further down — confirm.
nltk.download("punkt", quiet=True)

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

# Widen pandas' display limits so long comment text is not truncated in outputs.
for option_name, option_value in (
    ("display.max_rows", 50),
    ("display.max_columns", 50),
    ("display.max_colwidth", 1000),
):
    pd.set_option(option_name, option_value)


In [None]:
# Load the expert annotations; read_table parses tab-separated files by default.
yahoo_df = pd.read_table(
    "data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_expert_annotations.tsv"
)


In [None]:
# Preview the first five annotated comments.
yahoo_df.head(5)


In [None]:
def numbers_to_words(text: str) -> str:
    """Spell out standalone numbers and ordinals in ``text``.

    The text is split on whitespace. A token made up solely of decimal digits
    (e.g. ``"42"``) is replaced by its cardinal spelling, and a token made up
    of decimal digits followed by ``st``/``nd``/``rd``/``th`` (e.g. ``"3rd"``)
    by its ordinal spelling, both via ``num2words``. Every other token is kept
    unchanged.

    Args:
        text: Input string.

    Returns:
        The tokens re-joined with single spaces, so runs of whitespace are
        collapsed and leading/trailing whitespace is dropped.
    """
    ordinal_suffixes = ("st", "nd", "rd", "th")
    tokens = text.split()
    for ind, word in enumerate(tokens):
        # str.isdecimal() rather than str.isdigit(): isdigit() also accepts
        # characters such as superscript "²", which int() cannot parse and
        # which would make the conversion raise.
        if word.isdecimal():
            tokens[ind] = num2words(int(word))
        elif word[:-2].isdecimal() and word[-2:] in ordinal_suffixes:
            # A non-empty all-decimal prefix implies the token is longer than
            # its two-character suffix.
            tokens[ind] = num2words(int(word[:-2]), to="ordinal")

    return " ".join(tokens)


def get_comment_thread(row: pd.Series, comments_df=None) -> str:
    """Build the thread text that ends with this comment.

    The comment's own text is terminated with a full stop unless it already
    ends in a punctuation character. For a reply (``commentindex != 0``) the
    already-computed ``thread`` of the first row whose ``commentid`` equals
    this row's ``parentid`` is prepended, separated by a single space.

    Args:
        row: A comment row providing ``text``, ``commentindex`` and
            ``parentid``. It is not modified.
        comments_df: Frame in which to look up the parent comment; it must
            have ``commentid`` and ``thread`` columns. Defaults to the
            module-level ``yahoo_df``.

    Returns:
        The thread text, or ``""`` when the comment has no usable text.
    """
    text = row["text"]
    # A missing value is a float NaN, which is truthy and not indexable, so
    # check the type explicitly instead of relying on truthiness alone.
    if not isinstance(text, str) or not text:
        return ""
    if text[-1] not in string.punctuation:
        # Work on the local copy so the caller's row is left untouched.
        text += "."

    if row["commentindex"] == 0:
        return text

    if comments_df is None:
        comments_df = yahoo_df
    parent_threads = comments_df.loc[
        comments_df["commentid"] == row["parentid"], "thread"
    ]
    if parent_threads.empty:
        return text

    parent_thread = parent_threads.iloc[0]
    # Do not emit a leading space when the parent contributed no text.
    if not isinstance(parent_thread, str) or not parent_thread:
        return text
    return f"{parent_thread} {text}"


# Normalise the raw comment text. The order of the steps matters:
#   1. missing text becomes "" so the string steps below never see NaN;
#   2. contractions are expanded while the apostrophes are still present
#      (stripping punctuation first turns e.g. "we're" into "were", which can
#      no longer be recognised as a contraction);
#   3. lower-case, drop punctuation, collapse whitespace;
#   4. spell out the numbers that remain as standalone tokens.
yahoo_df["text"] = (
    yahoo_df["text"]
    .fillna("")
    .apply(lambda x: contractions.fix(x, slang=False))
    .str.lower()
    .str.replace(r"[^\w\s]+", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
    .apply(numbers_to_words)
)

# Process comments in ascending commentindex so that a parent's thread has
# been written before any reply looks it up inside get_comment_thread.
yahoo_df = yahoo_df.sort_values(by=["commentindex"])
yahoo_df["thread"] = ""
for index, row in yahoo_df.iterrows():
    yahoo_df.at[index, "thread"] = get_comment_thread(row)


In [None]:
# Summarisation settings shared by the cells below.
LANGUAGE = "english"
NUM_SENTENCES = 3  # number of sentences requested from the summarizer

# Language-specific helpers from sumy.
stemmer = Stemmer(LANGUAGE)
tokenizer = Tokenizer(LANGUAGE)

# Luhn summarizer configured with the stemmer and the language's stop words.
summarizer = LuhnSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)


def summarize_thread(row: pd.Series) -> str:
    """Summarise a comment thread with the module-level ``summarizer``.

    Args:
        row: A row providing the thread text under ``"thread"``.

    Returns:
        The sentences selected by the summarizer (``NUM_SENTENCES`` are
        requested), joined with single spaces, or ``""`` when the thread is
        empty.
    """
    thread = row["thread"]
    if not thread:
        return ""
    parser = PlaintextParser.from_string(thread, tokenizer)
    selected = summarizer(parser.document, NUM_SENTENCES)
    # Join with a space so consecutive sentences do not run together
    # ("a.b."), and use str() rather than the private ``_text`` attribute.
    return " ".join(str(sentence) for sentence in selected)


# Attach a per-row summary of each comment's thread.
yahoo_df = yahoo_df.assign(summary=yahoo_df.apply(summarize_thread, axis=1))


In [None]:
def _read_split_ids(path: str) -> list:
    """Read one integer id per line from ``path``.

    Blank lines are skipped so that a stray empty line does not make
    ``int()`` raise.
    """
    with open(path) as f:
        return [int(line) for line in f if line.strip()]


# The three id files differ only in the split name.
_SPLIT_IDS_PATH = (
    "data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_{split}-ids.txt"
)

train_ids = _read_split_ids(_SPLIT_IDS_PATH.format(split="train"))
dev_ids = _read_split_ids(_SPLIT_IDS_PATH.format(split="dev"))
test_ids = _read_split_ids(_SPLIT_IDS_PATH.format(split="test"))

# Partition the comments by matching their ``sdid`` against each id list.
train_df = yahoo_df[yahoo_df["sdid"].isin(train_ids)]
dev_df = yahoo_df[yahoo_df["sdid"].isin(dev_ids)]
test_df = yahoo_df[yahoo_df["sdid"].isin(test_ids)]


In [None]:
# Flag every comment with the split(s) its ``sdid`` belongs to; assign()
# returns a new frame, so yahoo_df itself is left unchanged.
temp_df = yahoo_df.assign(
    train=yahoo_df["sdid"].isin(train_ids),
    dev=yahoo_df["sdid"].isin(dev_ids),
    test=yahoo_df["sdid"].isin(test_ids),
)

# Bucket comments by their position in the thread: [0, 3), [3, 6), ... [15, 18).
# NOTE(review): a commentindex of 18 or more falls outside every bin and is
# therefore left out of the table — confirm that this is intended.
commentindex_bins = pd.cut(
    temp_df.commentindex,
    [0, 3, 6, 9, 12, 15, 18],
    include_lowest=True,
    right=False,
)

# observed=False is spelled out so that empty sentiment/bin combinations stay
# in the table regardless of the pandas version's default for categoricals.
split_counts = temp_df.groupby(["sentiment", commentindex_bins], observed=False)[
    ["train", "dev", "test"]
].sum()

with pd.option_context(
    "display.max_rows",
    None,
):
    print(split_counts)


In [None]:
import gensim.downloader as api

# Name of the pretrained embedding set fetched through gensim's downloader.
W2V_MODEL_NAME = "word2vec-google-news-300"

model = api.load(W2V_MODEL_NAME)

In [None]:
def get_average_w2v_vector(row):
    """Return the averaged word2vec embedding of ``row["text"]``.

    The text is split on whitespace. The vectors of the tokens found in the
    module-level ``model`` are summed and divided by the total number of
    tokens, so out-of-vocabulary tokens count as zero vectors.

    Args:
        row: A row (or mapping) providing the comment text under ``"text"``.

    Returns:
        A 1-D array of length ``model.vector_size``. It is all zeros when the
        text is missing, empty, or contains no in-vocabulary token, so every
        row yields the same shape.
    """
    text = row["text"]
    words = text.split() if isinstance(text, str) else []
    vectors = [model[w] for w in words if w in model]
    if not vectors:
        # np.sum([]) would be a scalar 0.0 here, which breaks the fixed-width
        # expansion performed by DataFrame.apply(..., result_type="expand").
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.sum(vectors, axis=0) / len(words)

# Feature matrices (one averaged embedding per comment) and sentiment labels
# for each split.
train_X = train_df.apply(get_average_w2v_vector, axis=1, result_type="expand")
dev_X = dev_df.apply(get_average_w2v_vector, axis=1, result_type="expand")
test_X = test_df.apply(get_average_w2v_vector, axis=1, result_type="expand")

train_y = train_df["sentiment"]
dev_y = dev_df["sentiment"]
test_y = test_df["sentiment"]

# Report the feature/label shapes of each split.
for features, labels in ((train_X, train_y), (dev_X, dev_y), (test_X, test_y)):
    print(features.shape, labels.shape)