In [1]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

nltk.download("punkt", quiet=True)

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

# Configure how much of a DataFrame pandas renders: at most 50 rows and
# 50 columns, and up to 1000 characters per cell before truncating.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 1000


In [2]:
# Load the expert-annotation TSV into a DataFrame; the file is tab-delimited.
yahoo_df = pd.read_csv(
    filepath_or_buffer="data/yahoo-news-annotated-comments-dataset/ydata-ynacc-v1_0_expert_annotations.tsv",
    delimiter="\t",
)


In [3]:
def get_comment_thread(row: pd.Series) -> str:
    if row["text"][-1] not in string.punctuation:
        row["text"] += "."

    if row["commentindex"] != 0:
        parent_df = yahoo_df[yahoo_df.commentid == row["parentid"]]
        if parent_df.shape[0] == 0:
            return row["text"]
        else:
            return f"{parent_df.iloc[0].thread} {row['text']}"
    else:
        return row["text"]


# Order comments by their position in the conversation so that a parent's
# thread is filled in before any reply looks it up, and start every thread
# out as an empty string.
yahoo_df = yahoo_df.sort_values(by="commentindex").assign(thread="")

# Fill the threads one row at a time: each call reads the threads already
# written for earlier rows, so this cannot be a single vectorized apply.
for row_label, comment in yahoo_df.iterrows():
    yahoo_df.at[row_label, "thread"] = get_comment_thread(comment)


In [6]:
# Settings for the summarization step: text language and the number of
# sentences requested from the summarizer.
LANGUAGE = "english"
NUM_SENTENCES = 3

# Luhn summarizer configured with a stemmer and the stop-word list for LANGUAGE.
stemmer = Stemmer(LANGUAGE)
summarizer = LuhnSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

# Tokenizer handed to the plaintext parser when a thread is parsed.
tokenizer = Tokenizer(LANGUAGE)


def summarize_thread(row: pd.Series) -> str:
    """Summarize the comment thread stored in ``row["thread"]``.

    The thread is parsed with the module-level ``tokenizer`` and passed to the
    module-level ``summarizer``, which is asked for ``NUM_SENTENCES``
    sentences.

    Args:
        row: A row with a ``thread`` entry holding the text to summarize.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when the thread is empty, whitespace-only, or not a string.
    """
    thread = row["thread"]
    # A non-string value (e.g. NaN) is truthy, so test the type explicitly
    # rather than relying on ``not thread``.
    if not isinstance(thread, str) or not thread.strip():
        return ""

    parser = PlaintextParser.from_string(thread, tokenizer)
    selected = summarizer(parser.document, NUM_SENTENCES)
    # Use str() instead of the private ``_text`` attribute, and separate the
    # sentences with a space so they do not run together ("One.Two.").
    return " ".join(str(sentence) for sentence in selected)


# Run summarize_thread on every row and keep the result in a "summary" column.
yahoo_df["summary"] = yahoo_df.apply(func=summarize_thread, axis="columns")
