In [9]:
# Copies the user-timeline JSON dump from a shared NFS volume into the local dataset/ directory.
# NOTE(review): this cell carries execution count 9 while the cells below are numbered 1-7,
# so it was run out of order; none of the visible cells read dataset/user_timeline.json.
# NOTE(review): the source is a hard-coded absolute path - confirm it exists on the target machine.
!cp /volume/fake-news-volume-nfs/tim/news_research/output/user_timeline.json dataset/

In [1]:
import sys
import re
import json
import spacy
import argparse
import pandas as pd
from tqdm import tqdm

from analysis_method import TextRankSummarizer
from util import write_json, read_df, read_stopwords, sentence_to_feature, build_word_dict

## Prepare the dataset

In [2]:
# Input CSV, loaded with read_df; the notebook later groups its rows by a "group_id" column.
data_path = "../topic-clustering/outputs/tweets_with_group_ids.csv"
# Destination of the JSON file produced by write_json at the end of the notebook.
output_path = "outputs/result.json"
# Stopword list handed to read_stopwords.
stopword_path = "dict/stopwords_en.txt"
# Name of the column that holds the tweet text to be split into sentences.
text_column = "content"
# Name of the column that holds the tweet identifier (read inside build_features).
tweet_id_column = "tweet_id"

In [3]:
# spaCy pipeline; the next cell uses it to split tweets into sentences.
nlp = spacy.load("en_core_web_sm")
# Project helpers imported from util; the cell output shows one "read ... from <path>" line per call.
df = read_df(data_path)
stopwords = read_stopwords(stopword_path)

read data frame from ../topic-clustering/outputs/tweets_with_group_ids.csv
read stopwords from dict/stopwords_en.txt


In [4]:
# Split every tweet into sentences and store them as a list of strings per row.
# - The column is looked up through `text_column` so the configuration cell stays
#   the single place where the text column name is defined.
# - Missing values become "" so such rows yield an empty sentence list (which
#   build_features drops) instead of making spaCy raise on a non-string input.
# - `nlp.pipe` processes the texts as a stream, which spaCy recommends over
#   calling `nlp(text)` once per row.
df['sentences'] = [
    [sent.text for sent in doc.sents]
    for doc in nlp.pipe(df[text_column].fillna("").astype(str))
]

## Extract summary for each group

In [5]:
# convert sentence to features
def build_features(df, min_feature_len=2):
    """Convert sentence-split tweets into per-tweet lists of sentence features.

    Relies on the notebook-level names `tweet_id_column`, `stopwords`,
    `sentence_to_feature` and `tqdm`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sentences' column (one list of sentence strings per
        row) and the column named by `tweet_id_column`.
    min_feature_len : int, optional
        A sentence is kept only when `len(feature)` is strictly greater than
        this value. The default of 2 matches the previously hard-coded
        threshold.

    Returns
    -------
    (final_articles, final_tweet_ids) : tuple of two lists of equal length
        `final_articles[i]` is a list of `{'text': ..., 'feature': ...}` dicts
        for the tweet whose id is `final_tweet_ids[i]`. Tweets with no
        sentences, or with no sentence passing the length filter, are dropped.
    """
    # Pass 1: drop tweets that have no sentences at all.
    # Reading the two columns directly avoids building a Series per row, as
    # iterrows() does; tolist() hands back plain Python values.
    articles, tweet_ids = [], []
    rows = zip(df['sentences'].tolist(), df[tweet_id_column].tolist())
    for sentences, tweet_id in tqdm(rows, desc='filter sentences', total=len(df)):
        if len(sentences) != 0:
            articles.append(sentences)
            tweet_ids.append(tweet_id)

    # Pass 2: featurise each sentence and keep only the informative ones.
    final_articles, final_tweet_ids = [], []
    for i, article in enumerate(tqdm(articles, desc='convert sentence to feature')):
        kept = []
        for sent in article:
            feature = sentence_to_feature(sent, stopwords)
            if len(feature) > min_feature_len:
                kept.append({'text': sent, 'feature': feature})

        # A tweet survives only if at least one of its sentences was kept.
        if kept:
            final_articles.append(kept)
            final_tweet_ids.append(tweet_ids[i])
    return final_articles, final_tweet_ids

def summary_extraction(final_articles, final_tweet_ids, max_len, topk):
    """Rank sentences with TextRankSummarizer and attach the tweets containing them.

    Parameters
    ----------
    final_articles : list
        Output of build_features: one list of `{'text', 'feature'}` dicts per tweet.
    final_tweet_ids : list
        Tweet ids aligned index-by-index with `final_articles`.
    max_len :
        Forwarded to `TextRankSummarizer(max_len=...)`.
    topk :
        Forwarded to the summarizer call as `topk=...`.

    Returns
    -------
    list of dict
        One entry per item returned by the summarizer, with keys
        'sentence' (item[0]), 'scores' (item[1]) and 'tweet_ids'.
        'tweet_ids' is always present; it is an empty list when no tweet
        contains the sentence, so every entry has the same keys.
    """
    extractor = TextRankSummarizer(max_len=max_len, allow_selfloop=False)
    ranked = extractor(final_articles, topk=topk)
    # Create 'tweet_ids' up front rather than lazily on the first match, so
    # unmatched entries do not end up with a different schema.
    result = [{'sentence': r[0],
               'scores': r[1],
               'tweet_ids': []
               } for r in ranked]

    # find tweet that contain summary sentences
    for i, article in enumerate(tqdm(final_articles, desc='find tweet contain summary sentence')):
        # Hoisted: the texts of one article are the same for every summary entry.
        texts = [sentence['text'] for sentence in article]
        for r in result:
            s = r['sentence']
            # `in` is a containment test against each sentence text; a tweet is
            # recorded at most once per summary entry thanks to any().
            if any(s in text for text in texts):
                r['tweet_ids'].append(final_tweet_ids[i])
    return result

In [6]:
# Summarisation settings.
topk = 20                   # forwarded to summary_extraction as `topk`
max_len = 20                # forwarded to summary_extraction as `max_len`
group_size_min_bound = 50   # a group needs strictly more rows than this to be summarised

# Maps each sufficiently large group_id to its list of summary entries.
results = {}
for group_id, subset in df.groupby("group_id"):
    # Skip groups that do not exceed the size bound.
    if len(subset) <= group_size_min_bound:
        continue
    final_articles, final_tweet_ids = build_features(subset)
    results[group_id] = summary_extraction(final_articles, final_tweet_ids, max_len, topk)

filter sentences: 100%|██████████| 290/290 [00:00<00:00, 7626.53it/s]
convert sentence to feature: 100%|██████████| 290/290 [00:00<00:00, 95407.34it/s]
add in article weight: 100%|██████████| 290/290 [00:00<00:00, 1330796.67it/s]
100%|██████████| 42195/42195 [00:00<00:00, 183981.55it/s]
find tweet contain summary sentence: 100%|██████████| 290/290 [00:00<00:00, 53287.84it/s]
filter sentences: 100%|██████████| 66/66 [00:00<00:00, 9040.92it/s]
convert sentence to feature: 100%|██████████| 66/66 [00:00<00:00, 92490.50it/s]
add in article weight: 100%|██████████| 60/60 [00:00<00:00, 425098.38it/s]
100%|██████████| 2080/2080 [00:00<00:00, 388275.06it/s]
find tweet contain summary sentence: 100%|██████████| 60/60 [00:00<00:00, 49171.21it/s]
filter sentences: 100%|██████████| 61/61 [00:00<00:00, 6488.61it/s]
convert sentence to feature: 100%|██████████| 61/61 [00:00<00:00, 97802.96it/s]
add in article weight: 100%|██████████| 55/55 [00:00<00:00, 247783.80it/s]
100%|██████████| 1830/1830 [00:0

## Output the result

In [7]:
# Persist the per-group summaries (group_id -> summary entries) to output_path.
write_json(results, output_path)

write data to outputs/result.json
