In [1]:
import newspaper
from newspaper import Config, Article, Source
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import pandas as pd
import numpy as np
import torch
import torchmetrics
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import gradio as gr
import random

In [2]:
url = 'http://cn.chinadaily.com.cn/'

In [3]:
#use newspaper library to scrape foreign news websites

In [4]:
 def get_category(link,lang):
        foreign_paper = newspaper.build(link, language = lang)
        for category in foreign_paper.category_urls():
            print(category)

In [5]:
get_category(url,'zh')

http://cn.chinadaily.com.cn/
http://gx.chinadaily.com.cn
http://cartoon.chinadaily.com.cn
http://column.chinadaily.com.cn
https://cn.chinadaily.com.cn
https://language.chinadaily.com.cn
http://language.chinadaily.com.cn
http://cn.chinadaily.com.cn
http://kan.chinadaily.com.cn
http://fang.chinadaily.com.cn
http://caijing.chinadaily.com.cn
http://fj.chinadaily.com.cn
http://tech.chinadaily.com.cn
http://world.chinadaily.com.cn
http://pic.chinadaily.com.cn
http://fashion.chinadaily.com.cn
http://www.chinadaily.com.cn
http://cnews.chinadaily.com.cn
http://china.chinadaily.com.cn
https://www.chinadaily.com.cn
http://qiye.chinadaily.com.cn
http://tw.chinadaily.com.cn


In [6]:
def get_text_title(link, lang):
    foreign_paper = newspaper.build(link, language = lang, fetch_images=False, memoize_articles=False)
    article = foreign_paper.articles[random.randint(1,10)]
    article.download()
    article.parse()
    return article.text, article.title

In [7]:
article_text, article_title = get_text_title(url, 'zh')

Building prefix dict from C:\Users\Abdul\anaconda3\lib\site-packages\jieba\dict.txt ...
Loading model from cache C:\Users\Abdul\AppData\Local\Temp\jieba.cache
Loading model cost 0.373002290725708 seconds.
Prefix dict has been built succesfully.


In [8]:
print(article_text)

【习近平关心辽宁汛情，要求确保人民群众生命安全】

16日下午，习近平总书记在辽宁锦州市考察小凌河和女儿河环境综合整治情况时，听取了辽宁今年防汛救灾情况汇报，要求当地各级党委和政府抓细抓实各项防汛救灾措施，妥善安置受灾群众，确保人民群众生命安全，做好灾后恢复重建规划，帮助受灾群众尽早恢复正常生产生活。

【习近平：我们对东北振兴充满信心】

16日下午，在辽宁锦州市东湖森林公园调研时，习近平总书记同正在休闲娱乐的市民亲切交流。习近平指出，中国式现代化是全体人民共同富裕的现代化，不只是少数人富裕，而是要全体人民共同富裕、皆大欢喜。党的十八大以来，党中央实施深入推进东北振兴战略，要继续搞好，加快产业结构调整，适应新时代改革发展要求，我们对东北振兴充满信心。

（文字：人民日报记者 杜尚泽；图片：新华社记者 姚大伟 燕雁 摄）

【责任编辑：舒靓】


In [9]:
print(article_title)

习近平：我们对东北振兴充满信心


In [None]:
#for translation using the Helsinki NLP pretrained model from Hugging Face trained on the OPUS dataset

In [10]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

In [11]:
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

In [12]:
tokenized_text = tokenizer.prepare_seq2seq_batch([article_text], return_tensors='pt')

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [13]:
translation = model.generate(**tokenized_text)
translated_text = tokenizer.batch_decode(translation, skip_special_tokens=True)[0]

In [14]:
print(translated_text)

On the afternoon of 16 p.m. Xi Jinping, the Secretary-General of Xi Jinping, when he visited the city of Liaoningjin State for the integrated management of the environment of the River Ling and daughter river, heard a report on Liaoning’s disaster response this year, asking the local party committees and the government to take care of all measures to prevent and respond to the disaster, to ensure the safety of the people, to prepare for the recovery and reconstruction of the disaster, and to help the affected population to return to normal productive life as soon as possible. 16 p.m. Xi Jinping, during research on the East Lake Forest Park in Liaoning Jinzhou, Xi Jinjinzhou, the General Secretary of Xi Jinping shared confidence in the Northeast revitalization.


In [15]:
#for summarization using Google Pegasus from HuggingFace

In [16]:
tokenizer_sum = AutoTokenizer.from_pretrained("google/pegasus-xsum")

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")

In [18]:
tran_tokenized_text = tokenizer_sum.prepare_seq2seq_batch([translated_text], return_tensors='pt')

In [19]:
summarization = model.generate(**tran_tokenized_text)
summarized_text = tokenizer_sum.batch_decode(summarization, skip_special_tokens=True)[0]

In [20]:
print(summarized_text)

Xi Jinping, the Secretary-General of Xi Jinping, when he visited Liaoning State for the integrated management of the environment of the environment, heard a disaster report on Liaoning’s disaster response this year, asking the local party committees of the government to take care of all measures prevent and respond to the disaster, to
