# INFERENCE
- load news
- select important ones (via model)
- evaluate news - positive vs negative

### constants and imports

In [1]:
# check env - google colab or local
import sys

def check_environment():
    if 'google.colab' in sys.modules:
        # Running in Google Colab
        return "Google Colab"

    if hasattr(sys, 'prefix'):
        if sys.prefix.startswith('/usr/local'):
            # Running in a Colab-like environment (could be Colab or another cloud environment)
            return "Colab-like environment"
        else:
            # Running in a local environment
            return "Local environment"

    # Default to local environment if checks are inconclusive
    return "Local environment"

In [2]:
# set paths and install packages depending on the environment

if check_environment() == 'Google Colab':
    !pip install --quiet sentence_transformers transformers sumy nltk gnews newspaper3k

    from google.colab import drive
    drive.mount('/content/drive')

    path = '/content/drive/MyDrive/Colab Notebooks/other/econ_news(Kulbaka)/'
    data_path = path+'data/'
    model_path = path+'news_models/'

else:
    data_path = '../data/'
    model_path = '../news_models/'

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m41.9 MB/s[0m eta 

In [20]:
from gnews import GNews
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import nltk
from sumy.summarizers.lsa import LsaSummarizer

import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm_notebook

# Load news (incl full texts)

In [4]:
# download news day by day (due to 100 limit) via combining dictionaries
start_date = '2023-10-10'
end_date = '2023-10-20'

rus_econ_news = []
for date in tqdm_notebook(pd.date_range(start_date, end_date)):
    google_news = GNews(language='ru', country='RU', start_date=date - pd.Timedelta(days=1), end_date=date)
    rus_econ_news += google_news.get_news('Экономика России')

100%|██████████| 11/11 [02:22<00:00, 12.95s/it]


In [5]:
# get full articles following the links
for news in tqdm_notebook(rus_econ_news):
    try:
        # get the full article
        news['full_article'] = google_news.get_full_article(news['url']).text
    except:
        news['full_article'] = ''

  0%|          | 1/1031 [00:02<45:13,  2.63s/it]ERROR:gnews.gnews:Article `download()` failed with 403 Client Error: Forbidden for url: https://www.unian.net/economics/finance/ekonomika-rf-okazalas-gorazdo-bolee-ustoychivoy-k-sankciyam-chem-dumali-na-zapade-nyt-12419931.html on URL https://news.google.com/rss/articles/CBMiiwFodHRwczovL3d3dy51bmlhbi5uZXQvZWNvbm9taWNzL2ZpbmFuY2UvZWtvbm9taWthLXJmLW9rYXphbGFzLWdvcmF6ZG8tYm9sZWUtdXN0b3ljaGl2b3ktay1zYW5rY2l5YW0tY2hlbS1kdW1hbGktbmEtemFwYWRlLW55dC0xMjQxOTkzMS5odG1s0gGfAWh0dHBzOi8vd3d3LnVuaWFuLm5ldC9lY29ub21pY3MvZmluYW5jZS9la29ub21pa2EtcmYtb2themFsYXMtZ29yYXpkby1ib2xlZS11c3RveWNoaXZveS1rLXNhbmtjaXlhbS1jaGVtLWR1bWFsaS1uYS16YXBhZGUtbnl0LW5vdm9zdGktdWtyYWluYS1hbXAtMTI0MTk5MzEuaHRtbA?oc=5&hl=en-US&gl=US&ceid=US:en
  2%|▏         | 22/1031 [00:33<28:39,  1.70s/it]ERROR:gnews.gnews:Article `download()` failed with 403 Client Error: Forbidden for url: https://news.ru/press-relizy/shkolnikam-rasskazali-kak-kruto-i-modno-byt-professionalom/ on URL https

In [6]:
# summarize the news (select 2 most important sentences)
summarizer_2 = LsaSummarizer()
nltk.download('punkt')

def summarize(text, language="russian", sentences_count=2):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return ' '.join([str(sentence) for sentence in summary])

# iterate over news
for news in tqdm_notebook(rus_econ_news):
    try:
        news['summary'] = summarize(news['full_article'], language="russian", sentences_count=2)
    except:
        news['summary'] = ''

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
100%|██████████| 1031/1031 [01:04<00:00, 15.95it/s]


In [7]:
last_news = pd.DataFrame(rus_econ_news)
# replace missing articles with descriptions
last_news['full_article'].fillna(last_news['description'], inplace=True)
last_news['summary'].fillna(last_news['description'], inplace=True)

In [8]:
# convert to date
last_news['date'] = pd.to_datetime(last_news['published date'])
# calculate week number
last_news['week'] = last_news['date'].dt.isocalendar().week
# check the date range
last_news['date'].min(), last_news['date'].max()

(Timestamp('2023-10-09 07:00:00+0000', tz='UTC'),
 Timestamp('2023-10-19 18:32:00+0000', tz='UTC'))

In [10]:
last_news.to_pickle(data_path+f'last_news_{start_date}-{end_date}.pkl')

## Predict importance
- embeddings
- predict importance

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-large')

Downloading (…)9f719/.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Downloading (…)316e29f719/README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

Downloading (…)6e29f719/config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)719/onnx/config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/546k [00:00<?, ?B/s]

Downloading model.onnx_data:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)e29f719/modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

In [12]:
# embed the news
texts = last_news['summary'].to_list()
embeddings = model.encode(texts, show_progress_bar=True)

last_news["emb_e5"] = [row.reshape(-1) for row in embeddings]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

In [14]:
# predict the importance
clf = pickle.load(open(model_path+'news_importance_model.pkl', 'rb'))
last_news['prob_import'] = clf.predict_proba(embeddings)[:,1]
# mark the news with prob > 0.38 as important
last_news['important'] = np.where(last_news['prob_import'] > 0.38, 1, 0)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
# print the number of important news of the total
print(f"Number of important news: {last_news.important.sum()} of total {last_news.important.count()}")

Number of important news: 58 of total 1031


In [16]:
last_news.to_pickle(data_path+f'last_news_{start_date}-{end_date}.pkl')

# Evaluate news (pos-neg)

In [17]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



In [18]:
last_imp_news = last_news[last_news.important==1]

In [21]:
# NLI fr imprt news. Candidate labels can be adjusted
bad_news = ['плохо для российской экономики', 'российская экономика ухудшается', 'негативная новость для экономики России']
good_news = ['хорошо для российской экономики', 'российская экономика  улучшается', 'позитивная новость для экономики России']
candidate_labels = bad_news + good_news
n_labels = len(candidate_labels)

for id, row in tqdm_notebook(last_imp_news.iterrows(), total = last_news[last_news.important==1].shape[0]):
    output = classifier(row['summary'], candidate_labels, multi_label=True)
    for i_label in range(n_labels):
      last_imp_news.at[id, output['labels'][i_label]] = output['scores'][i_label]


last_imp_news['bad_news'] = last_imp_news[bad_news].mean(axis=1)
last_imp_news['good_news'] = last_imp_news[good_news].mean(axis=1)

  0%|          | 0/58 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_imp_news.at[id, output['labels'][i_label]] = output['scores'][i_label]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_imp_news.at[id, output['labels'][i_label]] = output['scores'][i_label]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_imp_news.at[id, output['labels'][i_label]] = out

In [23]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

class_labels = ['bad_news', 'good_news']
for label in class_labels:
    last_imp_news[['bad_news_prob', 'good_news_prob']] = last_imp_news.apply(lambda row: softmax(row[class_labels].values.tolist()), axis=1, result_type='expand')

In [24]:
last_imp_news['направление_pred'] = (last_imp_news['good_news_prob']-0.5)*10

In [34]:
# combine all news - non-important and important with pos-neg predictions
last_news = pd.concat([last_news[last_news.important==0], last_imp_news])
last_news.sort_values('date', inplace=True)

# save the news
cols = ['published date', 'week', 'title', 'url', 'publisher', 'full_article', 'summary', 'prob_import', 'important', 'bad_news_prob', 'good_news_prob', 'направление_pred']
last_news.to_pickle(data_path+f'last_news_{start_date}-{end_date}.pkl')
last_news[cols].to_excel(data_path+f'last_news_{start_date}-{end_date}.xlsx', index=False)

In [55]:
last_news

Unnamed: 0,title,description,published date,url,publisher,full_article,summary,date,week,emb_e5,...,плохо для российской экономики,негативная новость для экономики России,российская экономика улучшается,хорошо для российской экономики,позитивная новость для экономики России,bad_news,good_news,bad_news_prob,good_news_prob,направление_pred
55,За последние 20 лет в России снизилось неравен...,За последние 20 лет в России снизилось неравен...,"Mon, 09 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,"{'href': 'https://rns.online', 'title': 'rns.o...",Разрыв зарплат в России снижается на протяжени...,Самый существенный рост зарплат отмечены у ква...,2023-10-09 07:00:00+00:00,41,"[-0.00011324474, 0.024859162, -0.020225069, -0...",...,,,,,,,,,,
22,"Школьникам рассказали, как «круто и модно» быт...","Школьникам рассказали, как «круто и модно» быт...","Mon, 09 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiWGh0d...,"{'href': 'https://news.ru', 'title': 'NEWS.ru'}",,,2023-10-09 07:00:00+00:00,41,"[0.008210542, -0.015223292, -0.027736258, -0.0...",...,,,,,,,,,,
38,Губернатор Владимир Мазур предложил томским фи...,Губернатор Владимир Мазур предложил томским фи...,"Mon, 09 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMilgFod...,"{'href': 'https://www.tvtomsk.ru', 'title': 't...",Фото: пресс-служба администрации Томской облас...,"Напомним, всероссийский конкурс управленцев но...",2023-10-09 07:00:00+00:00,41,"[2.917545e-05, -0.033072747, -0.012222579, -0....",...,,,,,,,,,,
46,План-капкан: чем больше развивают Дальний Вост...,План-капкан: чем больше развивают Дальний Вост...,"Mon, 09 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiLWh0d...,"{'href': 'https://76.ru', 'title': 'Новости Яр...","Что делать?\n\nЛюди перестанут уезжать, когда ...",Но для этого необходимо изменить саму структур...,2023-10-09 07:00:00+00:00,41,"[0.015978437, -0.038692374, -0.01651187, -0.03...",...,,,,,,,,,,
50,ВТБ: доля китайских брендов в выдачах автокред...,ВТБ: доля китайских брендов в выдачах автокред...,"Mon, 09 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiYGh0d...,"{'href': 'https://newbur.ru', 'title': 'https:...",Россияне все активней пересаживаются на машины...,Продажи таких машин уже опережают не только по...,2023-10-09 07:00:00+00:00,41,"[0.009121809, -0.03703046, -0.0065047983, -0.0...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,Несырьевой экспорт из России проседает / Эконо...,Несырьевой экспорт из России проседает / Эконо...,"Thu, 19 Oct 2023 17:22:09 GMT",https://news.google.com/rss/articles/CBMiOWh0d...,"{'href': 'https://www.ng.ru', 'title': 'Незави...",Чиновники готовят новую стратегию внешней торг...,«Сейчас это наш крупнейший внешнеэкономический...,2023-10-19 17:22:09+00:00,42,"[0.0055478225, -0.002057011, -0.0048143077, -0...",...,,,,,,,,,,
1023,Минсельхоз и ключевые производители мяса птицы...,Минсельхоз и ключевые производители мяса птицы...,"Thu, 19 Oct 2023 17:34:38 GMT",https://news.google.com/rss/articles/CBMifWh0d...,"{'href': 'https://fomag.ru', 'title': 'Новостн...",Минсельхоз и ключевые производители мяса птицы...,Минсельхоз России совместно с производителями ...,2023-10-19 17:34:38+00:00,42,"[0.022367598, -0.045439843, -0.013758693, -0.0...",...,,,,,,,,,,
970,Госдеп США предупредил американцев о возросшей...,Госдеп США предупредил американцев о возросшей...,"Thu, 19 Oct 2023 17:41:27 GMT",https://news.google.com/rss/articles/CBMiGWh0d...,"{'href': 'https://life.ru', 'title': 'Life.ru'}",Государственный департамент США выступил с пре...,Государственный департамент США выступил с пре...,2023-10-19 17:41:27+00:00,42,"[-0.009018311, -0.004399806, -0.017311823, -0....",...,,,,,,,,,,
943,Китай становится основным покупателем российск...,Китай становится основным покупателем российск...,"Thu, 19 Oct 2023 17:52:48 GMT",https://news.google.com/rss/articles/CBMiNmh0d...,"{'href': 'https://www.ng.ru', 'title': 'Незави...",Заместить потерянные объемы европейского рынка...,Надежды на скорое замещение Китаем потерянной ...,2023-10-19 17:52:48+00:00,42,"[0.0007288877, 0.027896825, -0.03777971, -0.03...",...,,,,,,,,,,


In [56]:
cols = ['published date', 'week', 'title', 'url', 'publisher', 'full_article', 'summary', 'prob_import', 'important', 'bad_news_prob', 'good_news_prob', 'направление_pred']
last_news[cols].to_excel(data_path+f'last_news_{start_date}-{end_date}.xlsx', index=False)