# INFERENCE
- load news
- select important ones (via model)
- evaluate news - positive vs negative

### constants and imports

In [1]:
# check env - google colab or local
import sys

def check_environment():
    if 'google.colab' in sys.modules:
        # Running in Google Colab
        return "Google Colab"

    if hasattr(sys, 'prefix'):
        if sys.prefix.startswith('/usr/local'):
            # Running in a Colab-like environment (could be Colab or another cloud environment)
            return "Colab-like environment"
        else:
            # Running in a local environment
            return "Local environment"

    # Default to local environment if checks are inconclusive
    return "Local environment"

In [2]:
# set paths and install packages depending on the environment

if check_environment() == 'Google Colab':
    !pip install --quiet sentence_transformers transformers sumy nltk gnews newspaper3k

    from google.colab import drive
    drive.mount('/content/drive')

    path = '/content/drive/MyDrive/Colab Notebooks/other/econ_news(Kulbaka)/'

else:
    path = ''

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from gnews import GNews
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import nltk
from sumy.summarizers.lsa import LsaSummarizer

import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm



# Load news (incl full texts)

In [None]:
# set today's date
# date = pd.to_datetime('today').date()

# google_news = GNews(language='ru', country='RU', start_date=date - pd.Timedelta(days=1), end_date=date)
# today_news = google_news.get_news('Экономика России')

In [4]:
# download news day by day (due to 100 limit) via combining dictionaries
start_date = '2023-10-03'
end_date = '2023-10-10'

rus_econ_news = []
for date in tqdm(pd.date_range(start_date, end_date)):
    google_news = GNews(language='ru', country='RU', start_date=date - pd.Timedelta(days=1), end_date=date)
    rus_econ_news += google_news.get_news('Экономика России')

100%|██████████| 8/8 [01:49<00:00, 13.68s/it]


In [5]:
# get full articles following the links
for news in tqdm(rus_econ_news):
    try:
        # get the full article
        news['full_article'] = google_news.get_full_article(news['url']).text
    except:
        news['full_article'] = ''

  1%|          | 5/717 [00:11<28:13,  2.38s/it]ERROR:gnews.gnews:Article `download()` failed with 403 Client Error: Forbidden for url: https://news.ru/dengi/ekonomist-predrek-dollar-vyshe-100-v-konce-oktyabrya/ on URL https://news.google.com/rss/articles/CBMiS2h0dHBzOi8vbmV3cy5ydS9kZW5naS9la29ub21pc3QtcHJlZHJlay1kb2xsYXItdnlzaGUtMTAwLXYta29uY2Utb2t0eWFicnlhL9IBT2h0dHBzOi8vbmV3cy5ydS9hbXAvZGVuZ2kvZWtvbm9taXN0LXByZWRyZWstZG9sbGFyLXZ5c2hlLTEwMC12LWtvbmNlLW9rdHlhYnJ5YS8?oc=5&hl=en-SG&gl=SG&ceid=SG:en
  4%|▍         | 28/717 [01:01<31:41,  2.76s/it]ERROR:gnews.gnews:Article `download()` failed with HTTPSConnectionPool(host='www.mos.ru', port=443): Max retries exceeded with url: /news/item/130359073/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x78f5736d9d20>, 'Connection to www.mos.ru timed out. (connect timeout=7)')) on URL https://news.google.com/rss/articles/CBMiJ2h0dHBzOi8vd3d3Lm1vcy5ydS9uZXdzL2l0ZW0vMTMwMzU5MDczL9IBAA?oc=5&hl=en-SG&gl=SG&ceid=SG:en
  4%

In [6]:
with open(path+'rus_econ_news.pkl', 'wb') as f:
    pickle.dump(rus_econ_news, f)

In [7]:
# summarize the news (select 2 most important sentences)
summarizer_2 = LsaSummarizer()
nltk.download('punkt')

def summarize(text, language="english", sentences_count=5):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return ' '.join([str(sentence) for sentence in summary])

# iterate over news
for news in tqdm(rus_econ_news):
    try:
        news['summary'] = summarize(news['full_article'], language="russian", sentences_count=2)
    except:
        news['summary'] = ''

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 717/717 [00:54<00:00, 13.25it/s]


In [8]:
last_news = pd.DataFrame(rus_econ_news)
# replace missing articles with descriptions
last_news['full_article'].fillna(last_news['description'], inplace=True)
last_news['summary'].fillna(last_news['description'], inplace=True)

In [9]:
last_news.to_csv(path+'last_news.csv')

## Select important news
- embeddings
- predict importance

In [10]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-large')

In [11]:
# embed the news
texts = last_news['summary'].to_list()
embeddings = model.encode(texts, show_progress_bar=True)

last_news["emb_e5"] = [row.reshape(-1) for row in embeddings]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

In [13]:
# predict the importance
clf = pickle.load(open(path+'news_importance_model.pkl', 'rb'))
last_news['prob_import'] = clf.predict_proba(embeddings)[:,1]
# mark the news with prob > 0.38 as important
last_news['important'] = np.where(last_news['prob_import'] > 0.38, 1, 0)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_news[last_news['prob_import'] > .38]['important'] = 1


In [14]:
last_news.to_csv(path+'last_news.csv')

In [29]:
last_news.important.sum(), last_news.important.count()

(36, 717)

# Evaluate news

In [30]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
last_imp_news = last_news[last_news.important==1]

In [73]:
# NLI fr imprt news. Candidate labels can be adjusted
bad_news = ['плохо для российской экономики', 'российская экономика ухудшается', 'негативная новость для экономики России']
good_news = ['хорошо для российской экономики', 'российская экономика  улучшается', 'позитивная новость для экономики России']
candidate_labels = bad_news + good_news
n_labels = len(candidate_labels)

for id, row in tqdm(last_imp_news.iterrows(), total = last_news[last_news.important==1].shape[0]):
    output = classifier(row['summary'], candidate_labels, multi_label=True)
    for i_label in range(n_labels):
      last_imp_news.at[id, output['labels'][i_label]] = output['scores'][i_label]

100%|██████████| 36/36 [01:53<00:00,  3.16s/it]


TypeError: ignored

In [75]:
last_imp_news['bad_news'] = last_imp_news[bad_news].mean(axis=1)
last_imp_news['good_news'] = last_imp_news[good_news].mean(axis=1)

In [84]:
last_imp_news.to_csv(path+'last_imp_news.csv')

In [None]:
last_imp_news = pd.read_csv(path+'last_imp_news.csv')

In [1]:
last_imp_news[['published date','summary', 'prob_import', 'bad_news', 'good_news']].sort_values('bad_news', ascending=False)

NameError: ignored