## Load model: XLM-RoBERTa-large fine-tuned on XNLI (zero-shot classification, 15 languages)

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
from transformers import pipeline

# Zero-shot classification pipeline shared by the encoding helper further down.
# The first call downloads the checkpoint (about 2 GB according to the recorded output).
classifier = pipeline(
    task="zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
)

Downloading:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:01<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
import pandas as pd
import re
from tqdm import tqdm

## load functions

In [None]:
def clean_text(newtext):
  """Flatten one exported message into plain text suitable for the classifier.

  The value is converted with ``str()`` first, so it may be a plain string or a
  list of strings / entity dicts (presumably the Telegram export format -
  verify against the data). Entity markup is stripped, link targets are
  removed, line breaks become spaces, and finally every character outside a
  whitelist (Cyrillic and Latin letters, digits, space and basic punctuation)
  is dropped, which removes emoji.

  Args:
    newtext: raw ``text`` field of a message (any type; converted with ``str``).

  Returns:
    The cleaned text as ``str``.
  """
  newtext = str(newtext)

  # Remove link targets together with their key, so that no URL fragments
  # are left behind in the text.
  newtext = re.sub(r"'href': '[^']*'", "", newtext)

  stopwords = [
  "'type': 'bold'"
  ,"'type': 'italic'"
  , "'type': 'mention', 'text': '@lentadnya'"
  ,"'href': " # leftover key when the value did not match the pattern above
  ,"'type': 'text_link'"
  ,"'text':"
  ,"{" ,"}"
  ,', ,'
  , "[", "]"
  , "'"
  ]

  for stopword in stopwords:
    newtext = newtext.replace(stopword, "")

  # "\\n" is the escaped newline produced by str() of a list; "\n" is a real
  # newline in a plain-string message. Both must become a space, otherwise the
  # whitelist below deletes the newline and glues neighbouring words together.
  replace_spaces = ["\\n", "\n", "  ", ", "]
  for replace_space in replace_spaces:
    newtext = newtext.replace(replace_space, " ")

  # Extra spaces do not seem to affect the model's predictions, but emoji do,
  # so keep only letters, digits and punctuation. The hyphen is placed last in
  # the class so it is a literal: the previous "!-—" formed a range from "!"
  # to the em dash and let nearly every symbol through. ":", ";", "(" and ")"
  # are listed explicitly so ordinary punctuation is still preserved.
  newtext = re.sub('[^А-Яа-яЁёA-Za-z0-9 _.,!?:;()"«»—-]+', "", newtext)
  return newtext

In [None]:
def read_clean_json_news(file_path, date_from='2022-02-01'):
  """Load a channel export and return its cleaned news from a given date on.

  Args:
    file_path: path to a JSON file that has a top-level ``messages`` list whose
      items carry at least ``id``, ``date`` and ``text``.
    date_from: only messages with ``date`` strictly greater than this value
      are kept.

  Returns:
    A new DataFrame with columns ``id``, ``date`` (datetime) and ``news``
    (text cleaned by ``clean_text``), restricted to rows whose cleaned text is
    longer than 5 characters. The result is an independent copy, so callers
    can add columns to it without triggering SettingWithCopyWarning.
  """
  # read json-file to dataframe
  raw = pd.read_json(file_path)
  messages = pd.DataFrame(raw['messages'].tolist())
  news = messages[['id', 'date', 'text']].assign(
      date=lambda d: pd.to_datetime(d['date'])
  )
  # select specific dates; copy so the column edits below act on an owned frame
  news = news[news['date'] > date_from].copy()
  # clean text of news and put into 'news' column
  news['news'] = news['text'].map(clean_text)
  news = news.drop(columns='text')
  # drop (almost) empty news
  return news[news['news'].str.len() > 5].copy()

In [None]:
def filter_war_news(df_test):
  """Keep only news that mention both the war and Ukraine, excluding digests.

  A row is kept when its ``news`` text
    * contains one of the war-related stems (case-insensitive), and
    * contains one of the Ukraine-related markers (case-insensitive), and
    * is not a multi-story digest (case-sensitive headline phrases).

  Args:
    df_test: DataFrame with a string column ``news``.

  Returns:
    A new DataFrame (an independent copy) with the matching rows. Rows whose
    ``news`` is missing are treated as non-matching.
  """
  news = df_test['news']
  # na=False keeps the masks strictly boolean even if a news value is missing
  mentions_war = news.str.contains('воен|войн|спецоперац|обстрел|арм', case=False, na=False)
  # digests / summaries bundle several stories and are removed
  is_digest = news.str.contains('Главное к утру|Главные события|Главные новости', case=True, na=False)
  mentions_ukraine = news.str.contains('укр|ВСУ|Азов', case=False, na=False)
  # copy so later column assignments do not act on a slice of the input frame
  df_war_news = df_test[mentions_war & ~is_digest & mentions_ukraine].copy()
  return df_war_news

In [None]:
def encode_news_key_label(df, candidate_labels, multi_label=True, key_label=0, news_col='news', sublabels=0, threshold=0.9, multi_sublabel=False,
                          temp_path='/content/drive/MyDrive/Colab Notebooks/Datasets/news TG channels/temp_result.csv', save_every=100):
  """Score every news item against the labels, with a second pass on a key label.

  For each row the global zero-shot ``classifier`` is run on the text with
  ``candidate_labels`` and each label's score is written into a column named
  after the label. If ``key_label`` is given and its score exceeds
  ``threshold``, the column ``"<key_label>><threshold>"`` is set to 1 and the
  text is classified again against ``sublabels`` (scores stored in columns
  named after each sublabel); otherwise that column is set to 0.

  Args:
    df: DataFrame holding the news; it is modified in place and also returned.
    candidate_labels: labels for the first classification pass.
    multi_label: passed to the classifier for the first pass.
    key_label: label that triggers the second pass; ``0`` disables it.
    news_col: name of the column that holds the news text.
    sublabels: labels for the second pass; a falsy value skips the second pass.
    threshold: the key label's score must exceed this to trigger the second pass.
    multi_sublabel: passed to the classifier as ``multi_label`` for the second pass.
    temp_path: CSV file for intermediate checkpoints; ``None`` disables them.
    save_every: write a checkpoint after this many processed rows.

  Returns:
    The same ``df`` object with the score columns added.
  """
  total = df.shape[0]
  print('started encoding {} news items'.format(total))
  # name of the flag column: 1 if key_label's score is above the threshold, else 0
  k_label_threshold = "{}>{}".format(key_label, threshold)
  check_key_label = key_label != 0
  counter = 0 # processed rows, used to decide when to write a checkpoint
  for index, row in tqdm(df.iterrows(), total = total):
    news = row[news_col]
    # skip missing or too short news (less than 5 symbols)
    if not isinstance(news, str) or len(news) < 5: continue
    result = classifier(news, candidate_labels, multi_label=multi_label)
    for label, score in zip(result['labels'], result['scores']):
      df.loc[index, label] = score
      if check_key_label and label == key_label:
        if score > threshold:
          df.loc[index, k_label_threshold] = 1
          if sublabels:
            result_keys = classifier(news, sublabels, multi_label=multi_sublabel)
            for sublabel, subscore in zip(result_keys['labels'], result_keys['scores']):
              df.loc[index, sublabel] = subscore
        else:
          df.loc[index, k_label_threshold] = 0
    counter += 1
    # the run takes hours, so persist partial results regularly
    if temp_path is not None and counter % save_every == 0:
      df.to_csv(temp_path)
      print('saved intermediate at {} iterations'.format(counter))
  return df

## execution

In [None]:
# Mount Google Drive at /content/drive; the dataset and result paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 1) NEWS LOAD & PREP
# PARAMETERS
path = '/content/drive/MyDrive/Colab Notebooks/Datasets/news TG channels/meduza - from jan22.json'  # JSON export with a 'messages' list
date_from = '2022-05-01'  # keep only messages dated after this value

df_news = read_clean_json_news(path, date_from)
df_war_news = filter_war_news(df_news) # run this to filter by key words on war

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# 2) NEWS ENCODE
# PARAMETERS
candidate_labels = [
                    'военные действия в Украине'
                    ]
key_label = 'военные действия в Украине' #must be one of candidate labels
threshold = 0.85 # can be 0.0-1.0, default = 0.9
sublabels = ['Украина атакует мирных граждан', 'Россия атакует мирных граждан', 'Украина атакует военных', 'Россия атакует военных'
              , 'ВСУ атакует мирных граждан', 'ВСУ атакует военных']

# Pass the `threshold` parameter defined above; the call previously hard-coded 0.9,
# so changing the parameter had no effect.
df_fin = encode_news_key_label(df_war_news, candidate_labels=candidate_labels, multi_label=True
                               ,key_label=key_label, news_col='news', sublabels=sublabels, threshold=threshold, multi_sublabel=False
                               )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
 40%|████      | 303/755 [1:21:03<1:17:17, 10.26s/it]

In [None]:
# Sublabel scores summed per week ('W' bins on the 'date' column).
# Only the sublabel columns are aggregated: ids and the news text are not plotted,
# and summing them is at best wasted work.
df_fin.resample('W', on='date')[sublabels].sum().plot.bar(y=sublabels, stacked=True, figsize=(10,5));

In [None]:
# Weekly totals ('W' bins on 'date') of the encoded frame, shown as a table.
df_fin.resample('W', on='date').sum()

In [None]:
# Persist the encoded frame (label and sublabel scores) to Drive.
df_fin.to_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/news TG channels/results/meduza_war_may1.csv')

In [None]:
# Combine the 'Украина …' and 'ВСУ …' sublabel scores into joint columns,
# then drop the four source columns. df_fin itself is left untouched.
df_news_enc = (
    df_fin
    .assign(**{
        'Украина/ВСУ атакует мирных граждан':
            lambda d: d['Украина атакует мирных граждан'] + d['ВСУ атакует мирных граждан'],
        'Украина/ВСУ атакует военных':
            lambda d: d['Украина атакует военных'] + d['ВСУ атакует военных'],
    })
    .drop(columns=['Украина атакует военных', 'ВСУ атакует военных','Украина атакует мирных граждан','ВСУ атакует мирных граждан'])
)

In [None]:
# Merged sublabel scores summed per week ('W' bins on the 'date' column).
# Only the plotted columns are aggregated instead of every column of the frame.
merged_sublabels = ['Украина/ВСУ атакует мирных граждан', 'Украина/ВСУ атакует военных', 'Россия атакует мирных граждан', 'Россия атакует военных']
df_news_enc.resample('W', on='date')[merged_sublabels].sum().plot.bar(y=merged_sublabels, stacked=True, figsize=(10,5));

In [None]:
# Weekly totals ('W' bins on 'date') of the frame with merged sublabels, shown as a table.
df_news_enc.resample('W', on='date').sum()

In [None]:
# NOTE(review): this is the same target path as the earlier df_fin.to_csv(...) cell,
# so the file written there is overwritten by the merged-sublabel version - confirm this is intended.
df_news_enc.to_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/news TG channels/results/meduza_war_may1.csv')

## check results

In [None]:
import pandas as pd

In [None]:
# Load previously saved classification results for inspection.
df_res = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Datasets/news TG channels/lenta_result.csv'
)

In [None]:
# Keep only the columns needed for the comparison below.
df_res = df_res.loc[:, ['date', 'news', 'война в Украине']]

In [None]:
# Number of news matched by the simple keyword rule (war stem AND Ukraine stem).
len(df_res[
    df_res['news'].str.contains('воен|войн', case=False)
    & df_res['news'].str.contains('укр', case=False)
])

641

In [None]:
# Number of news the model scored above 0.9 for the label.
len(df_res.loc[df_res['война в Украине'] > 0.9])

1255

In [None]:
# News that pass the keyword filter but were scored below 0.9 by the model.
df_res.loc[
    lambda d: (d['война в Украине'] < 0.9)
    & d['news'].str.contains('воен|войн|спецоперац|обстрел|арм', case=False)
    # drop digests that bundle several news items
    & ~d['news'].str.contains('Главное к утру|Главные события|Главные новости', case=True)
    & d['news'].str.contains('укр|ВСУ|Азов', case=False)
]

Unnamed: 0,date,news,война в Украине
136,2022-01-04 23:13:01,Несколько заявлений от Госдепа США: США расс...,0.840355
429,2022-01-07 19:17:14,НАТО будет вести диалог по безопасности с Ро...,0.823539
434,2022-01-07 20:49:01,США обсудили с Украиной предстоящие перегово...,0.772166
487,2022-01-08 21:57:01,Белый дом сделал несколько заявлений по повод...,0.005688
848,2022-01-16 09:56:07,Дмитрий Песков дал интервью CNN. Главные заяв...,0.218746
...,...,...,...
12350,2022-05-20 22:10:20,Владимир Зеленский заявил что украинские сил...,0.689000
12351,2022-05-20 22:46:30,Владимир Зеленский предложил иностранным госу...,0.502583
12371,2022-05-21 11:33:21,Российские военные уничтожили крупную партию...,0.344142
12394,2022-05-21 17:21:30,Встречайте обновлённую «Путинку» — теперь с ...,0.829186


In [None]:
# TODO: strip this boilerplate disclaimer from the news text before classification.
# It is stored as a string constant because bare text in a code cell is a SyntaxError.
FOREIGN_AGENT_DISCLAIMER = "ДАННОЕ СООБЩЕНИЕ (МАТЕРИАЛ) СОЗДАНО И (ИЛИ) РАСПРОСТРАНЕНО ИНОСТРАННЫМ СРЕДСТВОМ МАССОВОЙ ИНФОРМАЦИИ, ВЫПОЛНЯЮЩИМ ФУНКЦИИ ИНОСТРАННОГО АГЕНТА, И (ИЛИ) РОССИЙСКИМ ЮРИДИЧЕСКИМ ЛИЦОМ, ВЫПОЛНЯЮЩИМ ФУНКЦИИ ИНОСТРАННОГО АГЕНТА"