## Load models

In [1]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [3]:
from google.colab import drive
drive.mount ('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# load models
device = torch.device("cuda")
model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [None]:
premise = "Angela Merkel ist eine Politikerin in Deutschland und Vorsitzende der CDU"
hypothesis = "Emmanuel Macron is the President of France"

input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
output = model(input["input_ids"].to(device))  # device = "cuda:0" or "cpu"
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)

## News NLI (Top news by cosine similarity for each narrative)

In [5]:
df_ukr = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TG news channels/filtered/df_war_ukr_simil_narratives.csv')
df_ukr.shape

FileNotFoundError: ignored

In [None]:
df_ukr.columns

In [None]:
df_narratives = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TG news channels/filtered/df_narratives_ada_emb.csv')

In [None]:
 # create database narratives vs 1K most similar news
top_n = 10000 # numbr of top news texts
df_list = []
for row in df_narratives.iterrows():
    n = row[0]
    # sort by similarity and keep cols and similarity column
    cols = ['media_type_y', 'content_type', 'channel_id', 'channel_name', 'msg_id', 'cleaned_message', 'date', 'views', f"similarity {n}"]
    df = df_ukr.sort_values(by=f"similarity {n}", ascending=False).head(top_n)[cols]
    df.rename(columns={f"similarity {n}": "similarity"}, inplace=True)
    df['narrative_id'] = row[0]
    df['narrative'] = row[1]['narrative']
    df_list.append(df)

In [None]:
df_nar_news_sample = pd.concat(df_list, ignore_index=True)
df_nar_news_sample.shape

(270000, 11)

In [None]:
df_nar_news_sample.head(3)

Unnamed: 0,media_type_y,content_type,channel_id,channel_name,msg_id,cleaned_message,date,views,similarity,narrative_id,narrative
0,tg - pers,propaganda,1315735637,SolovievLive,128720,Фейк: В рамках частичной мобилизации планирует...,2022-09-22,318053,0.880541,0,"Это частичная мобилизация, затронет всего лишь..."
1,tv,propaganda,1036362176,rt_russian,119879,"«Мобилизация проходит в режиме нон-стоп, согла...",2022-07-14,117514,0.87635,0,"Это частичная мобилизация, затронет всего лишь..."
2,tv,propaganda,1036362176,rt_russian,119879,"«Мобилизация проходит в режиме нон-стоп, согла...",2022-07-14,117514,0.876324,0,"Это частичная мобилизация, затронет всего лишь..."


In [None]:
df_nar_news_sample.drop_duplicates(subset=['channel_id', 'msg_id','narrative_id'], inplace=True)

In [None]:
df_nar_news_sample.reset_index(inplace=True)
print(f"N of news-narrtive pairs after deduplication{df_nar_news_sample.shape[0]}")

(241584, 14)

In [None]:
for i in tqdm(range(df_nar_news_sample.shape[0])):
  premise = df_nar_news_sample.loc[i, 'cleaned_message']
  hypothesis = df_nar_news_sample.loc[i, 'narrative']
  input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
  output = model(input["input_ids"].to(device))
  prediction = torch.softmax(output["logits"][0], -1).tolist()
  label_names = ["entailment", "neutral", "contradiction"]
  prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
  df_nar_news_sample.loc[i, 'yes'] = prediction['entailment']
  df_nar_news_sample.loc[i, 'no'] = prediction['contradiction']

100%|██████████| 241584/241584 [2:05:27<00:00, 32.09it/s]


In [None]:
df_nar_news_sample.to_csv('/content/drive/MyDrive/Colab Notebooks/TG news channels/filtered/df_ukr_news_vs_narr.csv')

In [None]:
df_nar_news_sample.to_excel('/content/drive/MyDrive/Colab Notebooks/TG news channels/filtered/df_ukr_news_vs_narr.xlsx')

## News NLI for various levels of cosine similarity (batch prediction)

In [9]:
df_ukr = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Narratives/TG news channels/filtered/df_war_ukr_simil_narratives.csv')
df_ukr.shape

(127986, 37)

In [10]:
df_ukr.drop_duplicates(subset=['channel_id', 'msg_id'], inplace=True)
df_ukr.shape

(114809, 37)

In [11]:
df_ukr.columns

Index(['media_type_y', 'content_type', 'channel_id', 'channel_name', 'msg_id',
       'message', 'cleaned_message', 'date', 'views', 'ada_embedding',
       'similarity 0', 'similarity 1', 'similarity 2', 'similarity 3',
       'similarity 4', 'similarity 5', 'similarity 6', 'similarity 7',
       'similarity 8', 'similarity 9', 'similarity 10', 'similarity 11',
       'similarity 12', 'similarity 13', 'similarity 14', 'similarity 15',
       'similarity 16', 'similarity 17', 'similarity 18', 'similarity 19',
       'similarity 20', 'similarity 21', 'similarity 22', 'similarity 23',
       'similarity 24', 'similarity 25', 'similarity 26'],
      dtype='object')

In [13]:
df_narratives = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Narratives/TG news channels/filtered/df_narratives_ada_emb.csv')
df_narratives.shape

(27, 3)

In [None]:
# load models
device = torch.device("cuda")
model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

In [20]:
test = df_ukr.cleaned_message.head(1000).tolist()
narratives_test = df_narratives.narrative.sample(4).to_list()
# tokenizer(test)

In [41]:
from datetime import datetime

# CUDA is explicitly called (device = 0)
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli", device = 0)

sequence_to_classify = test
candidate_labels = narratives_test
time1 = datetime.now()
output = classifier(sequence_to_classify, candidate_labels, multi_label=False, batch_size = 5)
print(datetime.now() - time1)
print(output)

0:01:13.764912
[{'sequence': '‼Ночные бои на Запорожье: попытка прорыва спецгруппы ВСУ Наши разведчики с помощью квадрокоптера Mavic с тепловизором ночью обнаружили передвижение украинского спецназа на линии фронта. По вражеской группе оперативно был нанесён удар артиллерией и точнейшая очередь расчёта АГС. Лишь один вражеский диверсант поднялся и сбежал после накрытия автоматическим гранатомётом. t.me/RVvoenkor', 'labels': ['На Украине притесняют русских и русский язык, геноцид русских', 'НАТО / США угрожают России ядерным оружием', 'В войне виноваты США / Запад / НАТО, чтобы развалить Россию', 'Украина подделывает фотографии/свидетельства, создает фейки про Россию'], 'scores': [0.3854154348373413, 0.2641448378562927, 0.18079224228858948, 0.1696474552154541]}, {'sequence': '‼Сводка Минобороны о ходе спецоперации: ◽ На Купянском направлении артиллерия «Западной» группировки войск нанесла огневое поражение живой силе и технике подразделений 114-й и 103-й бригад территориальной обороны в

In [24]:
output[0]['scores']

{'sequence': '‼Ночные бои на Запорожье: попытка прорыва спецгруппы ВСУ Наши разведчики с помощью квадрокоптера Mavic с тепловизором ночью обнаружили передвижение украинского спецназа на линии фронта. По вражеской группе оперативно был нанесён удар артиллерией и точнейшая очередь расчёта АГС. Лишь один вражеский диверсант поднялся и сбежал после накрытия автоматическим гранатомётом. t.me/RVvoenkor',
 'labels': ['На Украине притесняют русских и русский язык, геноцид русских',
  'НАТО / США угрожают России ядерным оружием',
  'В войне виноваты США / Запад / НАТО, чтобы развалить Россию',
  'Украина подделывает фотографии/свидетельства, создает фейки про Россию'],
 'scores': [0.3854112923145294,
  0.26415038108825684,
  0.18079052865505219,
  0.16964778304100037]}

1K texts * 4 classes (full narratives):
- 1.47min with CUDA (batch = 1) - 9texts/sec
- 1.18min with CUDA (batch = 2) - 13texts/sec
- 1.13min with CUDA (batch = 5) - 13.5texts/sec
- 1.15min with CUDA (batch = 10)
- 1.17min with CUDA (batch = 20)


100 texts * 4 classes (class = word):
- 5 min with no CUDA (batch = 2)
- 12 sec with CUDA (batch = 2) - 8/sec
- 17 sec with CUDA (batch = 20) - 6/sec

10K texts * 4 classes (class = word):
- 13 min with CUDA (batch = 20) - 13/sec
- 14 min with CUDA (batch = 200) - 13/sec (RAM at its max 15Gb, possibly did not finish)
- 13 min with CUDA (batch = 80) -

In [None]:
premise = "Angela Merkel ist eine Politikerin in Deutschland und Vorsitzende der CDU"
hypothesis = "Emmanuel Macron is the President of France"

input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
output = model(input["input_ids"].to(device))  # device = "cuda:0" or "cpu"
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)

{'entailment': 0.1, 'neutral': 0.9, 'contradiction': 99.1}
