## 1. Install and Import libraries

In [None]:
!pip install arch

In [None]:
!pip install numba

In [None]:
#NLP Transformers library
from transformers import AutoTokenizer, Trainer,  DataCollatorWithPadding, AutoModel,AutoModelForSequenceClassification, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
#ARCH modeling
import arch
import torch
import wandb
#plots
import matplotlib.pyplot as plt 
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

## 2. Data loading and preprocessing

In [None]:
# Read daily MOEX index quotes; the file name suggests the span 2015-01-01 to 2020-01-01.
moex_data = pd.read_csv('/kaggle/input/moex-news-russian/IMOEX_150101_200101.csv',sep = ";")
# Quotes for the COVID period; the file name suggests the span 2020-01-01 to 2022-01-01.
covid_moex_data = pd.read_csv('/kaggle/input/moex-news-russian/IMOEX_200101_220101.csv',sep = ";")
# Stack both periods into one frame (each file's own row labels are kept at this point).
full_data = pd.concat([moex_data,covid_moex_data])

In [None]:
# Discard the row labels inherited from the two source files and renumber rows from 0.
full_data = full_data.reset_index(drop=True)

In [None]:
# Derive daily returns from the closing price: simple returns in percent
# and log-returns.
close = moex_data['<CLOSE>']
moex_data['return'] = close.pct_change() * 100
moex_data['log_ret'] = np.log(close / close.shift(1))
# The first row has no previous close, so its returns are NaN; drop incomplete rows.
moex_data = moex_data.dropna()

# Plot the log-return series, then its partial and full autocorrelation.
log_returns = moex_data['log_ret']
plt.plot(log_returns)
plot_pacf(log_returns)
plot_acf(log_returns)

In [None]:
# Fit the baseline volatility model on percent returns, without any news index.
# lags=0 leaves the AR mean with no autoregressive terms; p=1 and o=1 request one
# symmetric and one asymmetric innovation lag in the variance equation.
model = arch.arch_model(moex_data['return'], mean='AR', p=1, o=1, dist='normal', lags=0)
res = model.fit()
# `summary` is a method: it must be called, otherwise the bound method itself is printed.
print(res.summary())

# Export the return series for later use.
moex_data[['<DATE>','<CLOSE>','return','log_ret']].to_csv('returns.csv')

In [None]:
# Export the return series to CSV.
# NOTE(review): the identical export also appears at the end of the GARCH cell above.
moex_data[['<DATE>','<CLOSE>','return','log_ret']].to_csv('returns.csv')

# Read the labeled news file (UTF-16 encoded, parsed with the Python engine).
data = pd.read_csv('/kaggle/input/moex-news-russian/moex_data_1.csv',encoding='utf16',sep=',',header=0,quotechar='"',engine="python")
# The 'Unnamed: 0' column holds comma-joined fields; split it into a list per row.
data['index'] = data['Unnamed: 0'].str.split(',')
# The last field is taken as the class label (still a string at this point).
data['class'] = data['index'].apply(lambda x :x[-1])
# The first field is taken as the integer row id.
data['index'] = data['index'].apply(lambda x :x[0]).astype(int)

In [None]:
# Load the original news export (UTF-32). Its unnamed first column is copied into
# an 'index' column and its own 'class' column is discarded.
og = pd.read_csv('/kaggle/input/moex-news-russian/news_lenta/moex_data.csv', encoding='utf32')
og = og.assign(index=og['Unnamed: 0']).drop(columns=['class'])

# Attach the labels from `data` to the original news rows via the shared 'index' key.
fin = og.merge(data[['index', 'class']], on='index', how='inner')
fin = fin[['index', 'text', 'pubdate', 'tokens', 'class']]

# 'pubdate' is parsed as seconds since the Unix epoch; only the calendar day is
# kept, stored as a midnight timestamp.
published = pd.to_datetime(fin['pubdate'], unit='s')
fin['date'] = pd.to_datetime(published.dt.date).dt.normalize()
fin = fin.reset_index()

# Integer labels under the column name 'labels'.
fin = fin.astype({'class': int}).rename(columns={'class': 'labels'})
s = fin[['date', 'text', 'tokens', 'labels']].copy()

In [None]:
# Chronological split: news before 2019-01-01 is used for training, the rest for testing.
cutoff = pd.to_datetime("2019-01-01")
train = s.loc[s['date'] < cutoff].copy()
test = s.loc[s['date'] >= cutoff].copy()

# Wrap both splits as Hugging Face datasets with the date, text and label columns.
hf_columns = ['date', 'text', 'labels']
news_train = Dataset.from_pandas(train[hf_columns])
news_test = Dataset.from_pandas(test[hf_columns])

news_dataset = DatasetDict({'train': news_train, 'test': news_test})

In [None]:
# Build a daily news index: the mean label of all news published on each day,
# aligned to the trading days of the MOEX series.
s = s[['date','labels']]
s = s.set_index('date')

# NOTE(review): assumes '<DATE>' is in a format pd.to_datetime can parse from its
# string form (e.g. YYYYMMDD) - confirm against the CSV.
moex_data['date'] = pd.to_datetime(moex_data['<DATE>'].astype(str))
print(moex_data['date'])

mean_ind = s.resample('D').mean()
mean_ind = moex_data.merge(mean_ind, on='date', how='left')
# Fill AFTER the merge: days without news inside the news span and trading days
# outside it (unmatched by the left join) both get the neutral value 0.5, so no
# NaN reaches the regression that uses this column.
mean_ind['labels'] = mean_ind['labels'].fillna(0.5)
mean_ind = mean_ind[['date','labels']]
mean_ind.to_csv('news_data.csv')

In [None]:
# Fit the same specification (p=1, o=1) with the daily news index as an exogenous
# regressor in the mean equation (mean='ARX', no autoregressive lags).
news_model = arch.arch_model(y=moex_data['return'], x=mean_ind['labels'],
                             mean='ARX', p=1, o=1, dist="normal", lags=0)
res = news_model.fit()
# `summary` is a method: it must be called, otherwise the bound method itself is printed.
print(res.summary())
print(res.conditional_volatility)

## 3. Pretrained model for general Russian sentiment

In [None]:
# Model and tokenizer initialization: an XLM-RoBERTa-large sentiment checkpoint
# (per its name, tuned for Russian sentiment on RuSentiment). Rebinds `model`.
tokenizer = AutoTokenizer.from_pretrained("sismetanin/xlm_roberta_large-ru-sentiment-rusentiment")
model = AutoModelForSequenceClassification.from_pretrained("sismetanin/xlm_roberta_large-ru-sentiment-rusentiment")

In [None]:
# Initialise Weights & Biases in 'disabled' mode (presumably to keep the Trainer
# from logging to W&B - confirm).
wandb.init(mode='disabled')

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field of one example.

    The text is truncated and padded to exactly 512 tokens and the encoding
    is returned as PyTorch tensors.
    """
    encoding = tokenizer(
        example["text"],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    return encoding


# Tokenize every example of both splits, one example per call.
tokenized_datasets = news_dataset.map(tokenize_function)

In [None]:
# Accumulator for the per-example logits produced by the inference loop.
preds = []

In [None]:
# Score the test split one example at a time, without tracking gradients.
test_split = tokenized_datasets['test']
with torch.no_grad():
    for ids, mask in zip(test_split['input_ids'], test_split['attention_mask']):
        # Each stored example is presumably a nested list (a batch of one), because the
        # tokenizer was called per example with return_tensors="pt" - so tensors are 2-D.
        # Use the selected `device` instead of a hard-coded .cuda() so CPU runs also work.
        input_ids = torch.tensor(ids).to(device)
        # Pass the attention mask so the max_length padding tokens are ignored by the model.
        attention_mask = torch.tensor(mask).to(device)
        preds.append(model(input_ids=input_ids, attention_mask=attention_mask).logits)


In [None]:
# Turn each logits tensor into class probabilities. The entries of `preds` are
# already tensors, so they are used directly instead of being re-wrapped with
# torch.tensor(), which copies the data and emits a UserWarning.
p = [torch.softmax(logits, dim=1) for logits in preds]

In [None]:
# Move every probability tensor to host memory and keep its first (only) row as a NumPy vector.
p = [probs.detach().cpu().numpy()[0] for probs in p]

In [None]:
# Save the class probabilities, one row per test example, to CSV.
pd.DataFrame(p).to_csv('rusent_preds15_20.csv')

## 4. Training on FiNeS dataset

### 4.1 Baselines

### 4.2 Fine-tuning BERT model

In [None]:
# Model and tokenizer initialization: ruBert-base with a two-class classification head.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruBert-base", num_labels=2)

In [None]:
# Freeze the pretrained encoder: every base-model parameter stops receiving gradients.
model.base_model.requires_grad_(False)

# Training configuration: 5 epochs, batch size 8, with evaluation and logging
# once per epoch.
training_args = TrainingArguments(
    output_dir='./bert_clf',
    num_train_epochs=5,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy='epoch',
)

In [None]:
# Turn off base_model training: only parameters outside the base model keep gradients.
# NOTE(review): this cell repeats the setup of the cell directly above it.
for param in model.base_model.parameters():
    param.requires_grad = False

# Training configuration: 5 epochs, batch size 8, evaluation and logging once per epoch.
training_args = TrainingArguments(
    output_dir='./bert_clf',
    learning_rate=1e-3,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy='epoch'
)

## 5. Fine-tuning a Russian LLM on a manually labeled dataset

### 5.1 Baseline models

### 5.2 Fine-tuning BERT model

In [None]:
# Model and tokenizer initialization: ruBert-base with a two-class classification head.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruBert-base", num_labels=2)


In [None]:
# Freeze the pretrained encoder: every base-model parameter stops receiving gradients.
model.base_model.requires_grad_(False)

# Training configuration: 5 epochs, batch size 8, with evaluation and logging
# once per epoch.
training_args = TrainingArguments(
    output_dir='./bert_clf',
    num_train_epochs=5,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy='epoch',
)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field, truncating and padding every sample to 512 tokens."""
    return tokenizer(
        example["text"],
        max_length=512,
        padding='max_length',
        truncation=True,
    )


# Tokenize both splits in batches of 8 examples.
tokenized_datasets = news_dataset.map(tokenize_function, batch_size=8, batched=True)

# Train with the Hugging Face Trainer, evaluating on the test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
)

trainer.train()



In [None]:
# Release cached GPU memory held by PyTorch's allocator before running prediction.
torch.cuda.empty_cache()

In [None]:
# Run the trained model over the tokenized test split.
predictions = trainer.predict(tokenized_datasets['test'])

In [None]:
# Convert the raw prediction scores to per-class probabilities (softmax over dim 1).
preds = torch.softmax(torch.tensor(predictions.predictions),dim=1)

In [None]:
# Display the probability tensor as the cell output.
preds

In [None]:
# Store the predicted class probabilities next to the test rows.
# Softmax column k is the probability of class k, so column 0 goes to 'prob_0'
# and column 1 to 'prob_1' (the original assignment had the two swapped).
# .numpy() hands pandas a plain float array instead of a torch tensor.
test['prob_0'] = preds[:, 0].numpy()
test['prob_1'] = preds[:, 1].numpy()

In [None]:
# Save the test rows together with their predicted probabilities (UTF-32 encoded CSV).
test.to_csv("preds.csv",encoding='utf-32')

## 6. Training on auto-labeled dataset

### Extra