In [1]:
import pandas as pd
import torch

from tqdm.notebook import tqdm
from transformers import AutoModelForSequenceClassification, BertTokenizerFast

from ydata_profiling import ProfileReport

In [3]:
# Load the two input frames in one pass; the order of the paths matches the
# order of the names being bound.
articles, comments = map(
    pd.read_pickle,
    ('dataframes/pro_articles.pkl', 'dataframes/pro_comments.pkl'),
)

In [4]:
# Sentiment class names. Presumably position i names the class id i returned
# by `predict` -- TODO confirm against the model's id2label mapping.
LABELS = (
    'NEUTRAL',
    'POSITIVE',
    'NEGATIVE',
)

In [18]:
# The tokenizer and the sequence-classification model are loaded from the
# same checkpoint, so the identifier is spelled out only once.
CHECKPOINT = 'blanchefort/rubert-base-cased-sentiment'

tokenizer = BertTokenizerFast.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    return_dict=True,
)


@torch.no_grad()
def predict(tokens: list[str]) -> int:
    """Classify the sentiment of a single tokenized comment.

    The tokens are joined with single spaces, encoded with the module-level
    ``tokenizer`` (truncated to at most 512 tokens) and passed through the
    module-level ``model``. Gradient tracking is disabled by the decorator.

    Args:
        tokens: Word tokens of one comment.

    Returns:
        The index of the highest-scoring class as a plain Python ``int``.
        Presumably an index into ``LABELS`` -- TODO confirm against the
        model's ``id2label`` mapping.
    """
    text = ' '.join(tokens)
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    logits = model(**inputs).logits
    # Softmax is strictly increasing, so argmax over the raw logits picks the
    # same class; `.item()` already yields a Python int without going via numpy.
    return logits.argmax(dim=1).item()


In [48]:
# Score every comment. `predict` is applied row by row, so a progress bar is
# registered on pandas first.
tqdm.pandas(desc='sentiment')
comments['sentiment'] = comments['tokens'].progress_apply(predict)

# Persist the scored frame: a later cell reloads it from this exact path, and
# without this write the notebook cannot be re-run from a fresh kernel.
comments.to_pickle('dataframes/analyzed_comments.pkl')

sentiment:   0%|          | 0/18431 [00:00<?, ?it/s]

In [60]:
# Reload the articles and switch `comments` to the sentiment-scored frame.
articles, comments = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'dataframes/pro_articles.pkl',
        'dataframes/analyzed_comments.pkl',
    )
)

In [61]:
# Rebind instead of mutating in place so that re-running this cell is harmless:
# a second in-place run would raise on the already-dropped 'text' column and
# would space-separate the characters of the already-joined token strings.
comments = (
    comments
    .drop(columns='text', errors='ignore')
    .assign(
        tokens=lambda frame: frame['tokens'].apply(
            # Leave values that are already strings untouched; join the rest.
            lambda value: value if isinstance(value, str) else ' '.join(value)
        )
    )
)

In [62]:
# Show the column dtypes of the prepared comments frame; left as a bare last
# expression so the notebook renders it.
comments.dtypes

article_id                          int64
author                             object
published_datetime    datetime64[ns, UTC]
votes                               int64
tokens                             object
sentiment                           int64
dtype: object

In [63]:
# Profile the comments frame in explorative mode and write the result to an
# HTML file.
report = ProfileReport(
    comments,
    title='Comments Report',
    explorative=True,
)
report.to_file("comments_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [64]:
# Profile the articles frame in explorative mode and write the result to an
# HTML file. The title now names the articles; it was previously copied
# verbatim from the comments report.
# NOTE(review): the recorded output of this cell shows the profiler reporting
# 'could not convert string to float' and pointing at
# `correlations={"auto": {"calculate": False}}` -- confirm whether that
# option should be passed here.
report = ProfileReport(articles, title='Articles Report', explorative=True)
report.to_file("articles_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Простой'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]