In [28]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score, classification_report
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/Обучение/Большие языковые модели на практике/sent_train.csv')

In [4]:
df_valid = pd.read_csv('/content/drive/MyDrive/Обучение/Большие языковые модели на практике/sent_valid.csv')

In [5]:
df_valid.head()

Unnamed: 0,text,label
0,$ALLY - Ally Financial pulls outlook https://t...,0
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0
2,$PRTY - Moody's turns negative on Party City h...,0
3,$SAN: Deutsche Bank cuts to Hold,0
4,$SITC: Compass Point cuts to Sell,0


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9543 entries, 0 to 9542
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9543 non-null   object
 1   label   9543 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 149.2+ KB


In [7]:
import re

# Функция для очистки текста
def clean_text(text):
    if isinstance(text, str):
        # Удаляем упоминания пользователей (@username)
        text = re.sub(r'@[A-Za-z0-9_]+', '', text)
        # Удаляем URL
        text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
        # Удаляем хэштеги (но оставляем текст)
        text = re.sub(r'#', '', text)
        # Удаляем специальные символы и цифры
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Приводим к нижнему регистру
        text = text.lower()
        # Удаляем лишние пробелы
        text = ' '.join(text.split())
        return text
    else:
        return ''

# Применяем очистку
df_train['cleaned'] = df_train['text'].apply(clean_text)
df_valid['cleaned'] = df_valid['text'].apply(clean_text)

In [8]:
df_train['cleaned'].head()

Unnamed: 0,cleaned
0,bynd jpmorgan reels in expectations on beyond ...
1,ccl rcl nomura points to bookings weakness at ...
2,cx cemex cut at credit suisse jp morgan on wea...
3,ess btig research cuts to neutral
4,fnko funko slides after piper jaffray pt cut


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Токенизация
    tokens = text.split()
    # Удаляем стоп-слова и применяем стемминг
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

# Применяем дополнительную обработку
df_train['processed'] = df_train['cleaned'].apply(preprocess_text)
df_valid['processed'] = df_valid['cleaned'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
df_train['processed'].head()

Unnamed: 0,processed
0,bynd jpmorgan reel expect beyond meat
1,ccl rcl nomura point book weak carniv royal ca...
2,cemex cut credit suiss morgan weak build outlook
3,ess btig research cut neutral
4,fnko funko slide piper jaffray cut


КЛАССИФИКАЦИЯ С ИСПОЛЬЗОВАНИЕМ OPENAI API


In [14]:
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
from google.colab import userdata


In [16]:
# Запросите данные у пользователя
TOKEN = input("Введите ваш API токен: ")
BASE_URL = input("Введите BASE_URL (или нажмите Enter для OpenAI): ")

if not BASE_URL:
    BASE_URL = "https://api.openai.com/v1"

client = OpenAI(api_key=TOKEN, base_url=BASE_URL)

Введите ваш API токен: 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
Введите BASE_URL (или нажмите Enter для OpenAI): https://api.openai.com/v1


In [17]:
def generate_answer(client, instruc, input, indic, temperature=0):
    full_prompt = instruc + input + indic

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": full_prompt}],
        temperature=temperature
    )

    return completion.choices[0].message.content

**1: Zero-shot классификация**

In [18]:
def zero_shot_classification(texts):
    instruction = """Classify the financial tweet sentiment into one of these three categories:
0: Bearish - downward trend
1: Bullish - upward trend
2: Neutral - no change or neutral

Return only the number (0, 1, or 2) without any explanation.

Tweet: """

    predictions = []
    for text in texts:
        try:
            response = generate_answer(client, instruction, text, "\nCategory:")
            # Извлекаем число из ответа
            if '0' in response:
                predictions.append(0)
            elif '1' in response:
                predictions.append(1)
            elif '2' in response:
                predictions.append(2)
            else:
                # Если не нашли число, используем нейтральный класс по умолчанию
                predictions.append(2)
        except Exception as e:
            print(f"Error processing text: {e}")
            predictions.append(2)  # нейтральный по умолчанию

    return predictions

**2: Few-shot классификация**

In [19]:
def few_shot_classification(texts):
    instruction = """Classify the financial tweet sentiment into one of these three categories:
0: Bearish - downward trend, negative sentiment
1: Bullish - upward trend, positive sentiment
2: Neutral - no change or neutral sentiment

Examples:
Tweet: $AAPL stock plunges 10% after earnings miss
Category: 0

Tweet: $TSLA shares surge to new all-time high
Category: 1

Tweet: $MSFT announces quarterly dividend unchanged
Category: 2

Tweet: $AMZN reports steady growth in cloud division
Category: 2

Tweet: $JPM cuts 5000 jobs amid restructuring
Category: 0

Tweet: $NVDA launches groundbreaking new AI chip
Category: 1

Now classify this tweet. Return only the number (0, 1, or 2) without any explanation.

Tweet: """

    predictions = []
    for text in texts:
        try:
            response = generate_answer(client, instruction, text, "\nCategory:")
            # Извлекаем число из ответа
            if '0' in response:
                predictions.append(0)
            elif '1' in response:
                predictions.append(1)
            elif '2' in response:
                predictions.append(2)
            else:
                predictions.append(2)  # нейтральный по умолчанию
        except Exception as e:
            print(f"Error processing text: {e}")
            predictions.append(2)

    return predictions

**3: Chain-of-Thought классификация**

In [20]:
def chain_of_thought_classification(texts):
    instruction = """Analyze this financial tweet step by step and classify its sentiment.
First, identify if the tweet mentions stock prices, financial performance, or market trends.
Then, determine if the sentiment is positive (bullish), negative (bearish), or neutral.
Finally, output only the corresponding number:
0 for Bearish (downward trend, negative)
1 for Bullish (upward trend, positive)
2 for Neutral (no change or neutral)

Tweet: """

    predictions = []
    for text in texts:
        try:
            response = generate_answer(client, instruction, text, "\nAfter analysis, the category is:")
            # Извлекаем число из ответа
            if '0' in response:
                predictions.append(0)
            elif '1' in response:
                predictions.append(1)
            elif '2' in response:
                predictions.append(2)
            else:
                predictions.append(2)
        except Exception as e:
            print(f"Error processing text: {e}")
            predictions.append(2)

    return predictions

In [25]:
# Тестируем на подмножестве данных
sample_size = min(200, len(df_valid))
sample_texts = df_valid['text'].head(sample_size).tolist()
true_labels = df_valid['label'].head(sample_size).tolist()

print(f"\nТестируем на {sample_size} примерах")


Тестируем на 200 примерах


**1: TF-IDF + Logistic Regression**

In [26]:
train_texts = df_train['processed'].tolist()
train_labels = df_train['label'].tolist()

In [29]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

logreg_classifier = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced'
)

tfidf_pipeline = make_pipeline(tfidf_vectorizer, logreg_classifier)
tfidf_pipeline.fit(train_texts, train_labels)

In [30]:
# Предсказания на валидации
tfidf_preds = tfidf_pipeline.predict(sample_texts)
tfidf_f1 = f1_score(true_labels, tfidf_preds, average='macro')

print(f"TF-IDF + LogisticRegression Macro F1: {tfidf_f1:.4f}")
print(classification_report(true_labels, tfidf_preds))

TF-IDF + LogisticRegression Macro F1: 0.6662
              precision    recall  f1-score   support

           0       0.72      0.38      0.50        55
           1       0.77      0.75      0.76        64
           2       0.64      0.86      0.74        81

    accuracy                           0.69       200
   macro avg       0.71      0.67      0.67       200
weighted avg       0.71      0.69      0.68       200



**2: FinBERT**

In [31]:
finbert = pipeline("text-classification",
                      model="yiyanghkust/finbert-tone",
                      tokenizer="yiyanghkust/finbert-tone")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [32]:
def finbert_classification(texts):
        predictions = []
        for text in tqdm(texts, desc="FinBERT classification"):
            try:
                result = finbert(text[:512])
                label = result[0]['label']
                if label == 'Negative':
                    predictions.append(0)
                elif label == 'Positive':
                    predictions.append(1)
                else:
                    predictions.append(2)
            except Exception as e:
                predictions.append(2)
        return predictions

In [33]:
finbert_preds = finbert_classification(sample_texts)
finbert_f1 = f1_score(true_labels, finbert_preds, average='macro')

FinBERT classification: 100%|██████████| 200/200 [00:40<00:00,  4.96it/s]


In [34]:
print(f"FinBERT Macro F1 Score: {finbert_f1:.4f}")
print(classification_report(true_labels, finbert_preds))

FinBERT Macro F1 Score: 0.7978
              precision    recall  f1-score   support

           0       0.85      0.62      0.72        55
           1       0.93      0.81      0.87        64
           2       0.72      0.93      0.81        81

    accuracy                           0.81       200
   macro avg       0.83      0.79      0.80       200
weighted avg       0.82      0.81      0.80       200



**3: RoBERTa**

In [35]:
roberta = pipeline("text-classification",
                      model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                      tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [36]:
def roberta_classification(texts):
        predictions = []
        for text in tqdm(texts, desc="RoBERTa classification"):
            try:
                result = roberta(text[:512])
                label = result[0]['label']
                # Маппинг: LABEL_0 - negative, LABEL_1 - neutral, LABEL_2 - positive
                if label == 'LABEL_0':
                    predictions.append(0)  # Bearish
                elif label == 'LABEL_2':
                    predictions.append(1)  # Bullish
                else:
                    predictions.append(2)  # Neutral
            except Exception as e:
                predictions.append(2)
        return predictions

In [37]:
roberta_preds = roberta_classification(sample_texts)
roberta_f1 = f1_score(true_labels, roberta_preds, average='macro')

RoBERTa classification: 100%|██████████| 200/200 [00:38<00:00,  5.22it/s]


In [38]:
print(f"RoBERTa Macro F1 Score: {roberta_f1:.4f}")
print(classification_report(true_labels, roberta_preds))

RoBERTa Macro F1 Score: 0.1922
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        55
           1       0.00      0.00      0.00        64
           2       0.41      1.00      0.58        81

    accuracy                           0.41       200
   macro avg       0.14      0.33      0.19       200
weighted avg       0.16      0.41      0.23       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**4: DistilBERT с Zero-shot промптингом**

In [39]:
distilbert = pipeline("text-classification",
                         model="distilbert-base-uncased-finetuned-sst-2-english")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [40]:
def distilbert_zero_shot(texts):
        predictions = []
        for text in tqdm(texts, desc="DistilBERT classification"):
            try:
                result = distilbert(text[:512])
                label = result[0]['label']
                score = result[0]['score']

                # Маппинг общих sentiment меток к финансовым
                if label == 'NEGATIVE':
                    predictions.append(0)  # Bearish
                elif label == 'POSITIVE':
                    predictions.append(1)  # Bullish
                else:
                    predictions.append(2)  # Neutral
            except Exception as e:
                predictions.append(2)
        return predictions

In [42]:
distilbert_preds = distilbert_zero_shot(sample_texts)
distilbert_f1 = f1_score(true_labels, distilbert_preds, average='macro')

DistilBERT classification: 100%|██████████| 200/200 [00:20<00:00,  9.69it/s]


In [43]:
print(f"DistilBERT Zero-shot Macro F1: {distilbert_f1:.4f}")
print(classification_report(true_labels, distilbert_preds))

DistilBERT Zero-shot Macro F1: 0.2653
              precision    recall  f1-score   support

           0       0.32      0.95      0.47        55
           1       0.46      0.25      0.32        64
           2       0.00      0.00      0.00        81

    accuracy                           0.34       200
   macro avg       0.26      0.40      0.27       200
weighted avg       0.23      0.34      0.23       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**5: BART**

In [44]:
bart = pipeline("text2text-generation",
                   model="facebook/bart-base",
                   max_length=50)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [45]:
def bart_classification(texts):
        predictions = []
        for text in tqdm(texts, desc="BART classification"):
            try:
                prompt = f"Classify this financial sentiment as 0 (bearish), 1 (bullish), or 2 (neutral): {text}"
                result = bart(prompt, max_length=10, num_return_sequences=1)
                response = result[0]['generated_text']

                # Извлекаем ответ
                if '0' in response:
                    predictions.append(0)
                elif '1' in response:
                    predictions.append(1)
                elif '2' in response:
                    predictions.append(2)
                else:
                    predictions.append(2)  # По умолчанию neutral
            except Exception as e:
                predictions.append(2)
        return predictions

In [46]:
bart_preds = bart_classification(sample_texts)
bart_f1 = f1_score(true_labels, bart_preds, average='macro')

BART classification: 100%|██████████| 200/200 [03:34<00:00,  1.07s/it]


In [48]:
print(f"BART Macro F1 Score: {bart_f1:.4f}")
print(classification_report(true_labels, bart_preds))

BART Macro F1 Score: 0.1438
              precision    recall  f1-score   support

           0       0.28      1.00      0.43        55
           1       0.00      0.00      0.00        64
           2       0.00      0.00      0.00        81

    accuracy                           0.28       200
   macro avg       0.09      0.33      0.14       200
weighted avg       0.08      0.28      0.12       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**СРАВНЕНИЕ РЕЗУЛЬТАТОВ**

In [49]:
results = {
    'TF-IDF + LogisticRegression': tfidf_f1,
    'FinBERT': finbert_f1 if 'finbert_f1' in locals() else 0,
    'RoBERTa': roberta_f1 if 'roberta_f1' in locals() else 0,
    'DistilBERT Zero-shot': distilbert_f1 if 'distilbert_f1' in locals() else 0,
    'BART': bart_f1 if 'bart_f1' in locals() else 0
}

In [53]:
if results:
    best_model = max(results, key=results.get)
    best_score = results[best_model]

    print(f"ЛУЧШАЯ МОДЕЛЬ: {best_model} с Macro F1 = {best_score:.4f}")

    print("\nВсе результаты:")
    for model, score in sorted(results.items(), key=lambda x: x[1], reverse=True):
        print(f"  {model:30} | F1: {score:.4f}")

ЛУЧШАЯ МОДЕЛЬ: FinBERT с Macro F1 = 0.7978

Все результаты:
  FinBERT                        | F1: 0.7978
  TF-IDF + LogisticRegression    | F1: 0.6662
  DistilBERT Zero-shot           | F1: 0.2653
  RoBERTa                        | F1: 0.1922
  BART                           | F1: 0.1438
