In [2]:
import string

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import text2text as t2t
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split


# nltk.download('punkt_tab')
# nltk.download('stopwords')



In [66]:
stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer("russian")

In [103]:
vectorizer = CountVectorizer(stop_words=list(stop_words))

In [86]:
vectorizer = TfidfVectorizer(stop_words=list(stop_words))

In [48]:
allowed_symbols = (
        string.ascii_lowercase +
        " -+%.," +
        # string.digits +
        "абвгдежзийклмнопрстуфхцчшщъыьэюя"
)
def getClearText(text: str) -> str:
    cleared_text = ""
    for letter in text:
        if letter in allowed_symbols:
            cleared_text += letter

    return cleared_text

replace_dict = {
    " +": " плюс ",
    " -": " минус ",
    "%": " процент "
}
def replaceSymbols(text: str) -> str:
    new_text = text
    for key, value in replace_dict.items():
        new_text = new_text.replace(key, value)
    return new_text


In [92]:

df = pd.read_csv("database_some.csv")
# df = df.iloc[:1000]

df_texts = df.Text
# normalized to [0, 1]

# 



# Continuous

In [133]:
values = [(value + 1) / 2 for value in df.Value]

# Binary

In [93]:
values = np.array([-1 if value < 0 else 1 for value in df.Value])

In [134]:

cleared = []
for text in df_texts:
    text = text.lower()
    cleared_text = getClearText(text)
    cleared_text = replaceSymbols(cleared_text)
    cleared.append(cleared_text)

# texts = []
# for text in cleared:
#     words = word_tokenize(text)
#     words_filtered = []
#     for word in words:
#         if word in stop_words:
#             continue
#         word = stemmer.stem(word)
#         words_filtered.append(word)
#
#     text = ""
#     for word in words_filtered:
#         text += word + " "
#     # tokenized_sentences = sent_tokenize(text)
#
#     texts.append(text)

vectorized = vectorizer.fit_transform(cleared)



# Split data


In [135]:
df_vectorized = pd.DataFrame(vectorized[0].T.todense(),
                  index=vectorizer.get_feature_names_out(), columns=["TF-IDF"])
df_vectorized = df_vectorized.sort_values('TF-IDF', ascending=False)
print(np.unique(df_vectorized.values).shape)
df_vectorized


(12,)


Unnamed: 0,TF-IDF
процент,25
индекс,11
отметке,10
оао,8
торги,7
...,...
инград,0
ингосстрах,0
инг,0
инвестфонды,0


# Continuous values

In [None]:
num_classes = 3  # Match the number of classes in the data
bin_edges = np.linspace(0, 1, num_classes + 1)  # Create bin edges
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2  # Compute bin centers

# Digitize values into bins
y_binned = np.digitize(values, bins=bin_edges, right=True) - 1  # Adjust to 0-based indexing

In [None]:

random_state = 3
test_size = 0.05

x_train, x_test, y_train, y_test = train_test_split(
    vectorized,
    y_binned,
    test_size=test_size,
    random_state=random_state
)
x_original_train, x_original_test, _, _ = train_test_split(
df_texts,
    y_binned,
    test_size=test_size,
    random_state=random_state
)


# Binary

In [131]:
random_state = 3
test_size = 0.05

x_train, x_test, y_train, y_test = train_test_split(
    vectorized,
    values,
    test_size=test_size,
    random_state=random_state
)
x_original_train, x_original_test, _, _ = train_test_split(
df_texts,
    values,
    test_size=test_size,
    random_state=random_state
)


In [None]:

# Train logistic regression
model = LogisticRegression(
    random_state=random_state,
    max_iter=1900 # 1900
)
model.fit(x_train, y_train)



# Вариант для continuous values

In [150]:

# Predict probabilities
y_probs = model.predict_proba(x_test)

# Map probabilities back to continuous values
y_pred_continuous = np.dot(y_probs, bin_centers)

# Convert predictions back to original scale [-1, 1]
original_values = [2 * value - 1 for value in y_pred_continuous]


# print("Continuous Predictions:", original_values)
x_test_texts = vectorizer.inverse_transform(x_test)  # List of tokenized words per row

# Combine x_test with predictions into a DataFrame
result_df = pd.DataFrame({
    "OriginalText": x_original_test,
    "VectorizedText": [" ".join(text) for text in x_test_texts],
    "Predicted_Value": original_values,
    "ActualValue": y_test
})

result_df.to_csv("predicted.csv")

print(result_df)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

                                           OriginalText  \
1959  © Reuters. Рынок акций Московской биржи по сос...   
2879  © Reuters\nInvesting.com — Обыкновенные акции ...   
322   © Reuters. Рынок акций Московской биржи по сос...   
2428  Investing.com — Несмотря на улучшение платежно...   
2964  © Reuters. Рынок акций Московской биржи по сос...   
...                                                 ...   
2724  © Reuters. Рынок акций Московской биржи по сос...   
3567  © Reuters. Рынок акций Московской биржи по сос...   
3374  © Reuters.\nInvesting.com — Российский IT-гига...   
115   В понедельник, 10 октября, ожидаются выплаты к...   
3227  © Пресс-служба банка ВТБ ВТБ продал 4,9% "Магн...   

                                         VectorizedText  Predicted_Value  \
1959  reuters рынок акций индекс процент торги моско...        -0.001086   
2879  процент com московской роста акции подорожали ...         0.278654   
322   reuters рынок акций индекс процент торги моско...        

# Вариант для бинарный values

In [130]:

# Predict probabilities
y_probs = model.predict_proba(x_test)
y_predict = model.predict(x_test)

# Map probabilities back to continuous values

# print("Unique classes in y_binned:", np.unique(y_binned))

# Convert predictions back to original scale [-1, 1]

# print("Continuous Predictions:", original_values)
x_test_texts = vectorizer.inverse_transform(x_test)  # List of tokenized words per row
print(y_predict)
# Combine x_test with predictions into a DataFrame
result_df = pd.DataFrame({
    "OriginalText": x_original_test,
    "VectorizedText": [" ".join(text) for text in x_test_texts],
    "Predicted_Value": y_predict,
    "ActualValue": y_test
})

result_df.to_csv("predicted.csv")

print(result_df)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

[-1  1  1 ... -1 -1 -1]
                                           OriginalText  \
1959  © Reuters. Рынок акций Московской биржи по сос...   
2879  © Reuters\nInvesting.com — Обыкновенные акции ...   
322   © Reuters. Рынок акций Московской биржи по сос...   
2428  Investing.com — Несмотря на улучшение платежно...   
2964  © Reuters. Рынок акций Московской биржи по сос...   
...                                                 ...   
1735  © Reuters. Рынок акций Московской биржи по сос...   
3681  Investing.com — ЦБ полагает, что вложения в эк...   
3547  © Reuters.\nInvesting.com — Аналитики брокера ...   
3543  © Reuters. Рынок акций Московской биржи по сос...   
3896  © Reuters. "Яндекс" увеличил выручку в 3-м ква...   

                                         VectorizedText  Predicted_Value  \
1959  reuters рынок акций индекс процент торги моско...               -1   
2879  процент com московской роста акции подорожали ...                1   
322   reuters рынок акций индекс процен