In [1]:
import string

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /home/almaz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/almaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer("russian")

In [37]:
allowed_symbols = (
        string.ascii_lowercase +
        " -+%.," +
        string.digits +
        "абвгдежзийклмнопрстуфхцчшщъыьэюя"
)
def getClearText(text: str) -> str:
    cleared_text = ""
    for letter in text:
        if letter in allowed_symbols:
            cleared_text += letter

    return cleared_text

replace_dict = {
    " +": " плюс ",
    " -": " минус ",
    "%": " процент "
}
def replaceSymbols(text: str) -> str:
    new_text = text
    for key, value in replace_dict.items():
        new_text = new_text.replace(key, value)
    return new_text

In [38]:
random_state = 3
test_size = 0.05

In [47]:
df = pd.read_csv("database_some.csv")
df_texts = df.Text

# Continuous (**optional**)

In [58]:
values = [(value + 1) / 2 for value in df.Value]

# Binary (**optional**)

In [48]:
values = np.array([-1 if value < 0 else 1 for value in df.Value])

In [41]:
vectorizer = CountVectorizer(stop_words=list(stop_words))

In [59]:
vectorizer = TfidfVectorizer(stop_words=list(stop_words))

In [60]:
cleared = []
for text in df_texts:
    text = text.lower()
    cleared_text = getClearText(text)
    cleared_text = replaceSymbols(cleared_text)
    cleared.append(cleared_text)

vectorized = vectorizer.fit_transform(cleared)

# Отображение коэффициентов для слов

In [61]:
df_vectorized = pd.DataFrame(vectorized[0].T.todense(),
                  index=vectorizer.get_feature_names_out(), columns=["TF-IDF"])
df_vectorized = df_vectorized.sort_values('TF-IDF', ascending=False)
df_vectorized.head(20)

(166,)


Unnamed: 0,TF-IDF
отметке,0.324247
оао,0.264829
процент,0.237364
закрывшись,0.197419
мосбиржи,0.183469
900,0.177425
отметки,0.156146
котировки,0.152221
000,0.144236
индекс,0.134822


# Continuous values (**optional**)

In [73]:
num_classes = 3  # Match the number of classes in the data
bin_edges = np.linspace(0, 1, num_classes + 1)  # Create bin edges
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2  # Compute bin centers

# Digitize values into bins
values = np.digitize(values, bins=bin_edges, right=True) - 1  # Adjust to 0-based indexing

In [74]:
x_train, x_test, y_train, y_test = train_test_split(
    vectorized,
    values,
    test_size=test_size,
    random_state=random_state
)

# so we can get original text but not tokenized
x_original_train, x_original_test, _, _ = train_test_split(
    df_texts,
    values,
    test_size=test_size,
    random_state=random_state
)

In [75]:
model = LogisticRegression(
    random_state=random_state,
    max_iter=1900 # 1900
)
model.fit(x_train, y_train)

In [76]:
x_test_texts = vectorizer.inverse_transform(x_test)  # List of tokenized words per row

# Вариант для continuous values

In [77]:
# Predict probabilities
y_probs = model.predict_proba(x_test)

# Map probabilities back to continuous values
y_pred_continuous = np.dot(y_probs, bin_centers)

# Convert predictions back to original scale [-1, 1]
y_predict = [2 * value - 1 for value in y_pred_continuous]

# Вариант для бинарный values

In [56]:
# Predict probabilities
y_probs = model.predict_proba(x_test)
y_predict = model.predict(x_test)

In [67]:
# Combine x_test with predictions into a DataFrame
result_df = pd.DataFrame({
    "OriginalText": x_original_test,
    "VectorizedText": [" ".join(text) for text in x_test_texts],
    "Predicted_Value": y_predict,
    "ActualValue": y_test
})

result_df.to_csv("predicted.csv")

print(result_df)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

                                           OriginalText  \
1464  © Reuters. Рынок акций Московской биржи по сос...   
2518  © Reuters\nInvesting.com — Со вторника, 17 сен...   
4634  © Reuters. Рынок акций Московской биржи по сос...   
521   © Reuters. Рынок акций Московской биржи по сос...   
203   © Reuters. Рынок акций Московской биржи по сос...   
...                                                 ...   
519   © Reuters. Рынок акций Московской биржи по сос...   
261   © Reuters. Рынок акций Московской биржи по сос...   
884   © Reuters. Рынок акций Московской биржи по сос...   
382   © Reuters. Рынок акций Московской биржи по сос...   
115   В понедельник, 10 октября, ожидаются выплаты к...   

                                         VectorizedText  Predicted_Value  \
1464  reuters рынок акций индекс процент торги моско...         0.371498   
2518  процент com уровне 23 21 вторника 20 17 рынка ...         0.035911   
4634  reuters рынок акций индекс 46 процент торги мо...        

ValueError: Expected 2D array, got scalar array instead:
array=акции сбербанка выросли на +50%.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.