In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the competition train/test tables from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/ef-msu-2024-comp-2/train.csv')
test_data = pd.read_csv('/kaggle/input/ef-msu-2024-comp-2/test.csv')

# Defensive: replace missing reviews with empty strings.
train_data['Review'] = train_data['Review'].fillna('')
test_data['Review'] = test_data['Review'].fillna('')

# Cast BOTH splits to str so they are preprocessed identically; a stray
# non-string value in the text column would otherwise break tokenization
# for the training split only.
X_train = train_data['Review'].astype(str)
y_train = train_data['label']
X_test = test_data['Review'].astype(str)

# TF-IDF features over 1- and 2-grams. The vectorizer is fitted on the
# training reviews only and then applied unchanged to the test reviews.
#
# norm='l2' (also the TfidfVectorizer default, spelled out here on purpose)
# already scales every document row to unit Euclidean length, so a separate
# sklearn.preprocessing.normalize(..., norm='l2', axis=1) pass afterwards is
# redundant (a no-op up to float rounding) and has been dropped.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), norm='l2')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Base learners, both seeded with the same fixed random_state:
# a 200-tree random forest and a logistic regression limited to 200 iterations.
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=2,
)
lr_model = LogisticRegression(
    max_iter=200,
    random_state=2,
)

# Combine the two learners in a soft-voting ensemble and fit it on the
# TF-IDF training matrix.
base_learners = [('rf', rf_model), ('lr', lr_model)]
voting_model = VotingClassifier(estimators=base_learners, voting='soft')
voting_model.fit(X_train_tfidf, y_train)

# Class probabilities for the test reviews; columns follow the order of
# voting_model.classes_.
y_pred_proba = voting_model.predict_proba(X_test_tfidf)

# Locate the column for label 1 by name instead of assuming it sits at
# position 1. list.index raises ValueError if label 1 is not among the
# fitted classes, which surfaces a label-encoding mismatch immediately.
positive_col = list(voting_model.classes_).index(1)
test_data['probability'] = y_pred_proba[:, positive_col]  # probability of class 1

# Submission file: test ids with their class-1 probability, no index column.
output = test_data[['id', 'probability']]
output.to_csv('submission.csv', index=False)