In [1]:
import os
# import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import numpy as np

In [2]:
def load_data(directory):
    reviews = []
    labels = []
    
    for label in ['pos', 'neg']:
        label_dir = os.path.join(directory, label)
        for filename in os.listdir(label_dir):
            if filename.endswith('.txt'):
                with open(os.path.join(label_dir, filename), 'r', encoding='utf-8') as file:
                    reviews.append(file.read())
                    labels.append(1 if label == 'pos' else 0)  # 1 для положительных, 0 для отрицательных

    return reviews, labels

In [3]:
train_reviews, train_labels = load_data('/kaggle/input/greenatom-dataset/test/test')
test_reviews, test_labels = load_data('/kaggle/input/greenatom-dataset/train/train')

In [4]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(train_reviews)
X_test_vectorized = vectorizer.transform(test_reviews)

In [5]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, train_labels)

In [6]:
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(test_labels, y_pred)

In [7]:
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(test_labels, y_pred))

Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.85      0.88      0.87     12500
           1       0.88      0.85      0.86     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [8]:
import numpy as np

In [9]:
simple_review = ["An absolute masterpiece! The acting was phenomenal, and the storyline kept me hooked from beginning to end. A must-watch!",
                 "I loved every moment of this film. The visuals were stunning, and the characters felt so real. It’s one of the best movies I’ve seen in a long time.",
                 "This movie exceeded my expectations. The direction was brilliant, and the emotional depth of the plot was incredible. Highly recommend it!",
                 "I found the movie to be a complete disappointment. The plot was predictable, and the characters were poorly developed.",
                 "What a waste of time! The film was boring and dragged on with no real climax. I wouldn’t recommend watching it.",
                 "Terrible acting and a weak storyline. I couldn’t wait for the movie to end. Definitely not worth the hype."]

for i in simple_review:
    print(i)
    vectorized_simple_review = vectorizer.transform([i])
    predict = model.predict(vectorized_simple_review)
    predict_probablities = model.predict_proba(vectorized_simple_review)
    print(f"Вероятность принадлежности к классу {predict[0]} ({'Положительный' if predict == 1 else 'Отрицательный'}) : {round(np.max(predict_probablities)*100)}% \n")

An absolute masterpiece! The acting was phenomenal, and the storyline kept me hooked from beginning to end. A must-watch!
Вероятность принадлежности к классу 1 (Положительный) : 94% 

I loved every moment of this film. The visuals were stunning, and the characters felt so real. It’s one of the best movies I’ve seen in a long time.
Вероятность принадлежности к классу 1 (Положительный) : 95% 

This movie exceeded my expectations. The direction was brilliant, and the emotional depth of the plot was incredible. Highly recommend it!
Вероятность принадлежности к классу 1 (Положительный) : 97% 

I found the movie to be a complete disappointment. The plot was predictable, and the characters were poorly developed.
Вероятность принадлежности к классу 0 (Отрицательный) : 99% 

What a waste of time! The film was boring and dragged on with no real climax. I wouldn’t recommend watching it.
Вероятность принадлежности к классу 0 (Отрицательный) : 98% 

Terrible acting and a weak storyline. I couldn’t 