In [1]:
!pip install datasets



In [2]:
!pip install transformers[torch] accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.1
    Uninstalling transformers-4.56.1:
      Successfully uninstalled transformers-4.56.1
Successfully installed transformers-4.56.2


In [3]:
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # классический SVM работает достаточно долго, воспользуемся линейным
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset #для более удобной работы с датасетами и правильного кодирования данных
from transformers import pipeline
import torch

In [5]:
# начем с того, что загрузим данные
data_train=pd.read_csv('/content/drive/MyDrive/Обучение/Кластеризация и классификация текстов/twitter_training.csv',header=None )
data_test=pd.read_csv('/content/drive/MyDrive/Обучение/Кластеризация и классификация текстов/twitter_validation.csv', header=None)

In [6]:
data_train.columns = ["id", "object", "sentiment", "text"]
data_test.columns = ["id", "object", "sentiment", "text"]

In [7]:
data_train.head()

Unnamed: 0,id,object,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [9]:
data_train = data_train[data_train['sentiment'].isin(['Positive', 'Negative'])]
data_test = data_test[data_test['sentiment'].isin(['Positive', 'Negative'])]

In [12]:
# уберем из работы все "нулевые" тексты
data_train=data_train[data_train['text'].str.len()!=0]

In [13]:
# Функция для предобработки текста
def preprocess_text(text):
    # Приведение текста к нижнему регистру
    text = str(text)
    text = text.lower()
    # Можно добавить другие шаги предобработки, например удаление стоп-слов
    return text



# Применение предобработки к тренировочным и тестовым данным
data_train['text'] = data_train['text'].apply(preprocess_text)
data_test['text'] = data_test['text'].apply(preprocess_text)


print(data_train['text'])

0        im getting on borderlands and i will murder yo...
1        i am coming to the borders and i will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    just realized that the windows partition of my...
74678    just realized that my mac window partition is ...
74679    just realized the windows partition of my mac ...
74680    just realized between the windows partition of...
74681    just like the windows partition of my mac is l...
Name: text, Length: 43374, dtype: object


In [14]:
# Разделение тренировочных данных на тренировочную и валидационную части
X_train, X_val, y_train, y_val = train_test_split(data_train['text'], data_train['sentiment'], test_size=0.2, random_state=42)

# Создание и обучение модели SVM
svm_model = make_pipeline(TfidfVectorizer(), LinearSVC())
svm_model.fit(X_train, y_train)

# Оценка модели на валидационном наборе
y_pred_svm = svm_model.predict(X_val)
print("SVM Classification Report:")
print(classification_report(y_val, y_pred_svm))

SVM Classification Report:
              precision    recall  f1-score   support

    Negative       0.94      0.93      0.93      4577
    Positive       0.93      0.93      0.93      4098

    accuracy                           0.93      8675
   macro avg       0.93      0.93      0.93      8675
weighted avg       0.93      0.93      0.93      8675



In [15]:
# Применение SVM модели
y_test_pred_svm = svm_model.predict(data_test['text'])
data_test['sentiment_pred_svm'] = y_test_pred_svm

data_test.head(10)

Unnamed: 0,id,object,sentiment,text,sentiment_pred_svm
2,8312,Microsoft,Negative,@microsoft why do i pay for word when it funct...,Negative
3,4371,CS-GO,Negative,"csgo matchmaking is so full of closet hacking,...",Negative
5,6273,FIFA,Negative,hi @eahelp i’ve had madeleine mccann in my cel...,Negative
6,7925,MaddenNFL,Positive,thank you @eamaddennfl!! \n\nnew te austin hoo...,Positive
7,11332,TomClancysRainbowSix,Positive,"rocket league, sea of thieves or rainbow six: ...",Positive
8,1107,AssassinsCreed,Positive,my ass still knee-deep in assassins creed odys...,Positive
9,2069,CallOfDuty,Negative,fix it jesus ! please fix it ! what in the wor...,Negative
10,3185,Dota2,Positive,the professional dota 2 scene is fucking explo...,Positive
11,1172,AssassinsCreed,Positive,itching to assassinate \n\n#tccgif #assassinsc...,Positive
12,11783,Verizon,Negative,"@fredtjoseph hey fred, comcast cut the cable a...",Negative


In [16]:
# Создание и обучение модели Naive Bayes
nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
nb_model.fit(X_train, y_train)

# Оценка модели на валидационном наборе
y_pred_nb = nb_model.predict(X_val)
print("Naive Bayes Classification Report:")
print(classification_report(y_val, y_pred_nb))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.92      0.90      4577
    Positive       0.91      0.86      0.88      4098

    accuracy                           0.89      8675
   macro avg       0.89      0.89      0.89      8675
weighted avg       0.89      0.89      0.89      8675



In [17]:
# Применение Naive Bayes модели
y_test_pred_nb = nb_model.predict(data_test['text'])
data_test['sentiment_pred_nb'] = y_test_pred_nb

data_test.head(10)

Unnamed: 0,id,object,sentiment,text,sentiment_pred_svm,sentiment_pred_nb
2,8312,Microsoft,Negative,@microsoft why do i pay for word when it funct...,Negative,Negative
3,4371,CS-GO,Negative,"csgo matchmaking is so full of closet hacking,...",Negative,Negative
5,6273,FIFA,Negative,hi @eahelp i’ve had madeleine mccann in my cel...,Negative,Negative
6,7925,MaddenNFL,Positive,thank you @eamaddennfl!! \n\nnew te austin hoo...,Positive,Positive
7,11332,TomClancysRainbowSix,Positive,"rocket league, sea of thieves or rainbow six: ...",Positive,Positive
8,1107,AssassinsCreed,Positive,my ass still knee-deep in assassins creed odys...,Positive,Positive
9,2069,CallOfDuty,Negative,fix it jesus ! please fix it ! what in the wor...,Negative,Negative
10,3185,Dota2,Positive,the professional dota 2 scene is fucking explo...,Positive,Positive
11,1172,AssassinsCreed,Positive,itching to assassinate \n\n#tccgif #assassinsc...,Positive,Positive
12,11783,Verizon,Negative,"@fredtjoseph hey fred, comcast cut the cable a...",Negative,Negative
