# Задание 1

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings("ignore")
import os
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
import spacy
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
class BbcDataset(object):
    def __init__(self):
        self.DATA_DIR = "bbc/"
        self.classes = os.listdir(self.DATA_DIR)
        self.n_documents = len([os.path.join(path, name) for path, subdirectory, files in os.walk(self.DATA_DIR) for name in files])

    def get_dataset(self):
        data = []
        for file in [os.path.join(path, name) for path, subdirectory, files in os.walk(self.DATA_DIR) for name in files]:
            label = file.split("/")[-2]
            content = " ".join(open(file, encoding='utf-8', errors='ignore').read().splitlines())
            data.append((content, label))
        return data

    def get_pandas_alike_dataset(self):
        data = self.get_dataset()
        return pd.DataFrame(data, columns=["text", "label"])

In [None]:
df = BbcDataset().get_pandas_alike_dataset()
df = df[df.label != "bbc"]
df['label'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: label, dtype: int64

In [None]:
class DataPreproces(object):
    def __init__(self, df, df_text):
        self.df = df
        self.df_text = df_text
        self.regex = re.compile("[A-Za-z]+")
        self.mystopwords = stopwords.words('english') 

    def words_only(self, text):
        return " ".join(self.regex.findall(text))

    def remove_stopwords(self, text):
        try:
            return " ".join([token for token in text.split() if not token in self.mystopwords])
        except:
            return ""

    def parse_filter_document(self, text):
        filtered_doc = []
        for token in text:
            if token.is_stop == False | token.is_punct == False | token.is_space == False:
                if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV']:
                  filtered_doc.append(token.lemma_)

        return ' '.join(filtered_doc)
    
    def final_dataset(self):
        self.df_text = self.df_text.str.lower()
        self.df_text = self.df_text.apply(self.words_only)
        self.df_text = self.df_text.apply(self.remove_stopwords)
        
        nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])
        
        self.df['processed_text'] = self.df_text.apply(lambda x: self.parse_filter_document(nlp(x)))
        
        data = self.df[['processed_text','label']]

        enc = LabelEncoder()
        data['label'] = enc.fit_transform(data['label'])
        
        return data

In [None]:
data = DataPreproces(df, df.text).final_dataset()
data

Unnamed: 0,processed_text,label
1,election deal falter heath role tory fail hold...,2
2,stress gap public trust handle economy restore...,2
3,stalemate pension strike talk talk aim avert n...,2
4,tory candidate quit remark conservative electi...,2
5,act detention rule urge government act quickly...,2
...,...,...
2221,stormy year property insurer stre storm typhoo...,0
2222,german growth go reverse economy shrink month ...,0
2223,oil company russian setback international oil ...,0
2224,boss payout director agree pay include pocket ...,0


In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(data['processed_text'],data['label'], test_size=0.2, shuffle=True)

X_train, X_val, y_train, y_val = train_test_split(data['processed_text'],data['label'], test_size=0.1, shuffle=True)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape)


(2002,) (2002,) (445,) (445,) (223,) (223,)


## Часть 1. Классические алгоритмы машинного обучения

In [None]:
mystopwords = stopwords.words('english')

tfidf = TfidfVectorizer(stop_words=mystopwords)

X_train_vec = tfidf.fit_transform(X_train).toarray()

X_test_vec = tfidf.transform(X_test).toarray()

X_val_vec = tfidf.transform(X_val).toarray()


In [None]:
def Model(models, X_train, X_test, X_val, y_train, y_test, y_val):    

    for key in models.keys():
    
        model = models[key]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        
        print(key)
        print("Train quality: \n")
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_train, y_train_pred), 
                                                                                      precision_score(y_train, y_train_pred, average='macro'), 
                                                                                      recall_score(y_train, y_train_pred, average='macro'), 
                                                                                      f1_score(y_train, y_train_pred, average='macro')))
        print("Test quality: ")
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_test, y_pred), 
                                                                                      precision_score(y_test, y_pred, average='macro'), 
                                                                                      recall_score(y_test, y_pred, average='macro'), 
                                                                                      f1_score(y_test, y_pred, average='macro')))
        print("Validation quality: ")
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_val, y_val_pred), 
                                                                                      precision_score(y_val, y_val_pred, average='macro'), 
                                                                                      recall_score(y_val, y_val_pred, average='macro'), 
                                                                                      f1_score(y_val, y_val_pred, average='macro')))
        print("_______________________\n")
 
    return

models = {
    'DecisionTree': DecisionTreeClassifier(criterion='gini', max_depth=100, min_samples_split= 2),  
    'LogisticRegression': LogisticRegression(), 
    'RandomForestClassifier': RandomForestClassifier(criterion='entropy', max_depth= 50, n_estimators= 150)
}

Model(models, X_train_vec, X_test_vec, X_val_vec, y_train, y_test, y_val)

 

DecisionTree
Train quality: 

Accuracy: 1.000 	Precision: 1.000 	Recall: 1.000 		F1: 1.000

Test quality: 
Accuracy: 0.978 	Precision: 0.977 	Recall: 0.977 		F1: 0.977

Validation quality: 
Accuracy: 0.812 	Precision: 0.808 	Recall: 0.809 		F1: 0.806

_______________________

LogisticRegression
Train quality: 

Accuracy: 0.995 	Precision: 0.995 	Recall: 0.994 		F1: 0.994

Test quality: 
Accuracy: 0.993 	Precision: 0.993 	Recall: 0.993 		F1: 0.993

Validation quality: 
Accuracy: 0.951 	Precision: 0.954 	Recall: 0.953 		F1: 0.953

_______________________

RandomForestClassifier
Train quality: 

Accuracy: 1.000 	Precision: 1.000 	Recall: 1.000 		F1: 1.000

Test quality: 
Accuracy: 0.996 	Precision: 0.995 	Recall: 0.995 		F1: 0.995

Validation quality: 
Accuracy: 0.942 	Precision: 0.947 	Recall: 0.940 		F1: 0.942

_______________________



## Часть 2. DistilBert

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"


tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer.max_model_input_sizes

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [None]:
train_encodings = tokenizer(X_train.to_list(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.to_list(), truncation=True, padding=True)
val_encodings = tokenizer(X_val.to_list(), truncation=True, padding=True)


In [None]:
class BBC_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = BBC_Dataset(train_encodings, y_train.to_list())
test_dataset = BBC_Dataset(test_encodings, y_test.to_list())
val_dataset = BBC_Dataset(val_encodings, y_val.to_list())


In [None]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5).to(device)


Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


training_args = TrainingArguments(
    output_dir='./outputs',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy="steps",
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,                       
    args=training_args,     
    train_dataset=train_dataset,         
    eval_dataset=test_dataset,
    compute_metrics= compute_metrics
)


Using cuda_amp half precision backend


In [None]:
trainer.train()
trainer.save_model("bbc_model")


***** Running training *****
  Num examples = 2002
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 753


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.7916,0.260704,0.937079,0.934699,0.93909,0.934479
100,0.2027,0.21167,0.955056,0.952721,0.95692,0.952044
150,0.1671,0.137401,0.966292,0.966388,0.968643,0.965825
200,0.2117,0.148852,0.968539,0.967547,0.969053,0.96735
250,0.1877,0.138316,0.973034,0.972543,0.975,0.971513
300,0.0413,0.094329,0.98427,0.983619,0.984079,0.983524
350,0.0653,0.124083,0.977528,0.976719,0.977372,0.97718
400,0.0483,0.03442,0.993258,0.993095,0.992864,0.993412
450,0.0867,0.04304,0.993258,0.993095,0.992864,0.993412
500,0.072,0.051138,0.988764,0.988847,0.98887,0.989017


***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 445
  Batch siz

In [None]:
q=[trainer.evaluate(eval_dataset=data) for data in [train_dataset, test_dataset, val_dataset]]
pd.DataFrame(q, index=["train","test","val"]).iloc[:,:5]

***** Running Evaluation *****
  Num examples = 2002
  Batch size = 4


***** Running Evaluation *****
  Num examples = 445
  Batch size = 4
***** Running Evaluation *****
  Num examples = 223
  Batch size = 4


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.009652,0.998002,0.998028,0.998032,0.998027
test,0.022685,0.993258,0.993095,0.992864,0.993412
val,0.10215,0.973094,0.972363,0.972365,0.972461


## Вывод

Подход DistilBert отрабатывает лучше классических методов машинного обучения, практически безошибочно предсказывая класс выборки test. На validation качество хуже, но все равно выше, чем у классических методов. 