In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import string
import re

In [2]:
df = pd.read_csv('tested_data.csv') #read csv

In [3]:
y = df['y'] # get y value
x = df.drop('y',axis=1) # get x value

In [4]:
count = CountVectorizer(lowercase=False) # get CountVectorizer
X = count.fit_transform(x['value'], y) # transform X with COuntVectorizer
vect= TfidfTransformer() # get TfidTransformer
X = vect.fit_transform(X) # transform X with TfidTransformer

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3) # split to train and test data

In [6]:
clf = GaussianNB() # get classifier
clf.fit(x_train.toarray(), y_train) # train our model
clf.score(x_test.toarray(), y_test) # score our model

0.8503401360544217

In [7]:
clf.fit(X.toarray(), y) #train our model with all data that we have

GaussianNB(priors=None)

In [8]:
r = requests.get('https://habr.com/company/mobio/blog/414067/') #get request from habr
soup = BeautifulSoup(r.text, 'html5lib')  # get html text from request
doc = {}
doc['text'] = soup.find("div", {"class": "post__text"}).text # get text 

In [9]:
text = doc['text']

In [10]:
stop_word = get_stop_words('russian') # get stop_words for russian language
punctuation = string.punctuation # get punctuation
dob = '»—«qwertyuiopasdfghjklzxcvbnmQWERTYUIOPADFGHJKLZXCVBNMS' #get rid of additional punctuation and english characters
punctuation = ''.join((punctuation, dob)) # add new punctuation to punctuation
stemmer = SnowballStemmer('russian') #get russian stemmer

In [11]:
def preprocessing(dfvalue):
    tokenization = word_tokenize(dfvalue) # tokenize value
    lower = [[word.lower() for word in text.split()] for text in tokenization] #lower all characters
    text_punctuation = [[item.translate(str.maketrans('','', punctuation)) for item in word] for word in lower] # remove punctuation
    number = [[[item.translate(str.maketrans('', '', '1234567890')) for item in z1] for z1 in z2] for z2 in text_punctuation] #remove numbers
    sentence = [[''.join(item) for item in z1] for z1 in number] #convert list to string
    cleaned_text_list = [item for item in sentence if item != ['']] #remove empty values from list
    cleaned_text = [''.join(qwe) for qwe in cleaned_text_list] #convert list to string
    stop_text =  [qwe for qwe in cleaned_text if qwe not in stop_word] #remove stop words
    stemmed = [stemmer.stem(item) for item in stop_text] #reduce inflectional forms 
    notlist_stem = [''.join(qwe) for qwe in stemmed] #convert list to string
    return notlist_stem

In [12]:
tex  = preprocessing(text) # preprocess our text

In [13]:
tex = count.transform(tex) # transform our text with countvectorizer
tex = vect.transform(tex) # transform our text with tfidtransformer

In [15]:
clf.predict(tex.toarray()) # predict our text

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,