In [2]:
import sqlite3
from GoogleNews import GoogleNews
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import time
from string import punctuation, digits
import pymorphy2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_selection import SelectPercentile, SelectFpr, f_classif, chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB, BernoulliNB

<h1> Making queries and defining functions </h1>

In [None]:
# Notebook-level GoogleNews client; news_script reads this global.
googlenews = GoogleNews()

# Search phrases passed to news_script, one per line; each ends with 'цены'.
queries = [
    'урал цены',
    'свердловская область цены',
    'курганская область цены',
    'пермский край цены',
    'оренбургская область цены',
    "республика башкортостан цены",
    'челябинская область цены',
    'тюменская область цены',
    'ханты-мансийский автономный округ цены',
    'ямало-ненецкий автономный округ цены',
]

In [None]:
def news_script(queries, pages=14, delay=600, client=None):
    """Collect Google News results for every query into one DataFrame.

    Parameters
    ----------
    queries : iterable of str
        Search phrases; each one is searched separately.
    pages : int, default 14
        Number of result pages requested per query (pages 1..``pages``).
    delay : float, default 600
        Seconds to wait between two consecutive queries. No wait happens
        after the last query.
    client : object, optional
        GoogleNews-like client exposing ``clear``, ``set_lang``,
        ``set_time_range``, ``set_encode``, ``get_news``, ``search`` and
        ``page_at``. Defaults to the notebook-level ``googlenews`` instance.

    Returns
    -------
    pandas.DataFrame
        One row per result dict returned by ``client.page_at``, with an
        extra ``query`` column holding the phrase that produced it.
    """
    if client is None:
        client = googlenews

    collected = []

    for position, query in enumerate(queries):
        # Pause only *between* queries; sleeping after the final one would
        # just delay the return value.
        if position and delay:
            time.sleep(delay)

        client.clear()
        client.set_lang('ru')
        client.set_time_range('01/01/2015', '09/15/2021')
        client.set_encode('utf-8')
        # NOTE(review): nothing below reads what get_news() stores; only the
        # page_at() return values are collected. Kept to preserve the original
        # request sequence - confirm whether this call is needed.
        client.get_news(query)
        client.search(query)

        for page in range(1, pages + 1):
            page_results = client.page_at(page) or []
            for node in page_results:
                node['query'] = query  # remember which phrase found this item
            collected.extend(page_results)

    return pd.DataFrame(collected)


def save_news(df, path='news_data.csv'):
    """Write the news DataFrame to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    path : str or path-like, default 'news_data.csv'
        Destination file. The frame index is written as the first column
        (pandas default), so read it back with ``index_col=0``.
    """
    df.to_csv(path)
        

In [3]:
# Open the SQLite database file 'news.db' (sqlite3 creates it if missing);
# the connection is shared by the to_sql / read_sql cells below.
conn = sqlite3.connect('news.db')

In [None]:
# NOTE(review): `df1` is not defined in any visible cell - presumably the frame
# returned by news_script(queries); confirm before running on a fresh kernel.
# to_sql keeps its default if_exists='fail', so this raises once NEWS exists.
df1.to_sql('NEWS', conn)

In [4]:
# Load the whole NEWS table back into a DataFrame.
news = pd.read_sql('SELECT * FROM NEWS', conn)

<h3> Preprocessing data </h3>

In [3]:
def tokenize(news):
    '''Tokenize news texts and drop punctuation, numbers and stopwords.

    Parameters
    ----------
    news : iterable of str
        Raw texts (for example a pandas Series of titles). ``None`` and
        float NaN entries are treated as empty texts instead of raising.

    Returns
    -------
    list of list of str
        One list of lower-cased tokens per input text, in input order.
    '''
    stop_words = set(stopwords.words('russian'))  # set gives O(1) membership tests
    strip_punctuation = str.maketrans('', '', punctuation)
    special_signs = "«»–—№“”"

    tokens_news = []
    for instance in news:
        # Empty spreadsheet cells arrive as None / float NaN; iterating them
        # would raise TypeError, so treat them as an empty document.
        if instance is None or (isinstance(instance, float) and instance != instance):
            instance = ''

        # NOTE(review): punctuation is deleted rather than replaced by a space,
        # so "13-литровый" becomes the single token "13литровый" (such tokens
        # are visible in the TF-IDF vocabulary). Left unchanged on purpose.
        without_punctuation = instance.translate(strip_punctuation)
        lowered_tokenized_text = word_tokenize(without_punctuation.lower())

        # Keep a token unless it is a stopword, a pure number, or a substring
        # of special_signs (i.e. one of those signs standing alone).
        tokens_news.append([
            word for word in lowered_tokenized_text
            if word not in stop_words
            and word not in special_signs
            and not word.isdigit()
        ])

    return tokens_news


def lemmitizing_and_stemming(tokens_list):
    """Produce lemmatized and stemmed variants of tokenized texts.

    Input - list of token lists (see the tokenize function).
    Returns a dict with two lists parallel to the input:
      'lemmitized' - normal form of the first pymorphy2 parse of each token;
      'stemmed'    - Russian Snowball stem of each token.
    """
    morph = pymorphy2.MorphAnalyzer()
    snowball = SnowballStemmer('russian')

    def _stems(words):
        return [snowball.stem(w) for w in words]

    def _lemmas(words):
        return [morph.parse(w)[0].normal_form for w in words]

    # Single pass over the input so every token list is consumed exactly once.
    pairs = [(_stems(words), _lemmas(words)) for words in tokens_list]

    return {
        'lemmitized': [lemmas for _, lemmas in pairs],
        'stemmed': [stems for stems, _ in pairs],
    }
    

def clearing_text(final_list):
    """Join each token list back into one space-separated string.

    Input - iterable of token lists.
    Returns a list of strings, one per token list, in the same order.
    """
    return [' '.join(tokens) for tokens in final_list]


# def vocabulary_tfidf_matrix(cleared_news):
    
#     corpus = cleared_news
#     vectorizer = TfidfVectorizer(smooth_idf=False)
#     X = vectorizer.fit_transform(corpus)
#     vocabulary = vectorizer.get_feature_names()
#     dt = pd.DataFrame(data=X.toarray(), columns=vocabulary)
    
#     return X, vocabulary, dt

<h3> Applying preprocessing, normalization and vocabulary selection to the data </h3>

In [12]:
def preprocessing_data(source='test.xlsx'):
    """Turn labelled news into cleaned text documents plus their labels.

    Parameters
    ----------
    source : str, path-like or pandas.DataFrame, default 'test.xlsx'
        Either an Excel file to read or an already loaded frame. It must
        contain the columns 'title', 'desc' and 'Type of news'.

    Returns
    -------
    data : pandas.Series of str
        For every row: lemmatized description, a space, lemmatized title.
        Shares the index of the source frame.
    labels : pandas.Series
        The 'Type of news' column of the source frame.
    """
    frame = source if isinstance(source, pd.DataFrame) else pd.read_excel(source)
    titles, descs, labels = frame['title'], frame['desc'], frame['Type of news']

    # Only the 'lemmitized' variant is used below; the stemmed one is discarded.
    final_titles = lemmitizing_and_stemming(tokenize(titles))
    final_descriptions = lemmitizing_and_stemming(tokenize(descs))

    # Reuse the frame's index so data and labels stay aligned even when a
    # caller passes a frame whose index is not 0..n-1.
    descriptions_cleared = pd.Series(clearing_text(final_descriptions['lemmitized']),
                                     index=frame.index)
    titles_cleared = pd.Series(clearing_text(final_titles['lemmitized']),
                               index=frame.index)

    data = descriptions_cleared + " " + titles_cleared

    return data, labels


def normalizing_data(train_text, test_text):
    """Convert train/test texts into TF-IDF feature frames.

    The vectorizer is fitted on ``train_text`` only; ``test_text`` is
    projected onto the same vocabulary, so both frames share their columns.

    Parameters
    ----------
    train_text, test_text : iterable of str
        Documents of the training and test split.

    Returns
    -------
    dt_train, dt_test : pandas.DataFrame
        Dense TF-IDF matrices with one column per vocabulary term and a
        fresh 0..n-1 row index.
    """
    vectorizer = TfidfVectorizer(smooth_idf=False)

    train_matrix = vectorizer.fit_transform(train_text)
    test_matrix = vectorizer.transform(test_text)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    dt_train = pd.DataFrame(data=train_matrix.toarray(), columns=vocabulary)
    dt_test = pd.DataFrame(data=test_matrix.toarray(), columns=vocabulary)

    return dt_train, dt_test


def selecting_vocabulary(dt_train, dt_test, train_labels, alpha=0.1):
    """Keep only the feature columns selected by a false-positive-rate test.

    A ``SelectFpr(f_classif)`` selector is fitted on the training split and
    the resulting column mask is applied to both frames.

    Parameters
    ----------
    dt_train, dt_test : pandas.DataFrame
        Feature frames with identical columns in identical order.
    train_labels : array-like
        Class labels for the rows of ``dt_train``.
    alpha : float, default 0.1
        Threshold passed to ``SelectFpr``.

    Returns
    -------
    dt_train, dt_test : pandas.DataFrame
        The input frames restricted to the selected columns.
    """
    # Alternative tried earlier: SelectPercentile(chi2, percentile=10).
    selector = SelectFpr(f_classif, alpha=alpha)
    selector.fit(dt_train, train_labels)  # only the mask is needed, not the transformed array

    keep = np.asarray(selector.get_support())  # boolean mask, one entry per column

    return dt_train.loc[:, keep], dt_test.loc[:, keep]

<h1> Training Model </h1>

In [5]:
# Load the labelled news sample ('Type of news' holds the class) and display it.
test = pd.read_excel('test.xlsx')
test

Unnamed: 0,title,media,date,datetime,desc,Type of news
0,"""Урал"" выпустил мотоцикл по цене нового Toyota...",Российская газета,"Jan 8, 2021",44204.00000,"У ""Урала"" есть ответ: выпустить такую машину, ...",-1
1,В Екатеринбурге открылся центр «Эрмитаж-Урал»....,It's My City,"Jul 2, 2021",44379.00000,"It's My City показывает, как выглядит уральски...",0
2,Ксению Собчак поразили цены на российские мото...,Motonews.ru,"Jan 10, 2021",44206.00000,"""А вы знали, что мотоцикл Урал стоит почти 900...",-1
3,Рост цен на продукты обгоняет доходы. Выгодно ...,АиФ Урал,"Mar 31, 2021",44286.00000,Годовая инфляция на Среднем Урале в феврале 20...,-1
4,"Второй серийный: атомный ледокол ""Урал"" спусти...",Вести.ру,"May 25, 2019",43610.00000,По данным британского издания The Financial Ti...,0
...,...,...,...,...,...,...
363,Шумков попросил ФАС сдержать рост цен на проду...,Правда УрФО,6 days ago,44449.75163,При этом в Курганской области не выросла аренд...,-1
364,В правительстве хотят ограничить цены на масло...,Znak.com,"Dec 10, 2020",44175.00000,В России власти обсуждают с производителями по...,1
365,Свердловскстат сообщил о росте зарплат в Курга...,45.RU,"Feb 2, 2021",44229.00000,"Продукты в области подорожали на 8,2%, больше ...",-1
366,Путин объяснил рост цен на продукты в России,URA.RU,"May 21, 2021",44337.00000,«Сейчас на фоне нестабильной мировой конъюнкту...,-1


In [6]:
# Build the cleaned text documents and their labels (reads test.xlsx by default).
data, labels = preprocessing_data()

In [7]:
# Show the combined "description + title" lemmatized text, one string per news item.
data

0      урал ответ выпустить такой машина который стои...
1      its my city показывать выглядеть уральский спу...
2      знать мотоцикл урал стоить 900к написать ксени...
3      годовой инфляция среднее урал февраль год срав...
4      данные британский издание the financial times ...
                             ...                        
363    курганский область вырасти арендный плата лесо...
364    россия власть обсуждать производитель подсолне...
365    продукт область подорожать прибавить цена круп...
366    фон нестабильный мировой конъюнктура обострять...
367    курганский область год урожай зерновые оказать...
Length: 368, dtype: object

In [8]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
train_text, test_text, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=101)

In [9]:
# Fit TF-IDF on the training texts only and project both splits onto that vocabulary.
dt_train, dt_test = normalizing_data(train_text, test_text)

In [10]:
# Inspect the training TF-IDF frame (rows: documents, columns: vocabulary terms).
dt_train

Unnamed: 0,10,12,13литровый,147обсудить,15,17,18дюймовый,20,200летний,2020го,...,эффект,юбилейный,югра,южный,являться,яйцо,январь,яндекс,ярмарка,яромир
0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0.19151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Restrict both splits to the columns SelectFpr keeps on the training data.
# NOTE(review): this rebinds dt_train/dt_test, so re-running the cell selects
# from the already reduced frames - re-run the normalizing cell first.
dt_train, dt_test = selecting_vocabulary(dt_train, dt_test, train_labels)

In [14]:
# Inspect the training frame after feature selection.
dt_train

Unnamed: 0,ural,абонемент,автомобиль,антимонопольный,аренда,аукцион,ашан,базовый,балашиха,банк,...,участок,федеральный,фиксировать,характер,ход,цена,шадринск,эксперт,экспортный,являться
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.107434,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.043293,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.052609,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.091146,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.093234,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.047736,0.000000,0.0,0.0,0.0
291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.048252,0.253906,0.0,0.0,0.0
292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.080528,0.000000,0.0,0.0,0.0


In [15]:
# Fit a Gaussian Naive Bayes classifier on the selected TF-IDF features and
# report timings plus the model.score() result on both splits.
# Durations use time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring short intervals (time.time() is wall-clock and can be coarse).
t0 = time.perf_counter()
#model = CategoricalNB()
model = GaussianNB()
model.fit(dt_train, train_labels)
print(f'\nTraining time: {round(time.perf_counter()-t0, 3)}s')

t0 = time.perf_counter()
score_train = model.score(dt_train, train_labels)
print(f"Prediction time (train): {round(time.perf_counter()-t0, 3)}s")


t0 = time.perf_counter()
score_test = model.score(dt_test, test_labels)
print(f'Prediction time (test): {round(time.perf_counter()-t0, 3)}s')

# Predicted labels for the test split, kept for the comparison cells below.
Y_pred = model.predict(dt_test)

print('\nTrain set score:', score_train)
print('Test set score:', score_test)


Training time: 0.013s
Prediction time (train): 0.01s
Prediction time (test): 0.008s

Train set score: 0.9115646258503401
Test set score: 0.5


In [16]:
# Predicted labels for the held-out test split.
Y_pred

array([-1,  0, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1,  1,  0,
       -1,  0, -1,  1, -1, -1,  1, -1,  0,  1, -1, -1,  0,  0, -1, -1, -1,
       -1,  1, -1, -1,  0, -1, -1,  1,  1, -1, -1,  1, -1, -1,  1, -1,  0,
        1,  0,  1, -1, -1,  1, -1, -1,  1,  0,  0,  0,  1, -1, -1, -1,  0,
       -1,  0,  1,  1, -1, -1], dtype=int64)

In [17]:
# True labels of the test split as a NumPy array, for comparison with Y_pred.
np.asarray(test_labels)

array([-1,  0, -1, -1, -1, -1,  0,  1,  1,  0,  1,  0,  1, -1,  1,  1, -1,
        0, -1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  0,  0, -1,  1,  0,
       -1, -1, -1, -1, -1, -1, -1,  1, -1,  0, -1,  1, -1,  1,  1,  1, -1,
       -1, -1,  0,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1,  0,
       -1,  0, -1,  1, -1, -1], dtype=int64)