In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string

import scipy

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm import tqdm

import datetime

import warnings
warnings.filterwarnings('ignore')

%pylab inline


import nltk
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

Populating the interactive namespace from numpy and matplotlib


[nltk_data] Downloading package punkt to /Users/grigoryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
lenta = pd.read_csv('Lenta_news.csv')

In [7]:
lenta = lenta[['Title', 'Date']]

In [8]:
for x in tqdm(range(len(lenta))):
    lenta.iloc[x].Title = lenta.iloc[x].Title.replace('\xa0', ' ')

100%|██████████| 494726/494726 [01:38<00:00, 5039.88it/s]


In [9]:
lenta['Day of week'] = lenta.Date.apply(lambda x: pd.to_datetime(x).strftime('%A'))

In [10]:
stemmer = SnowballStemmer('russian')

In [11]:
def process_text(text):
    return [stemmer.stem(word) for word in word_tokenize(text.lower()) if word not in string.punctuation]

In [12]:
lenta['text'] = lenta['Title'].apply(process_text)

In [13]:
lenta.iloc[298].text

['twitter', 'оцен', 'в', '11', 'миллиард', 'доллар']

In [14]:
lenta['Date'] = lenta.Date.apply(lambda x: pd.to_datetime(x))

In [15]:
train = lenta[lenta.Date < pd.to_datetime('2021-01-01')]
test = lenta[lenta.Date >= pd.to_datetime('2021-01-01')]

In [16]:
test

Unnamed: 0,Title,Date,Day of week,text
480815,"«Мы молились, чтобы просто выбраться оттуда»В ...",2021-01-01,Friday,"[«, мы, мол, чтоб, прост, выбра, оттуд, », в, ..."
480816,В России вступили в силу новые санитарные треб...,2021-01-01,Friday,"[в, росс, вступ, в, сил, нов, санитарн, требов..."
480817,Опубликовано видео новогоднего салюта в Москве,2021-01-01,Friday,"[опубликова, виде, новогодн, салют, в, москв]"
480818,Лукашенко поздравил белорусов с Новым годом вм...,2021-01-01,Friday,"[лукашенк, поздрав, белорус, с, нов, год, вмес..."
480819,Дед Мороз и Снегурочка попали в ДТП в российск...,2021-01-01,Friday,"[дед, мороз, и, снегурочк, попа, в, дтп, в, ро..."
...,...,...,...,...
494721,ФСБ поймала преступников с миллионами рублей и...,2021-03-19,Friday,"[фсб, пойма, преступник, с, миллион, рубл, и, ..."
494722,Лукашенко раскрыл подробности «обещания» Путин...,2021-03-19,Friday,"[лукашенк, раскр, подробн, «, обещан, », путин..."
494723,Nvidia выпустила видеокарту для майнинга,2021-03-19,Friday,"[nvidia, выпуст, видеокарт, для, майнинг]"
494724,Путин утвердил пятерку нового состава ЦИК,2021-03-19,Friday,"[путин, утверд, пятерк, нов, состав, цик]"


In [17]:
def item_to_str(text):
    s = ''
    for i in tqdm(range(len(text))):
        s = s + ' '
        for j in range(len(text[i])):
            s = s + str(text[i][j]) 
    return s[1:]

In [18]:
tokens = item_to_str(lenta.text.apply(lambda x: ' '.join(x))).split()

100%|██████████| 494726/494726 [02:07<00:00, 3886.03it/s]


In [19]:
transform = TfidfVectorizer(ngram_range=(1,1), use_idf = True)
transform = transform.fit(tokens)
train_tfidf = transform.transform(train['text'].apply(lambda x: ' '.join(x)))

In [20]:
dict_idf = dict(zip(transform.get_feature_names(), transform.idf_))

In [21]:
train['tfidf'] = train_tfidf

In [22]:
for x in tqdm(range(len(train))):
    train['tfidf'].iloc[x] = train_tfidf[x]

100%|██████████| 480815/480815 [35:48<00:00, 223.76it/s] 


In [23]:
exp = train.groupby('Date')['tfidf'].sum()

In [24]:
exp = pd.DataFrame(exp)

In [25]:
exp.iloc[5].tfidf

<1x90754 sparse matrix of type '<class 'numpy.float64'>'
	with 184 stored elements in Compressed Sparse Row format>

In [26]:
exp['Date'] = exp.index
exp

Unnamed: 0_level_0,tfidf,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,"(0, 57635)\t0.47159338921666355\n (0, 71120...",2013-01-01
2013-01-02,"(0, 13142)\t0.37380564905483105\n (0, 28882...",2013-01-02
2013-01-03,"(0, 12150)\t0.30733934375925764\n (0, 22720...",2013-01-03
2013-01-04,"(0, 52098)\t0.4721162891228066\n (0, 52560)...",2013-01-04
2013-01-05,"(0, 24728)\t0.5147234323270747\n (0, 56023)...",2013-01-05
...,...,...
2020-12-27,"(0, 71012)\t0.5258598270359802\n (0, 77767)...",2020-12-27
2020-12-28,"(0, 56463)\t0.34610262970485084\n (0, 62880...",2020-12-28
2020-12-29,"(0, 31958)\t0.3790789512660071\n (0, 56765)...",2020-12-29
2020-12-30,"(0, 14560)\t0.39257895274627275\n (0, 30809...",2020-12-30


In [27]:
exp['Day of week'] = exp.Date.apply(lambda x: pd.to_datetime(x).strftime('%A'))

In [28]:
exp.iloc[9]['Day of week']

'Thursday'

In [29]:
for x in tqdm(range(len(exp))):
    if exp.iloc[x]['Day of week'] == 'Friday':
        if exp.iloc[x + 1]['Day of week'] == 'Saturday' or exp.iloc[x + 1]['Day of week'] == 'Sunday':
            exp.iloc[x]['tfidf'] += exp.iloc[x + 1]['tfidf']
        if exp.iloc[x + 2]['Day of week'] == 'Sunday':
            exp.iloc[x]['tfidf'] += exp.iloc[x + 2]['tfidf']

100%|██████████| 2922/2922 [00:02<00:00, 1173.88it/s]


In [30]:
exp1 = exp[exp['Day of week'] != 'Saturday']
exp1 = exp1[exp1['Day of week'] != 'Sunday']

In [31]:
exp1['tfidf'][220]

<1x90754 sparse matrix of type '<class 'numpy.float64'>'
	with 920 stored elements in Compressed Sparse Row format>

In [32]:
m = pd.read_csv('moex.me.csv', delimiter = ';', error_bad_lines = False)
m = m[m.High.notna()]
m = m[['Date', 'High']]

In [33]:
m['Date'][0]

'04.03.2013'

In [34]:
m['Date'] = m['Date'].apply(lambda x: x[-4:] + '-' + x[-7:-5] + '-' + x[:2])

In [35]:
m['Date'] = pd.to_datetime(m['Date'])

In [36]:
exp1.Date = pd.to_datetime(exp1.Date)

In [37]:
exp1.reset_index(drop = True, inplace = True)

In [38]:
exp_tfidf = exp1.iloc[exp1[exp1['Date'] == pd.to_datetime(m.iloc[0].Date)].index[0] - 1]['tfidf']
for i in range(1, len(m)):
    exp_tfidf = np.append(exp_tfidf, exp1.iloc[exp1[exp1['Date'] == pd.to_datetime(m.iloc[i].Date)].index[0] - 1]['tfidf'])

In [39]:
m['tfidf'] = exp_tfidf
for i in tqdm(range(len(m))):
    m.iloc[i]['tfidf'] = exp_tfidf[i]

100%|██████████| 1923/1923 [00:00<00:00, 3521.08it/s]


In [40]:
score = np.array([m['High']]).reshape(-1, 1)
score

array([[1477.27002 ],
       [1486.72998 ],
       [1498.02002 ],
       ...,
       [3275.679932],
       [3318.389893],
       [3283.320068]])

In [41]:
lr_tfidf = exp_tfidf[0]
for i in range(1, len(exp_tfidf) - 15):
    lr_tfidf = scipy.sparse.vstack([lr_tfidf, exp_tfidf[i]])

In [42]:
lr_tfidf_test = exp_tfidf[-15]
for i in range(1, 15):
    lr_tfidf_test = scipy.sparse.vstack([lr_tfidf_test, exp_tfidf[-15 + i]])

Разные модели:

- LinearRegression - 7,44

- Ridge(1) - 7,439

- Lasso(17) - 6,33

In [89]:
#LR, Ridge(22)
lr = Ridge(1)
lr.fit(lr_tfidf, score[:-15])
y_pred = lr.predict(lr_tfidf_test)

In [90]:
(100 * np.abs(y_pred - score.reshape(-1, 1)[-15:]) / (score.reshape(-1, 1)[-15:])).mean()

7.439370447708981

Lasso показывает куда лучшие резул