### Выгрузка данных

In [29]:
import json
from io import StringIO

import numpy as np
import pandas as pd
import OpenBlender
# NOTE: placeholder credential. Substitute a real OpenBlender token, ideally
# loaded from the environment rather than stored in the notebook.
token = "your_token"
action = 'API_getObservationsFromDataset'
# ANCHOR: 'Bitcoin vs USD'

parameters = {
    'token' : token,
    'id_dataset' : '5d4c3af79516290b01c83f51',
    'date_filter':{"start_date" : "2020-01-01",
                   "end_date" : "2020-08-29"}
}
# Wrap the JSON text in StringIO: handing pd.read_json a literal JSON string
# is deprecated in recent pandas releases.
raw_json = json.dumps(OpenBlender.call(action, parameters)['sample'])
df = (
    pd.read_json(StringIO(raw_json), convert_dates=False, convert_axes=False)
    .sort_values('timestamp', ascending=False)
    .reset_index(drop=True)
)
df['date'] = [OpenBlender.unixToDate(ts, timezone = 'GMT') for ts in df.timestamp]
df = df.drop('timestamp', axis = 1)

# Binary label: 1 when log(price) - log(open) is strictly positive, else 0.
# The vectorised comparison gives the same result as a per-row `if` (NaN -> 0).
df['log_diff'] = np.log(df['price']) - np.log(df['open'])
df['target'] = (df['log_diff'] > 0).astype(int)

# Rebuild a unix timestamp column from the formatted date strings.
data_format = '%d-%m-%Y %H:%M:%S'
timezone = 'GMT'
df['u_timestamp'] = OpenBlender.dateToUnix(df['date'],
                                           date_format = data_format,
                                           timezone = timezone)
df = df[['date', 'u_timestamp', 'price', 'target']]
df = df.rename(columns={"u_timestamp": "timestamp"})
df.head()

Task ID: '6167ff770895fafb4a9d8d4b'.
Total estimated consumption: 500.02 processing units.
Task confirmed. Starting download..
100.0 % completed.


Unnamed: 0,date,timestamp,price,target
0,28-08-2020 17:00:00,1598634000.0,11545.0,1
1,27-08-2020 17:00:00,1598548000.0,11343.0,0
2,26-08-2020 17:00:00,1598461000.0,11471.0,1
3,25-08-2020 17:00:00,1598375000.0,11341.0,0
4,23-08-2020 17:00:00,1598202000.0,11663.0,0


In [30]:
search_keyword = 'bitcoin'
# Put the anchor rows in chronological order and renumber them from 0.
df = df.sort_values('timestamp')
df = df.reset_index(drop = True)
first_ts, last_ts = min(df.timestamp), max(df.timestamp)
print(f"From : {OpenBlender.unixToDate(first_ts)}")
print(f"Until: {OpenBlender.unixToDate(last_ts)}")
# Look up blendable datasets for the keyword over the anchor timestamps.
search_results = OpenBlender.searchTimeBlends(token, df.timestamp, search_keyword)

From : 01-01-2020 17:00:00
Until: 28-08-2020 17:00:00


In [31]:
# Source to blend in: the dataset id plus the name of the feature to pull from it.
blend_source = dict(id_dataset='5ea2039095162936337156c9', feature='text')

# Blend settings: values are gathered as lists over one-day intervals
# (presumably the day before each anchor timestamp; confirm against the
# OpenBlender docs for 'agg_in_intervals' / 'time_prior').
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend = OpenBlender.timeBlend(token=token,
                                 anchor_ts=df['timestamp'],
                                 blend_source=blend_source,
                                 **blend_kwargs)

# Append every blended column except 'timestamp' to the anchor frame.
# NOTE: this rebinds `df`, so re-running the cell appends the columns again.
non_ts_cols = df_blend.columns != 'timestamp'
df = pd.concat([df, df_blend.loc[:, non_ts_cols]], axis='columns')
df.head()

Task ID: '6167ff7c0895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%


Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...


In [32]:
# Mean of the blend's COUNT column over all anchor rows.
bitcoin_news_counts = df_blend['BITCOIN_NE.text_COUNT_last1days']
bitcoin_news_counts.mean()

5.677966101694915

In [33]:
from io import StringIO

action = 'API_getObservationsFromDataset'

# ANCHOR: 'CoinJournal Tweet'
# Reuse the notebook-level `token` instead of repeating the credential literal.
parameters = {
    'token': token,
    'id_user': '6151d8659516292e82e51d85',
    'id_dataset': '5ea20b4e95162936348f141d',
}

# Download the sample, newest observation first, with a fresh 0..n-1 index.
sample_json = json.dumps(OpenBlender.call(action, parameters)['sample'])
df_v2 = (
    pd.read_json(StringIO(sample_json), convert_dates=False, convert_axes=False)
    .sort_values('timestamp', ascending=False)
    .reset_index(drop=True)
)
df_v2.head()

Task ID: '6167ff800895fafb4a9d8d4b'.
Total estimated consumption: 500.36 processing units.
Task confirmed. Starting download..
25.0 %
50.0 %
75.0 %
100.0 % completed.


Unnamed: 0,re_tweeter,links,author,text,hashtags,reply_count,associated_tweet,timestamp,mentions,author_id,retweet_count,type,id,favorite_count
0,,[https:\/\/t.co\/WGlxA4Ak7Q],CoinJournal,#Markets #Etoro Where to buy Ripple as XRP gea...,,,,1634135588,,,,Tweet,1448295744524328960,
1,,[https:\/\/t.co\/TFusSmwP1W],CoinJournal,#Markets #eToro Where to buy Maker as MKR sees...,,,,1634134712,,,,Tweet,1448292073669414912,
2,,[https:\/\/t.co\/P2cnZkQ5Rq],CoinJournal,#Markets #eToro Where to buy TRON as TRX sees ...,,,,1634131653,,,,Tweet,1448279240630104064,
3,,[https:\/\/t.co\/efeX5JO8WR],CoinJournal,#Markets #eToro Where to buy Binance Coin as B...,,,,1634131113,,,,Tweet,1448276976213508096,
4,,[https:\/\/t.co\/1L04qPbKtA],CoinJournal,#Markets #Bitcoin US is now the top destinatio...,,,,1634129464,,,,Tweet,1448270060699586560,


### Бленд, вариант 2: твиты CoinJournal

In [35]:
# Blend the 'text' feature of the CoinJournal tweet dataset onto the anchor timestamps.
blend_source = dict(id_dataset='5ea20b4e95162936348f141d', feature='text')

# Same blend settings as the other sources: list output over one-day intervals.
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend_v2 = OpenBlender.timeBlend(token=token,
                                    anchor_ts=df['timestamp'],
                                    blend_source=blend_source,
                                    **blend_kwargs)

Task ID: '6167ffaf0895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%


In [36]:
# Append the blended columns (all except 'timestamp') to the anchor frame.
# NOTE: re-running this cell appends the same columns a second time.
non_ts_cols = df_blend_v2.columns != 'timestamp'
df = pd.concat([df, df_blend_v2.loc[:, non_ts_cols]], axis='columns')
df.head()

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0


In [37]:
# Mean of the CoinJournal COUNT column over all anchor rows.
coinjournal_counts = df_blend_v2['COINJOURNA.text_COUNT_last1days']
coinjournal_counts.mean()

0.5550847457627118

In [38]:
# Third text source; its blended columns come back with the 'CRYPTOCURR' prefix.
blend_source = dict(id_dataset='5ea209c495162936348f13eb', feature='text')

# Same blend settings as the other sources: list output over one-day intervals.
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend_v3 = OpenBlender.timeBlend(token=token,
                                    anchor_ts=df['timestamp'],
                                    blend_source=blend_source,
                                    **blend_kwargs)
df_blend_v3

Task ID: '6167ffbb0895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%


Unnamed: 0,timestamp,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days
0,1577898000,1,[what are your 2020 predictions for #crypto?]
1,1577984400,0,[]
2,1578070800,2,[people are going to opt for something that is...
3,1578157200,1,[bullish on bitcoin]
4,1578243600,1,[a burger king location in venezuela is now ac...
...,...,...,...
231,1598202000,1,[you gotta have some fortitude to handle the s...
232,1598374800,0,[]
233,1598461200,0,[]
234,1598547600,0,[]


In [39]:
# Mean of the COUNT column of the third source over all anchor rows.
v3_counts = df_blend_v3['CRYPTOCURR.text_COUNT_last1days']
v3_counts.mean()

1.1610169491525424

In [40]:
# Append the blended columns (all except 'timestamp') to the anchor frame.
# NOTE: re-running this cell appends the same columns a second time.
non_ts_cols = df_blend_v3.columns != 'timestamp'
df = pd.concat([df, df_blend_v3.loc[:, non_ts_cols]], axis='columns')
df.head()

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,1,[what are your 2020 predictions for #crypto?]
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,0,[]
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,2,[people are going to opt for something that is...
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,1,[bullish on bitcoin]
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,1,[a burger king location in venezuela is now ac...


In [41]:
# Fourth text source. Its blended columns use the same 'CRYPTOCURR' prefix as
# the third source, so the column labels of the two blends coincide.
blend_source = dict(id_dataset='5ea20cd09516293f790c119c', feature='text')

# Same blend settings as the other sources: list output over one-day intervals.
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend_v4 = OpenBlender.timeBlend(token=token,
                                    anchor_ts=df['timestamp'],
                                    blend_source=blend_source,
                                    **blend_kwargs)
df_blend_v4

Task ID: '6167ffc30895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%


Unnamed: 0,timestamp,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days
0,1577898000,2,"[line79, youtube13eamt4 indicator fx ]"
1,1577984400,5,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ..."
2,1578070800,2,"[youtube14eamt4 indicator fx , line83]"
3,1578157200,7,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ..."
4,1578243600,4,"[, line93, macdmt4 fx , line91]"
...,...,...,...
231,1598202000,0,[]
232,1598374800,0,[]
233,1598461200,1,[defi pieformance]
234,1598547600,0,[]


In [42]:
# Mean of the COUNT column of the fourth source over all anchor rows.
v4_counts = df_blend_v4['CRYPTOCURR.text_COUNT_last1days']
v4_counts.mean()

0.8813559322033898

In [43]:
# Append the blended columns (all except 'timestamp') to the anchor frame.
# NOTE: these columns carry the same 'CRYPTOCURR.*' labels as ones already in
# `df`, so the frame ends up with duplicate column labels after this step.
non_ts_cols = df_blend_v4.columns != 'timestamp'
df = pd.concat([df, df_blend_v4.loc[:, non_ts_cols]], axis='columns')
df.head()

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days,CRYPTOCURR.text_COUNT_last1days.1,CRYPTOCURR.text_last1days.1
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,1,[what are your 2020 predictions for #crypto?],2,"[line79, youtube13eamt4 indicator fx ]"
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,0,[],5,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ..."
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,2,[people are going to opt for something that is...,2,"[youtube14eamt4 indicator fx , line83]"
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,1,[bullish on bitcoin],7,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ..."
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,1,[a burger king location in venezuela is now ac...,4,"[, line93, macdmt4 fx , line91]"


### Здесь немного другой источник — количество поисковых запросов

In [44]:
# Non-text source: blend the 'bitcoin_search' feature onto the anchor timestamps.
blend_source = dict(id_dataset='6114e0ed9516295907e7f5d4', feature='bitcoin_search')

# Same blend settings as the text sources: list output over one-day intervals.
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend_v5 = OpenBlender.timeBlend(token=token,
                                    anchor_ts=df['timestamp'],
                                    blend_source=blend_source,
                                    **blend_kwargs)
df_blend_v5

Task ID: '6167ffd30895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%


Unnamed: 0,BTC_FEARGR.bitcoin_search_COUNT_last1days,timestamp,BTC_FEARGR.bitcoin_search_last1days
0,1,1577898000,[14]
1,1,1577984400,[14]
2,1,1578070800,[14]
3,1,1578157200,[14]
4,1,1578243600,[14]
...,...,...,...
231,1,1598202000,[14]
232,1,1598374800,[15]
233,1,1598461200,[15]
234,1,1598547600,[15]


In [45]:
# Discard the COUNT column of the search blend; only the value column is kept.
df_blend_v5 = df_blend_v5.drop(columns=['BTC_FEARGR.bitcoin_search_COUNT_last1days'])

In [46]:
# Each cell holds a list; replace it with that list's first element.
search_col = 'BTC_FEARGR.bitcoin_search_last1days'
df_blend_v5[search_col] = df_blend_v5[search_col].map(lambda values: values[0])

In [47]:
# Append the search-value column (everything except 'timestamp') to the anchor frame.
# NOTE: re-running this cell appends the same column a second time.
non_ts_cols = df_blend_v5.columns != 'timestamp'
df = pd.concat([df, df_blend_v5.loc[:, non_ts_cols]], axis='columns')
df.head()

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days,CRYPTOCURR.text_COUNT_last1days.1,CRYPTOCURR.text_last1days.1,BTC_FEARGR.bitcoin_search_last1days
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,1,[what are your 2020 predictions for #crypto?],2,"[line79, youtube13eamt4 indicator fx ]",14
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,0,[],5,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ...",14
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,2,[people are going to opt for something that is...,2,"[youtube14eamt4 indicator fx , line83]",14
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,1,[bullish on bitcoin],7,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ...",14
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,1,[a burger king location in venezuela is now ac...,4,"[, line93, macdmt4 fx , line91]",14


In [48]:
%%time

# Blend the 'title' feature; the resulting columns carry the 'FOX_NEWS_B' prefix.
blend_source = dict(id_dataset='5defce899516296bfe37c366', feature='title')

# Same blend settings as the other sources: list output over one-day intervals.
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend_v6 = OpenBlender.timeBlend(token=token,
                                    anchor_ts=df['timestamp'],
                                    blend_source=blend_source,
                                    **blend_kwargs)
df_blend_v6

Task ID: '6167ffe80895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%
CPU times: user 2.74 s, sys: 13.9 ms, total: 2.76 s
Wall time: 11 s


Unnamed: 0,timestamp,FOX_NEWS_B.title_COUNT_last1days,FOX_NEWS_B.title_last1days
0,1577898000,84,[former nissan chairman flees the country whil...
1,1577984400,76,[why january is the most popular time to divor...
2,1578070800,160,[ncaa athlete pay debate: why a political show...
3,1578157200,92,[state posts staggering sales figures after fi...
4,1578243600,35,[this 2020 billionaire just came one step clos...
...,...,...,...
231,1598202000,0,[]
232,1598374800,0,[]
233,1598461200,16,[brother of suzanne morphew pleads for brother...
234,1598547600,0,[]


In [51]:
%%time

# Same dataset as the 'title' blend, this time pulling the 'headline' feature.
blend_source = dict(id_dataset='5defce899516296bfe37c366', feature='headline')

# Same blend settings as the other sources: list output over one-day intervals.
blend_kwargs = dict(
    blend_type='agg_in_intervals',
    interval_size=60 * 60 * 24,
    direction='time_prior',
    interval_output='list',
    missing_values='raw',
)
df_blend_v7 = OpenBlender.timeBlend(token=token,
                                    anchor_ts=df['timestamp'],
                                    blend_source=blend_source,
                                    **blend_kwargs)
df_blend_v7

Task ID: '616800180895fafb4a9d8d4c'.
Total estimated consumption: 1555.2 processing units.
Task confirmed. Starting download..
100%
CPU times: user 2.01 s, sys: 10.6 ms, total: 2.02 s
Wall time: 7.27 s


Unnamed: 0,FOX_NEWS_B.headline_COUNT_last1days,timestamp,FOX_NEWS_B.headline_last1days
0,59,1577898000,[ryan patel of the drucker school of managemen...
1,60,1577984400,[will 2020 bring even more stock market gains?...
2,92,1578070800,"[, cars her way founder lisa copeland discusse..."
3,49,1578157200,"[, the airstrikes were targeted two vehicles c..."
4,27,1578243600,"[, foxnews, 2020 democratic candidate tom stey..."
...,...,...,...
231,0,1598202000,[]
232,0,1598374800,[]
233,13,1598461200,"[, rnc chairwoman ronna mcdaniel on what to ex..."
234,0,1598547600,[]


In [52]:
# Append the 'title' blend columns (all except 'timestamp') to the anchor frame.
# NOTE: re-running this cell appends the same columns a second time.
non_ts_cols = df_blend_v6.columns != 'timestamp'
df = pd.concat([df, df_blend_v6.loc[:, non_ts_cols]], axis='columns')
df.head()

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days,CRYPTOCURR.text_COUNT_last1days.1,CRYPTOCURR.text_last1days.1,BTC_FEARGR.bitcoin_search_last1days,FOX_NEWS_B.title_COUNT_last1days,FOX_NEWS_B.title_last1days
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,1,[what are your 2020 predictions for #crypto?],2,"[line79, youtube13eamt4 indicator fx ]",14,84,[former nissan chairman flees the country whil...
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,0,[],5,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ...",14,76,[why january is the most popular time to divor...
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,2,[people are going to opt for something that is...,2,"[youtube14eamt4 indicator fx , line83]",14,160,[ncaa athlete pay debate: why a political show...
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,1,[bullish on bitcoin],7,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ...",14,92,[state posts staggering sales figures after fi...
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,1,[a burger king location in venezuela is now ac...,4,"[, line93, macdmt4 fx , line91]",14,35,[this 2020 billionaire just came one step clos...


In [53]:
# Append the 'headline' blend columns (all except 'timestamp') to the anchor frame.
# NOTE: re-running this cell appends the same columns a second time.
non_ts_cols = df_blend_v7.columns != 'timestamp'
df = pd.concat([df, df_blend_v7.loc[:, non_ts_cols]], axis='columns')
df.head()

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days,CRYPTOCURR.text_COUNT_last1days.1,CRYPTOCURR.text_last1days.1,BTC_FEARGR.bitcoin_search_last1days,FOX_NEWS_B.title_COUNT_last1days,FOX_NEWS_B.title_last1days,FOX_NEWS_B.headline_COUNT_last1days,FOX_NEWS_B.headline_last1days
0,01-01-2020 17:00:00,1577898000.0,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,1,[what are your 2020 predictions for #crypto?],2,"[line79, youtube13eamt4 indicator fx ]",14,84,[former nissan chairman flees the country whil...,59,[ryan patel of the drucker school of managemen...
1,02-01-2020 17:00:00,1577984000.0,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,0,[],5,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ...",14,76,[why january is the most popular time to divor...,60,[will 2020 bring even more stock market gains?...
2,03-01-2020 17:00:00,1578071000.0,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,2,[people are going to opt for something that is...,2,"[youtube14eamt4 indicator fx , line83]",14,160,[ncaa athlete pay debate: why a political show...,92,"[, cars her way founder lisa copeland discusse..."
3,04-01-2020 17:00:00,1578157000.0,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,1,[bullish on bitcoin],7,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ...",14,92,[state posts staggering sales figures after fi...,49,"[, the airstrikes were targeted two vehicles c..."
4,05-01-2020 17:00:00,1578244000.0,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,1,[a burger king location in venezuela is now ac...,4,"[, line93, macdmt4 fx , line91]",14,35,[this 2020 billionaire just came one step clos...,27,"[, foxnews, 2020 democratic candidate tom stey..."


### Дамп в pickle, так как тексты (списки) неверно записываются в CSV-файл

In [54]:
import pickle
# Persist the blended frame as a pickle so the list-valued cells survive a reload.
pickle_path = "full_df_with_7_sources.pickle"
with open(pickle_path, "wb") as sink:
    pickle.dump(df, sink)

In [65]:
# Reload the frame written by the dump cell. Only unpickle files you created
# yourself: pickle.load can execute arbitrary code from an untrusted file.
with open("full_df_with_7_sources.pickle", "rb") as f:
    df = pickle.load(f)
# Display the frame that was just loaded (the bare name `d` is not defined by
# any cell in this notebook and raises NameError on a fresh kernel).
df

Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,CRYPTOCURR.text_COUNT_last1days,CRYPTOCURR.text_last1days,CRYPTOCURR.text_COUNT_last1days.1,CRYPTOCURR.text_last1days.1,BTC_FEARGR.bitcoin_search_last1days,FOX_NEWS_B.title_COUNT_last1days,FOX_NEWS_B.title_last1days,FOX_NEWS_B.headline_COUNT_last1days,FOX_NEWS_B.headline_last1days
0,01-01-2020 17:00:00,1.577898e+09,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,1,[what are your 2020 predictions for #crypto?],2,"[line79, youtube13eamt4 indicator fx ]",14,84,[former nissan chairman flees the country whil...,59,[ryan patel of the drucker school of managemen...
1,02-01-2020 17:00:00,1.577984e+09,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,0,[],5,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ...",14,76,[why january is the most popular time to divor...,60,[will 2020 bring even more stock market gains?...
2,03-01-2020 17:00:00,1.578071e+09,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,2,[people are going to opt for something that is...,2,"[youtube14eamt4 indicator fx , line83]",14,160,[ncaa athlete pay debate: why a political show...,92,"[, cars her way founder lisa copeland discusse..."
3,04-01-2020 17:00:00,1.578157e+09,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,1,[bullish on bitcoin],7,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ...",14,92,[state posts staggering sales figures after fi...,49,"[, the airstrikes were targeted two vehicles c..."
4,05-01-2020 17:00:00,1.578244e+09,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,1,[a burger king location in venezuela is now ac...,4,"[, line93, macdmt4 fx , line91]",14,35,[this 2020 billionaire just came one step clos...,27,"[, foxnews, 2020 democratic candidate tom stey..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,23-08-2020 17:00:00,1.598202e+09,11663.0,0,0,[],[],0,1,[you gotta have some fortitude to handle the s...,0,[],14,0,[],0,[]
232,25-08-2020 17:00:00,1.598375e+09,11341.0,0,0,[],[xrps active wallets hit 7day high despite 92 ...,1,0,[],0,[],15,0,[],0,[]
233,26-08-2020 17:00:00,1.598461e+09,11471.0,1,0,[],[the irs is sending tax warning letters to cry...,3,0,[],1,[defi pieformance],15,16,[brother of suzanne morphew pleads for brother...,13,"[, rnc chairwoman ronna mcdaniel on what to ex..."
234,27-08-2020 17:00:00,1.598548e+09,11343.0,0,0,[],[brazils central bank considering the developm...,4,0,[],0,[],15,0,[],0,[]


In [57]:
# First row's list of texts, taken from `df` (no cell defines `d`, so the
# original name only resolved through leftover kernel state).
df.loc[0, 'BITCOIN_NE.text_last1days']

['etoro ceo yoni assia on reaching 12 million users and why cryptos are a gateway to stocks ',
 'china takes another step away from usd hegemony ',
 'veriblock captured close to 60 of btcs op return transactions in 2019 ']

In [59]:
# Demonstrate why CSV is unsuitable here: after a CSV round trip the list
# stored in a cell comes back as one plain string. The round trip is done in
# memory so the cell does not depend on a file that no active cell writes.
csv_text = df.to_csv(index=False)
pd.read_csv(StringIO(csv_text))['BITCOIN_NE.text_last1days'][0]

"['etoro ceo yoni assia on reaching 12 million users and why cryptos are a gateway to stocks ', 'china takes another step away from usd hegemony ', 'veriblock captured close to 60 of btcs op return transactions in 2019 ']"

### csv 

In [60]:
# CSV export is left disabled: a list-valued cell read back from CSV is a plain string.
# df.to_csv("full_df_with_7_sources.csv", index=False)

In [66]:
# List the column labels; the two 'CRYPTOCURR.*' labels each occur twice at this point.
df.columns

Index(['date', 'timestamp', 'price', 'target',
       'BITCOIN_NE.text_COUNT_last1days', 'BITCOIN_NE.text_last1days',
       'COINJOURNA.text_last1days', 'COINJOURNA.text_COUNT_last1days',
       'CRYPTOCURR.text_COUNT_last1days', 'CRYPTOCURR.text_last1days',
       'CRYPTOCURR.text_COUNT_last1days', 'CRYPTOCURR.text_last1days',
       'BTC_FEARGR.bitcoin_search_last1days',
       'FOX_NEWS_B.title_COUNT_last1days', 'FOX_NEWS_B.title_last1days',
       'FOX_NEWS_B.headline_COUNT_last1days', 'FOX_NEWS_B.headline_last1days'],
      dtype='object')

In [67]:
# The label 'CRYPTOCURR.text_last1days' occurs twice in `df`, so selecting it
# yields a two-column block; copy each column out under its own name.
crypto_text_block = df['CRYPTOCURR.text_last1days'].to_numpy()
df['CRYPTOCURR.text_last1days_v1'] = crypto_text_block[:, 0]
df['CRYPTOCURR.text_last1days_v2'] = crypto_text_block[:, 1]

In [68]:
# Same split for the duplicated COUNT label: one uniquely named column per source.
crypto_count_block = df['CRYPTOCURR.text_COUNT_last1days'].to_numpy()
df['CRYPTOCURR.text_COUNT_last1days_v1'] = crypto_count_block[:, 0]
df['CRYPTOCURR.text_COUNT_last1days_v2'] = crypto_count_block[:, 1]

# Remove the original, ambiguously labelled columns now that copies exist.
for duplicated_label in ('CRYPTOCURR.text_last1days', 'CRYPTOCURR.text_COUNT_last1days'):
    del df[duplicated_label]

In [71]:
# Numeric columns: one per blended source.
# (The BTC_FEARGR entry is the unwrapped search value, not a COUNT column.)
count_features = [
    'BITCOIN_NE.text_COUNT_last1days',
    'COINJOURNA.text_COUNT_last1days',
    'CRYPTOCURR.text_COUNT_last1days_v1',
    'CRYPTOCURR.text_COUNT_last1days_v2',
    'BTC_FEARGR.bitcoin_search_last1days',
    'FOX_NEWS_B.title_COUNT_last1days',
    'FOX_NEWS_B.headline_COUNT_last1days',
]

# List-of-text columns, in the order they are later merged per row.
text_features = [
    'BITCOIN_NE.text_last1days',
    'CRYPTOCURR.text_last1days_v1',
    'CRYPTOCURR.text_last1days_v2',
    'FOX_NEWS_B.title_last1days',
    'FOX_NEWS_B.headline_last1days',
    'COINJOURNA.text_last1days',
]

In [72]:
# Convert every search value to a Python int so the column is numeric.
search_col = 'BTC_FEARGR.bitcoin_search_last1days'
df[search_col] = df[search_col].map(int)

### Видим, что у количества корреляция с таргетом очень слабая

In [73]:
# Correlation of each count feature with the target.
count_corr = df[['target', *count_features]].corr()
count_corr['target']

target                                 1.000000
BITCOIN_NE.text_COUNT_last1days        0.015667
COINJOURNA.text_COUNT_last1days        0.031513
CRYPTOCURR.text_COUNT_last1days_v1    -0.106928
CRYPTOCURR.text_COUNT_last1days_v2    -0.048718
BTC_FEARGR.bitcoin_search_last1days    0.037524
FOX_NEWS_B.title_COUNT_last1days      -0.080625
FOX_NEWS_B.headline_COUNT_last1days   -0.069817
Name: target, dtype: float64

In [74]:
# Row-wise total across all columns listed in `count_features`.
df['count_sum'] = df.loc[:, count_features].sum(axis='columns')

In [75]:
# Correlation of the summed counts with the target.
count_sum_corr = df.loc[:, ['count_sum', 'target']].corr()
count_sum_corr['target']

count_sum   -0.076851
target       1.000000
Name: target, dtype: float64

In [77]:
# For every row, flatten the lists held in the text columns into a single list,
# keeping the column order given by `text_features`.
row_texts = []
for _, text_row in df[text_features].iterrows():
    merged_texts = []
    for cell_texts in text_row.values:
        merged_texts.extend(cell_texts)
    row_texts.append(merged_texts)

In [78]:
# Store the per-row merged text lists as a new column.
df['text_full'] = row_texts

In [82]:
# CSV export stays disabled; the frame is pickled instead.
# df.to_csv("full_df_with_7_sources_and_summary.csv", index=False)

summary_pickle_path = "full_df_with_7_sources_and_summary.pickle"
with open(summary_pickle_path, "wb") as sink:
    pickle.dump(df, sink)

### finbert api

In [83]:
import requests
# NOTE: placeholder credential. Substitute a real Hugging Face token, ideally
# loaded from the environment rather than stored in the notebook.
API_TOKEN = "your_token"
API_URL = "https://api-inference.huggingface.co/models/ProsusAI/finbert"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query(payload, timeout=None):
    """POST ``payload`` as JSON to the FinBERT inference endpoint and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds handed to ``requests.post``. ``None`` (the default)
            waits without limit, which is what the call did before this
            parameter existed.

    Returns:
        The response body parsed from JSON. The HTTP status is not checked, so
        an error body is returned to the caller rather than raised.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

# Smoke test with a single sentence.
output = query({"inputs": "I like you. I love you"})
output

[[{'label': 'positive', 'score': 0.08782877773046494},
  {'label': 'negative', 'score': 0.021379711106419563},
  {'label': 'neutral', 'score': 0.8907914757728577}]]

In [135]:
%%time
from tqdm import tqdm
# One request per row: the row's whole 'text_full' list is sent as the payload.
results = [query(row_texts_payload) for row_texts_payload in tqdm(df['text_full'].values)]

100% 236/236 [15:06<00:00,  3.84s/it] 

CPU times: user 10.1 s, sys: 810 ms, total: 10.9 s
Wall time: 15min 6s





### Видим, что здесь слабая корреляция, если считать скор сразу по всем текстам

In [141]:
# Correlation of the sentiment columns with the target, on rows where all are present.
# NOTE(review): no cell above this one (in notebook order) creates
# 'res_pos' / 'res_neg' — presumably they came from an out-of-order run; confirm.
features = ['target', 'res_pos', 'res_neg']
sentiment_corr = df[features].dropna().corr()
sentiment_corr['target']

target     1.000000
res_pos    0.036260
res_neg    0.011118
Name: target, dtype: float64

In [None]:
# %%time

# df['res_pos'] = pd.Series()
# df['res_neg'] = pd.Series()
# df['res_neutral'] = pd.Series()
# for i in tqdm(range(236)):
#     print(i)
#     sub_df = df.loc[i, :]#['BITCOIN_NE.text_last1days']
#     text_length = len(sub_df['text_full'])
#     res_pos = 0
#     res_neg = 0
#     res_neu = 0
#     for j in range(text_length):
#         try:
#             current_res = query(sub_df['text_full'][j])[0]
#             current_res_pos = current_res[0]['score']
#             current_res_neg = current_res[1]['score']
#             current_res_neutral = current_res[2]['score']
#             if current_res_pos>=current_res_neg:
#                 if current_res_pos > current_res_neutral:
#                     res_pos += 1
#                 else:
#                     res_neu += 1
#             else:
#                 if current_res_neg > current_res_neutral:
#                     res_neg += 1
#                 else:
#                     res_neu += 1
#         except Exception as e:
#             continue
#     df.loc[i, 'res_pos'] = res_pos
#     df.loc[i, 'res_neg'] = res_neg
#     df.loc[i, 'res_neutral'] = res_neu
# df

### Тут и позитив, и негатив, и нейтрал

In [146]:
# Correlation of the positive/negative columns with the target, on complete rows only.
features = ['target', 'res_pos', 'res_neg']
pos_neg_corr = df.loc[:, features].dropna().corr()
pos_neg_corr['target']

target     1.000000
res_pos   -0.009399
res_neg    0.000386
Name: target, dtype: float64

In [148]:
# Same correlation with the neutral column included, on complete rows only.
features = ['target', 'res_pos', 'res_neg', 'res_neu']
pos_neg_neu_corr = df.loc[:, features].dropna().corr()
pos_neg_neu_corr['target']

target     1.000000
res_pos    0.113342
res_neg   -0.078145
res_neu   -0.032336
Name: target, dtype: float64

### Только по биткоин-новостям: суммирую по дню вероятности с каждой новости

In [84]:
%%time
from tqdm import tqdm

df['res_pos'] = pd.Series()
df['res_neg'] = pd.Series()
df['res_neu'] = pd.Series()
for i in tqdm(range(236)):
    print(i)
    sub_df = df.loc[i, :]#['BITCOIN_NE.text_last1days']
    text_length = len(sub_df['BITCOIN_NE.text_last1days'])
    res_pos = 0
    res_neg = 0
    res_neu = 0
    for j in range(text_length):
        try:
            q = query(sub_df['BITCOIN_NE.text_last1days'][j])
            current_res = q[0]
            current_res_pos = current_res[0]['score']
            current_res_neg = current_res[1]['score']
            current_res_neutral = current_res[2]['score']
            res_pos += current_res_pos
            res_neg += current_res_neg
            res_neu += current_res_neutral
            # for counting
#             if current_res_pos>=current_res_neg:
#                 if current_res_pos > current_res_neutral:
#                     res_pos += 1
#                 else:
#                     res_neu += 1
#             else:
#                 if current_res_neg > current_res_neutral:
#                     res_neg += 1
#                 else:
#                     res_neu += 1
        except Exception as e:
            continue
    df.loc[i, 'res_pos'] = res_pos
    df.loc[i, 'res_neg'] = res_neg
    df.loc[i, 'res_neu'] = res_neu
df

  0% 0/236 [00:00<?, ?it/s]

0


  0% 1/236 [00:02<09:08,  2.33s/it]

1


  1% 2/236 [00:07<15:20,  3.93s/it]

2


  1% 3/236 [00:13<19:34,  5.04s/it]

3


  2% 4/236 [00:18<18:26,  4.77s/it]

4


  2% 5/236 [00:21<17:00,  4.42s/it]

5


  3% 6/236 [00:26<17:05,  4.46s/it]

6


  3% 7/236 [00:32<19:27,  5.10s/it]

7


  3% 8/236 [02:15<2:16:50, 36.01s/it]

8


  4% 9/236 [02:18<1:37:13, 25.70s/it]

9


  4% 10/236 [02:22<1:12:19, 19.20s/it]

10


  5% 11/236 [02:26<53:58, 14.39s/it]  

11


  5% 12/236 [02:29<41:41, 11.17s/it]

12


  6% 13/236 [02:35<35:36,  9.58s/it]

13


  6% 14/236 [02:43<33:18,  9.00s/it]

14


  6% 15/236 [02:47<27:16,  7.40s/it]

15


  7% 16/236 [02:52<24:57,  6.81s/it]

16


  7% 17/236 [02:56<21:46,  5.96s/it]

17


  8% 18/236 [03:01<20:01,  5.51s/it]

18


  8% 19/236 [03:03<16:37,  4.60s/it]

19


  8% 20/236 [03:09<17:35,  4.89s/it]

20


  9% 21/236 [03:16<19:38,  5.48s/it]

21


  9% 22/236 [03:21<19:15,  5.40s/it]

22


 10% 23/236 [03:26<19:06,  5.38s/it]

23


 10% 24/236 [03:32<19:55,  5.64s/it]

24


 11% 25/236 [03:36<17:53,  5.09s/it]

25


 11% 26/236 [03:42<18:41,  5.34s/it]

26


 11% 27/236 [03:45<16:26,  4.72s/it]

27


 12% 28/236 [03:50<16:46,  4.84s/it]

28


 12% 29/236 [03:55<16:22,  4.75s/it]

29


 13% 30/236 [04:02<18:26,  5.37s/it]

30


 13% 31/236 [04:07<18:37,  5.45s/it]

31


 14% 32/236 [04:11<16:43,  4.92s/it]

32


 14% 33/236 [04:16<16:19,  4.83s/it]

33


 14% 34/236 [04:21<16:35,  4.93s/it]

34


 15% 35/236 [04:26<16:51,  5.03s/it]

35


 15% 36/236 [04:34<19:50,  5.95s/it]

36


 16% 37/236 [04:40<19:23,  5.84s/it]

37


 16% 38/236 [04:46<19:06,  5.79s/it]

38


 17% 39/236 [04:49<17:02,  5.19s/it]

39


 17% 40/236 [04:53<15:17,  4.68s/it]

40


 17% 41/236 [04:59<16:19,  5.03s/it]

41


 18% 42/236 [05:06<18:13,  5.64s/it]

42


 18% 43/236 [05:13<19:25,  6.04s/it]

43


 19% 44/236 [05:18<18:26,  5.76s/it]

44


 19% 45/236 [05:22<16:56,  5.32s/it]

45


 19% 46/236 [05:27<16:05,  5.08s/it]

46


 20% 47/236 [05:31<15:10,  4.82s/it]

47


 20% 48/236 [05:38<17:06,  5.46s/it]

48


 21% 49/236 [05:43<16:32,  5.30s/it]

49


 21% 50/236 [05:47<15:24,  4.97s/it]

50


 22% 51/236 [05:50<13:55,  4.52s/it]

51


 22% 52/236 [07:27<1:38:09, 32.01s/it]

52


 22% 53/236 [07:31<1:12:26, 23.75s/it]

53


 23% 54/236 [07:35<54:27, 17.95s/it]  

54


 23% 55/236 [07:39<41:25, 13.73s/it]

55


 24% 56/236 [07:44<32:42, 10.91s/it]

56


 24% 57/236 [07:49<27:37,  9.26s/it]

57


 25% 58/236 [07:54<23:14,  7.83s/it]

58


 25% 59/236 [07:59<21:20,  7.24s/it]

59


 25% 60/236 [08:03<18:05,  6.17s/it]

60


 26% 61/236 [08:08<17:01,  5.84s/it]

61


 26% 62/236 [08:12<14:55,  5.15s/it]

62


 27% 63/236 [08:17<14:33,  5.05s/it]

63


 27% 64/236 [08:21<14:20,  5.00s/it]

64


 28% 65/236 [08:26<13:34,  4.76s/it]

65


 28% 66/236 [08:32<14:42,  5.19s/it]

66


 28% 67/236 [08:36<14:02,  4.99s/it]

67


 29% 68/236 [08:40<12:37,  4.51s/it]

68


 29% 69/236 [08:45<12:50,  4.61s/it]

69


 30% 70/236 [08:48<11:45,  4.25s/it]

70


 30% 71/236 [08:51<10:43,  3.90s/it]

71


 31% 72/236 [08:56<11:22,  4.16s/it]

72


 31% 73/236 [09:01<12:24,  4.57s/it]

73


 31% 74/236 [09:06<12:17,  4.55s/it]

74


 32% 75/236 [09:09<10:46,  4.02s/it]

75


 32% 76/236 [09:14<11:37,  4.36s/it]

76


 33% 77/236 [09:18<11:48,  4.46s/it]

77


 33% 78/236 [09:22<11:12,  4.26s/it]

78


 33% 79/236 [09:27<11:15,  4.30s/it]

79


 34% 80/236 [09:31<11:20,  4.36s/it]

80


 34% 81/236 [09:36<11:17,  4.37s/it]

81


 35% 82/236 [09:40<11:04,  4.31s/it]

82


 35% 83/236 [09:44<11:14,  4.41s/it]

83


 36% 84/236 [09:50<11:56,  4.71s/it]

84


 36% 85/236 [09:56<12:38,  5.02s/it]

85


 36% 86/236 [10:01<12:41,  5.08s/it]

86


 37% 87/236 [10:06<12:34,  5.06s/it]

87


 37% 88/236 [10:12<13:30,  5.48s/it]

88


 38% 89/236 [10:18<13:45,  5.62s/it]

89


 38% 90/236 [10:22<12:42,  5.22s/it]

90


 39% 91/236 [10:26<11:27,  4.74s/it]

91


 39% 92/236 [10:30<10:58,  4.57s/it]

92


 39% 93/236 [10:34<10:10,  4.27s/it]

93


 40% 94/236 [10:37<09:32,  4.03s/it]

94


 40% 95/236 [10:43<10:31,  4.48s/it]

95


 41% 96/236 [10:47<10:21,  4.44s/it]

96


 41% 97/236 [10:50<09:04,  3.92s/it]

97


 42% 98/236 [10:52<07:39,  3.33s/it]

98


 42% 99/236 [10:56<07:51,  3.44s/it]

99


 42% 100/236 [10:57<06:25,  2.84s/it]

100


 43% 101/236 [11:00<06:36,  2.93s/it]

101


 43% 102/236 [11:05<08:05,  3.62s/it]

102


 44% 103/236 [11:10<08:37,  3.89s/it]

103


 44% 104/236 [11:12<07:24,  3.37s/it]

104


 44% 105/236 [11:14<06:37,  3.03s/it]

105


 45% 106/236 [11:18<07:15,  3.35s/it]

106


 45% 107/236 [11:20<06:21,  2.95s/it]

107


 46% 108/236 [11:23<06:15,  2.93s/it]

108


 46% 109/236 [11:26<06:12,  2.93s/it]

109


 47% 110/236 [11:31<07:24,  3.53s/it]

110


 47% 111/236 [11:33<06:32,  3.14s/it]

111


 47% 112/236 [11:37<06:40,  3.23s/it]

112


 48% 113/236 [11:40<06:24,  3.13s/it]

113


 48% 114/236 [11:43<06:45,  3.32s/it]

114


 49% 115/236 [11:48<07:23,  3.66s/it]

115


 49% 116/236 [11:51<06:56,  3.47s/it]

116


 50% 117/236 [11:52<05:41,  2.87s/it]

117


 50% 118/236 [11:55<05:40,  2.89s/it]

118


 50% 119/236 [11:57<04:54,  2.52s/it]

119


 51% 120/236 [12:00<05:14,  2.71s/it]

120


 51% 121/236 [12:03<05:16,  2.76s/it]

121


 52% 122/236 [12:06<05:09,  2.71s/it]

122


 52% 123/236 [12:10<05:51,  3.11s/it]

123


 53% 124/236 [12:14<06:28,  3.47s/it]

124


 53% 125/236 [12:16<05:42,  3.09s/it]

125


 53% 126/236 [12:20<06:07,  3.34s/it]

126


 54% 127/236 [12:25<06:40,  3.68s/it]

127


 54% 128/236 [12:29<06:47,  3.77s/it]

128


 55% 129/236 [12:33<07:03,  3.96s/it]

129


 55% 130/236 [12:38<07:29,  4.24s/it]

130


 56% 131/236 [12:40<06:26,  3.68s/it]

131


 56% 132/236 [12:42<05:37,  3.25s/it]

132


 56% 133/236 [12:45<04:59,  2.90s/it]

133


 57% 134/236 [12:48<05:05,  2.99s/it]

134


 57% 135/236 [12:52<05:35,  3.33s/it]

135


 58% 136/236 [12:55<05:21,  3.22s/it]

136


 58% 137/236 [12:58<05:26,  3.30s/it]

137


 58% 138/236 [13:01<05:09,  3.16s/it]

138


 59% 139/236 [13:04<04:59,  3.09s/it]

139


 59% 140/236 [13:06<04:19,  2.71s/it]

140


 60% 141/236 [13:09<04:24,  2.78s/it]

141


 60% 142/236 [13:14<05:17,  3.38s/it]

142


 61% 143/236 [13:16<04:41,  3.03s/it]

143


 61% 144/236 [13:19<04:54,  3.20s/it]

144


 61% 145/236 [13:23<05:03,  3.33s/it]

145


 62% 146/236 [13:25<04:25,  2.95s/it]

146


 62% 147/236 [13:29<04:58,  3.35s/it]

147


 63% 148/236 [13:34<05:25,  3.69s/it]

148


 63% 149/236 [13:37<05:16,  3.64s/it]

149


 64% 150/236 [13:43<05:51,  4.08s/it]

150


 64% 151/236 [13:46<05:36,  3.96s/it]

151


 64% 152/236 [13:51<05:46,  4.13s/it]

152


 65% 153/236 [13:52<04:36,  3.33s/it]

153


 65% 154/236 [13:55<04:25,  3.24s/it]

154


 66% 155/236 [14:00<04:56,  3.67s/it]

155


 66% 156/236 [14:02<04:14,  3.18s/it]

156


 67% 157/236 [14:07<04:48,  3.66s/it]

157


 67% 158/236 [14:11<04:48,  3.70s/it]

158


 67% 159/236 [14:15<04:59,  3.88s/it]

159


 68% 160/236 [14:17<04:15,  3.36s/it]

160


 68% 161/236 [14:20<04:05,  3.27s/it]

161


 69% 162/236 [14:23<03:52,  3.14s/it]

162


 69% 163/236 [14:28<04:42,  3.87s/it]

163


 69% 164/236 [14:33<04:51,  4.04s/it]

164


 70% 165/236 [14:36<04:36,  3.89s/it]

165


 70% 166/236 [14:40<04:28,  3.83s/it]

166


 71% 167/236 [14:44<04:16,  3.72s/it]

167


 71% 168/236 [14:48<04:27,  3.94s/it]

168


 72% 169/236 [14:53<04:38,  4.16s/it]

169


 72% 170/236 [14:58<04:57,  4.51s/it]

170


 72% 171/236 [15:05<05:44,  5.31s/it]

171


 73% 172/236 [15:13<06:35,  6.17s/it]

172


 73% 173/236 [15:19<06:24,  6.10s/it]

173


 74% 174/236 [15:42<11:18, 10.94s/it]

174


 74% 175/236 [15:46<09:13,  9.07s/it]

175


 75% 176/236 [15:52<08:11,  8.19s/it]

176


 75% 177/236 [15:57<06:55,  7.04s/it]

177


 75% 178/236 [16:03<06:31,  6.74s/it]

178


 76% 179/236 [16:10<06:34,  6.91s/it]

179


 76% 180/236 [16:16<06:01,  6.46s/it]

180


 77% 181/236 [16:20<05:23,  5.88s/it]

181


 77% 182/236 [16:24<04:43,  5.25s/it]

182


 78% 183/236 [16:30<04:49,  5.45s/it]

183


 78% 184/236 [16:35<04:38,  5.36s/it]

184


 78% 185/236 [16:42<04:52,  5.74s/it]

185


 79% 186/236 [16:45<04:18,  5.17s/it]

186


 79% 187/236 [16:50<04:00,  4.91s/it]

187


 80% 188/236 [16:54<03:41,  4.61s/it]

188


 80% 189/236 [17:01<04:20,  5.54s/it]

189


 81% 190/236 [17:10<04:52,  6.35s/it]

190


 81% 191/236 [17:21<05:54,  7.88s/it]

191


 81% 192/236 [17:30<05:57,  8.13s/it]

192


 82% 193/236 [17:34<05:00,  6.99s/it]

193


 82% 194/236 [17:41<04:55,  7.04s/it]

194


 83% 195/236 [17:50<05:04,  7.43s/it]

195


 83% 196/236 [17:56<04:49,  7.24s/it]

196


 83% 197/236 [18:04<04:52,  7.50s/it]

197


 84% 198/236 [18:13<04:57,  7.83s/it]

198


 84% 199/236 [18:23<05:16,  8.55s/it]

199


 85% 200/236 [18:31<05:00,  8.34s/it]

200


 85% 201/236 [18:39<04:48,  8.25s/it]

201


 86% 202/236 [18:43<03:54,  6.89s/it]

202


 86% 203/236 [18:55<04:39,  8.48s/it]

203


 86% 204/236 [19:07<05:08,  9.64s/it]

204


 87% 205/236 [19:14<04:31,  8.74s/it]

205


 87% 206/236 [19:18<03:40,  7.35s/it]

206


 88% 207/236 [19:35<04:56, 10.21s/it]

207


 88% 208/236 [19:46<04:48, 10.31s/it]

208


 89% 209/236 [20:01<05:23, 11.99s/it]

209


 89% 210/236 [20:13<05:09, 11.91s/it]

210


 89% 211/236 [20:21<04:29, 10.78s/it]

211


 90% 212/236 [20:32<04:19, 10.83s/it]

212


 90% 213/236 [20:36<03:21,  8.77s/it]

213


 91% 214/236 [20:42<02:51,  7.78s/it]

214


 91% 215/236 [20:46<02:22,  6.80s/it]

215


 92% 216/236 [20:51<02:02,  6.14s/it]

216


 92% 217/236 [20:56<01:50,  5.80s/it]

217


 92% 218/236 [21:04<01:57,  6.51s/it]

218


 93% 219/236 [21:08<01:39,  5.86s/it]

219


 93% 220/236 [21:12<01:22,  5.18s/it]

220


 94% 221/236 [21:18<01:23,  5.59s/it]

221


 94% 222/236 [21:24<01:19,  5.69s/it]

222


 94% 223/236 [21:30<01:14,  5.75s/it]

223


 95% 224/236 [21:36<01:08,  5.68s/it]

224


100% 236/236 [21:36<00:00,  5.50s/it]

225
226
227
228
229
230
231
232
233
234
235
CPU times: user 42.8 s, sys: 2.34 s, total: 45.1 s
Wall time: 21min 36s





Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,BTC_FEARGR.bitcoin_search_last1days,FOX_NEWS_B.title_COUNT_last1days,...,FOX_NEWS_B.headline_last1days,CRYPTOCURR.text_last1days_v1,CRYPTOCURR.text_last1days_v2,CRYPTOCURR.text_COUNT_last1days_v1,CRYPTOCURR.text_COUNT_last1days_v2,count_sum,text_full,res_pos,res_neg,res_neu
0,01-01-2020 17:00:00,1.577898e+09,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,14,84,...,[ryan patel of the drucker school of managemen...,[what are your 2020 predictions for #crypto?],"[line79, youtube13eamt4 indicator fx ]",1,2,163,[etoro ceo yoni assia on reaching 12 million u...,0.343571,0.631427,2.025002
1,02-01-2020 17:00:00,1.577984e+09,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,14,76,...,[will 2020 bring even more stock market gains?...,[],"[23mt4 ea fx , line81, line82, 24mt4 ea fx ...",0,5,162,[government confirms crypto profits not taxabl...,1.525628,2.151473,3.322899
2,03-01-2020 17:00:00,1.578071e+09,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,14,160,...,"[, cars her way founder lisa copeland discusse...",[people are going to opt for something that is...,"[youtube14eamt4 indicator fx , line83]",2,2,278,[ticker tool uses the bch blockchain to provid...,0.790071,0.639061,6.570868
3,04-01-2020 17:00:00,1.578157e+09,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,14,92,...,"[, the airstrikes were targeted two vehicles c...",[bullish on bitcoin],"[mt4 fx , 19ea, line85, line87, youtube17mt4 ...",1,7,169,[crypto fundraising is changing again in 2020 ...,1.064533,1.109780,3.825687
4,05-01-2020 17:00:00,1.578244e+09,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,14,35,...,"[, foxnews, 2020 democratic candidate tom stey...",[a burger king location in venezuela is now ac...,"[, line93, macdmt4 fx , line91]",1,4,86,[subhash chandra garg on the future of crypto ...,1.266352,0.183858,3.549790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,23-08-2020 17:00:00,1.598202e+09,11663.0,0,0,[],[],0,14,0,...,[],[you gotta have some fortitude to handle the s...,[],1,0,15,[you gotta have some fortitude to handle the s...,0.000000,0.000000,0.000000
232,25-08-2020 17:00:00,1.598375e+09,11341.0,0,0,[],[xrps active wallets hit 7day high despite 92 ...,1,15,0,...,[],[],[],0,0,16,[xrps active wallets hit 7day high despite 92 ...,0.000000,0.000000,0.000000
233,26-08-2020 17:00:00,1.598461e+09,11471.0,1,0,[],[the irs is sending tax warning letters to cry...,3,15,16,...,"[, rnc chairwoman ronna mcdaniel on what to ex...",[],[defi pieformance],0,1,48,"[defi pieformance, brother of suzanne morphew ...",0.000000,0.000000,0.000000
234,27-08-2020 17:00:00,1.598548e+09,11343.0,0,0,[],[brazils central bank considering the developm...,4,15,0,...,[],[],[],0,0,19,[brazils central bank considering the developm...,0.000000,0.000000,0.000000


In [86]:
# Correlation of the summed per-row scores with `target`;
# rows missing any of the four columns are excluded.
features = ['target', 'res_pos', 'res_neg', 'res_neu']
corr_matrix = df.loc[:, features].dropna().corr()
corr_matrix['target']

target     1.000000
res_pos    0.048596
res_neg   -0.001936
res_neu   -0.052912
Name: target, dtype: float64

### Только позитив и негатив, по количеству новостей

In [87]:
%%time
from tqdm import tqdm

df['res_pos_count'] = pd.Series()
df['res_neg_count'] = pd.Series()
# df['res_neu'] = pd.Series()
for i in tqdm(range(236)):
    print(i)
    sub_df = df.loc[i, :]#['BITCOIN_NE.text_last1days']
    text_length = len(sub_df['BITCOIN_NE.text_last1days'])
    res_pos = 0
    res_neg = 0
#     res_neu = 0
    for j in range(text_length):
        try:
            q = query(sub_df['BITCOIN_NE.text_last1days'][j])
            current_res = q[0]
            current_res_pos = current_res[0]['score']
            current_res_neg = current_res[1]['score']
#             current_res_neutral = current_res[2]['score']
            # for counting
            if current_res_pos>=current_res_neg:
#                 if current_res_pos > current_res_neutral:
                res_pos += 1
#                 else:
#                 res_neu += 1
            else:
#                 if current_res_neg > current_res_neutral:
                res_neg += 1
#                 else:
#                 res_neu += 1
        except Exception as e:
            continue
    df.loc[i, 'res_pos_count'] = res_pos
    df.loc[i, 'res_neg_count'] = res_neg
#     df.loc[i, 'res_neu'] = res_neu
df

  0% 0/236 [00:00<?, ?it/s]

0


  0% 1/236 [00:02<08:50,  2.26s/it]

1


  1% 2/236 [00:07<15:27,  3.96s/it]

2


  1% 3/236 [00:13<18:50,  4.85s/it]

3


  2% 4/236 [00:18<18:36,  4.81s/it]

4


  2% 5/236 [00:22<18:25,  4.79s/it]

5


  3% 6/236 [00:27<17:53,  4.67s/it]

6


  3% 7/236 [00:34<20:36,  5.40s/it]

7


  3% 8/236 [00:39<20:05,  5.29s/it]

8


  4% 9/236 [00:43<18:15,  4.83s/it]

9


  4% 10/236 [00:49<19:42,  5.23s/it]

10


  5% 11/236 [00:53<18:05,  4.82s/it]

11


  5% 12/236 [00:56<16:52,  4.52s/it]

12


  6% 13/236 [01:01<17:05,  4.60s/it]

13


  6% 14/236 [01:07<18:30,  5.00s/it]

14


  6% 15/236 [01:11<16:49,  4.57s/it]

15


  7% 16/236 [01:17<18:44,  5.11s/it]

16


  7% 17/236 [01:21<17:12,  4.71s/it]

17


  8% 18/236 [01:25<16:33,  4.56s/it]

18


  8% 19/236 [01:28<14:47,  4.09s/it]

19


  8% 20/236 [01:34<16:50,  4.68s/it]

20


  9% 21/236 [01:40<18:13,  5.08s/it]

21


  9% 22/236 [01:44<17:22,  4.87s/it]

22


 10% 23/236 [01:50<17:53,  5.04s/it]

23


 10% 24/236 [01:57<19:39,  5.56s/it]

24


 11% 25/236 [02:00<17:19,  4.93s/it]

25


 11% 26/236 [02:07<18:51,  5.39s/it]

26


 11% 27/236 [02:10<16:59,  4.88s/it]

27


 12% 28/236 [02:17<18:56,  5.46s/it]

28


 12% 29/236 [02:22<17:53,  5.19s/it]

29


 13% 30/236 [02:28<19:04,  5.55s/it]

30


 13% 31/236 [02:33<18:27,  5.40s/it]

31


 14% 32/236 [02:36<16:18,  4.80s/it]

32


 14% 33/236 [02:40<15:24,  4.56s/it]

33


 14% 34/236 [02:46<15:59,  4.75s/it]

34


 15% 35/236 [02:51<16:20,  4.88s/it]

35


 15% 36/236 [02:59<19:38,  5.89s/it]

36


 16% 37/236 [03:04<18:03,  5.45s/it]

37


 16% 38/236 [03:09<18:21,  5.56s/it]

38


 17% 39/236 [03:13<16:33,  5.04s/it]

39


 17% 40/236 [03:17<15:18,  4.68s/it]

40


 17% 41/236 [03:23<16:11,  4.98s/it]

41


 18% 42/236 [03:29<16:55,  5.24s/it]

42


 18% 43/236 [03:36<19:21,  6.02s/it]

43


 19% 44/236 [03:42<18:29,  5.78s/it]

44


 19% 45/236 [03:46<17:11,  5.40s/it]

45


 19% 46/236 [03:50<15:34,  4.92s/it]

46


 20% 47/236 [03:54<14:37,  4.64s/it]

47


 20% 48/236 [03:59<14:31,  4.64s/it]

48


 21% 49/236 [04:03<14:10,  4.55s/it]

49


 21% 50/236 [04:07<13:21,  4.31s/it]

50


 22% 51/236 [04:09<11:46,  3.82s/it]

51


 22% 52/236 [04:13<11:17,  3.68s/it]

52


 22% 53/236 [04:16<11:10,  3.66s/it]

53


 23% 54/236 [04:20<11:22,  3.75s/it]

54


 23% 55/236 [04:25<11:47,  3.91s/it]

55


 24% 56/236 [04:29<11:56,  3.98s/it]

56


 24% 57/236 [04:34<13:01,  4.37s/it]

57


 25% 58/236 [04:38<13:04,  4.41s/it]

58


 25% 59/236 [04:44<14:05,  4.78s/it]

59


 25% 60/236 [04:48<12:57,  4.42s/it]

60


 26% 61/236 [04:53<13:19,  4.57s/it]

61


 26% 62/236 [04:56<12:39,  4.37s/it]

62


 27% 63/236 [05:01<13:08,  4.56s/it]

63


 27% 64/236 [05:07<13:33,  4.73s/it]

64


 28% 65/236 [05:11<12:50,  4.51s/it]

65


 28% 66/236 [05:17<14:34,  5.15s/it]

66


 28% 67/236 [05:22<14:05,  5.00s/it]

67


 29% 68/236 [05:25<12:45,  4.56s/it]

68


 29% 69/236 [05:31<13:22,  4.81s/it]

69


 30% 70/236 [05:34<12:15,  4.43s/it]

70


 30% 71/236 [05:37<10:58,  3.99s/it]

71


 31% 72/236 [05:42<11:14,  4.12s/it]

72


 31% 73/236 [05:47<12:03,  4.44s/it]

73


 31% 74/236 [05:51<11:43,  4.34s/it]

74


 32% 75/236 [05:54<10:18,  3.84s/it]

75


 32% 76/236 [05:59<11:15,  4.22s/it]

76


 33% 77/236 [06:03<11:21,  4.28s/it]

77


 33% 78/236 [06:08<11:26,  4.34s/it]

78


 33% 79/236 [06:12<11:34,  4.42s/it]

79


 34% 80/236 [06:17<11:33,  4.45s/it]

80


 34% 81/236 [06:22<11:41,  4.53s/it]

81


 35% 82/236 [06:26<11:36,  4.52s/it]

82


 35% 83/236 [06:31<11:59,  4.70s/it]

83


 36% 84/236 [06:37<13:00,  5.13s/it]

84


 36% 85/236 [06:44<14:22,  5.71s/it]

85


 36% 86/236 [06:50<13:53,  5.55s/it]

86


 37% 87/236 [06:55<13:40,  5.51s/it]

87


 37% 88/236 [07:00<13:07,  5.32s/it]

88


 38% 89/236 [07:06<13:27,  5.50s/it]

89


 38% 90/236 [07:10<12:36,  5.18s/it]

90


 39% 91/236 [07:14<11:31,  4.77s/it]

91


 39% 92/236 [07:19<11:16,  4.70s/it]

92


 39% 93/236 [07:22<10:26,  4.38s/it]

93


 40% 94/236 [07:26<09:48,  4.14s/it]

94


 40% 95/236 [07:33<11:34,  4.93s/it]

95


 41% 96/236 [07:39<12:16,  5.26s/it]

96


 41% 97/236 [07:42<10:47,  4.66s/it]

97


 42% 98/236 [07:44<08:57,  3.90s/it]

98


 42% 99/236 [07:47<08:39,  3.79s/it]

99


 42% 100/236 [07:49<07:03,  3.11s/it]

100


 43% 101/236 [07:52<06:58,  3.10s/it]

101


 43% 102/236 [07:57<08:24,  3.77s/it]

102


 44% 103/236 [08:02<08:42,  3.93s/it]

103


 44% 104/236 [08:04<07:28,  3.40s/it]

104


 44% 105/236 [08:06<06:39,  3.05s/it]

105


 45% 106/236 [08:11<07:37,  3.52s/it]

106


 45% 107/236 [08:13<06:44,  3.14s/it]

107


 46% 108/236 [08:16<06:42,  3.15s/it]

108


 46% 109/236 [08:19<06:37,  3.13s/it]

109


 47% 110/236 [08:25<08:00,  3.82s/it]

110


 47% 111/236 [08:27<06:50,  3.29s/it]

111


 47% 112/236 [08:30<06:38,  3.21s/it]

112


 48% 113/236 [08:33<06:19,  3.09s/it]

113


 48% 114/236 [08:37<06:58,  3.43s/it]

114


 49% 115/236 [08:41<07:21,  3.65s/it]

115


 49% 116/236 [08:44<06:57,  3.48s/it]

116


 50% 117/236 [08:46<05:48,  2.93s/it]

117


 50% 118/236 [08:48<05:33,  2.82s/it]

118


 50% 119/236 [08:51<05:13,  2.68s/it]

119


 51% 120/236 [08:54<05:43,  2.96s/it]

120


 51% 121/236 [08:58<06:07,  3.20s/it]

121


 52% 122/236 [09:01<05:53,  3.10s/it]

122


 52% 123/236 [09:05<06:24,  3.40s/it]

123


 53% 124/236 [09:09<06:54,  3.70s/it]

124


 53% 125/236 [09:12<06:02,  3.26s/it]

125


 53% 126/236 [09:15<06:16,  3.42s/it]

126


 54% 127/236 [09:20<06:45,  3.72s/it]

127


 54% 128/236 [09:24<07:06,  3.95s/it]

128


 55% 129/236 [09:30<07:53,  4.43s/it]

129


 55% 130/236 [09:36<08:53,  5.03s/it]

130


 56% 131/236 [09:39<07:43,  4.41s/it]

131


 56% 132/236 [09:41<06:30,  3.76s/it]

132


 56% 133/236 [09:44<05:41,  3.31s/it]

133


 57% 134/236 [09:48<05:52,  3.46s/it]

134


 57% 135/236 [09:52<06:27,  3.83s/it]

135


 58% 136/236 [09:55<05:54,  3.54s/it]

136


 58% 137/236 [09:58<05:31,  3.34s/it]

137


 58% 138/236 [10:01<05:07,  3.14s/it]

138


 59% 139/236 [10:03<04:38,  2.88s/it]

139


 59% 140/236 [10:05<04:08,  2.59s/it]

140


 60% 141/236 [10:08<04:16,  2.70s/it]

141


 60% 142/236 [10:13<05:13,  3.34s/it]

142


 61% 143/236 [10:16<05:00,  3.24s/it]

143


 61% 144/236 [10:19<05:13,  3.41s/it]

144


 61% 145/236 [10:23<05:06,  3.37s/it]

145


 62% 146/236 [10:25<04:29,  2.99s/it]

146


 62% 147/236 [10:29<04:49,  3.26s/it]

147


 63% 148/236 [10:32<04:57,  3.38s/it]

148


 63% 149/236 [10:36<05:02,  3.48s/it]

149


 64% 150/236 [10:40<05:22,  3.76s/it]

150


 64% 151/236 [10:44<05:03,  3.57s/it]

151


 64% 152/236 [10:48<05:21,  3.83s/it]

152


 65% 153/236 [10:50<04:23,  3.17s/it]

153


 65% 154/236 [10:53<04:14,  3.10s/it]

154


 66% 155/236 [10:57<04:47,  3.54s/it]

155


 66% 156/236 [10:59<04:09,  3.12s/it]

156


 67% 157/236 [11:04<04:48,  3.65s/it]

157


 67% 158/236 [11:08<04:46,  3.68s/it]

158


 67% 159/236 [11:12<04:53,  3.81s/it]

159


 68% 160/236 [11:14<04:08,  3.27s/it]

160


 68% 161/236 [11:17<04:03,  3.24s/it]

161


 69% 162/236 [11:20<03:55,  3.18s/it]

162


 69% 163/236 [11:25<04:30,  3.70s/it]

163


 69% 164/236 [11:29<04:33,  3.80s/it]

164


 70% 165/236 [11:33<04:30,  3.82s/it]

165


 70% 166/236 [11:37<04:24,  3.78s/it]

166


 71% 167/236 [11:40<04:11,  3.65s/it]

167


 71% 168/236 [11:44<04:17,  3.79s/it]

168


 72% 169/236 [11:49<04:26,  3.97s/it]

169


 72% 170/236 [11:53<04:33,  4.14s/it]

170


 72% 171/236 [11:58<04:35,  4.24s/it]

171


 73% 172/236 [12:04<05:08,  4.81s/it]

172


 73% 173/236 [12:07<04:41,  4.48s/it]

173


 74% 174/236 [12:13<04:53,  4.73s/it]

174


 74% 175/236 [12:16<04:27,  4.39s/it]

175


 75% 176/236 [12:21<04:21,  4.36s/it]

176


 75% 177/236 [12:24<04:05,  4.16s/it]

177


 75% 178/236 [12:30<04:19,  4.48s/it]

178


 76% 179/236 [12:34<04:17,  4.52s/it]

179


 76% 180/236 [12:38<04:04,  4.36s/it]

180


 77% 181/236 [12:41<03:33,  3.89s/it]

181


 77% 182/236 [12:44<03:10,  3.53s/it]

182


 78% 183/236 [12:49<03:31,  3.99s/it]

183


 78% 184/236 [12:53<03:36,  4.17s/it]

184


 78% 185/236 [12:59<03:49,  4.49s/it]

185


 79% 186/236 [13:02<03:27,  4.16s/it]

186


 79% 187/236 [13:06<03:16,  4.00s/it]

187


 80% 188/236 [13:09<03:02,  3.80s/it]

188


 80% 189/236 [13:15<03:34,  4.56s/it]

189


 81% 190/236 [13:21<03:41,  4.82s/it]

190


 81% 191/236 [13:28<04:10,  5.57s/it]

191


 81% 192/236 [13:34<04:06,  5.61s/it]

192


 82% 193/236 [13:37<03:36,  5.03s/it]

193


 82% 194/236 [13:42<03:20,  4.78s/it]

194


 83% 195/236 [13:46<03:16,  4.80s/it]

195


 83% 196/236 [13:52<03:23,  5.08s/it]

196


 83% 197/236 [13:58<03:29,  5.36s/it]

197


 84% 198/236 [14:05<03:44,  5.91s/it]

198


 84% 199/236 [14:11<03:31,  5.72s/it]

199


 85% 200/236 [14:15<03:11,  5.33s/it]

200


 85% 201/236 [14:18<02:41,  4.62s/it]

201


 86% 202/236 [14:22<02:28,  4.36s/it]

202


 86% 203/236 [14:30<03:02,  5.54s/it]

203


 86% 204/236 [14:35<02:52,  5.40s/it]

204


 87% 205/236 [14:39<02:36,  5.04s/it]

205


 87% 206/236 [14:43<02:18,  4.63s/it]

206


 88% 207/236 [14:49<02:24,  4.97s/it]

207


 88% 208/236 [14:54<02:23,  5.13s/it]

208


 89% 209/236 [14:59<02:11,  4.88s/it]

209


 89% 210/236 [15:05<02:18,  5.33s/it]

210


 89% 211/236 [15:12<02:29,  5.96s/it]

211


 90% 212/236 [15:20<02:36,  6.52s/it]

212


 90% 213/236 [15:23<02:07,  5.53s/it]

213


 91% 214/236 [15:28<01:54,  5.23s/it]

214


 91% 215/236 [15:32<01:44,  5.00s/it]

215


 92% 216/236 [15:37<01:36,  4.81s/it]

216


 92% 217/236 [15:41<01:30,  4.77s/it]

217


 92% 218/236 [15:48<01:33,  5.22s/it]

218


 93% 219/236 [15:52<01:25,  5.04s/it]

219


 93% 220/236 [15:56<01:13,  4.58s/it]

220


 94% 221/236 [16:02<01:17,  5.17s/it]

221


 94% 222/236 [16:08<01:13,  5.27s/it]

222


 94% 223/236 [16:14<01:09,  5.38s/it]

223


 95% 224/236 [16:19<01:06,  5.52s/it]

224


100% 236/236 [16:20<00:00,  4.16s/it]

225
226
227
228
229
230
231
232
233
234
235
CPU times: user 45.5 s, sys: 2.56 s, total: 48 s
Wall time: 16min 20s





Unnamed: 0,date,timestamp,price,target,BITCOIN_NE.text_COUNT_last1days,BITCOIN_NE.text_last1days,COINJOURNA.text_last1days,COINJOURNA.text_COUNT_last1days,BTC_FEARGR.bitcoin_search_last1days,FOX_NEWS_B.title_COUNT_last1days,...,CRYPTOCURR.text_last1days_v2,CRYPTOCURR.text_COUNT_last1days_v1,CRYPTOCURR.text_COUNT_last1days_v2,count_sum,text_full,res_pos,res_neg,res_neu,res_pos_count,res_neg_count
0,01-01-2020 17:00:00,1.577898e+09,7212.7,1,3,[etoro ceo yoni assia on reaching 12 million u...,[],0,14,84,...,"[line79, youtube13eamt4 indicator fx ]",1,2,163,[etoro ceo yoni assia on reaching 12 million u...,0.343571,0.631427,2.025002,2.0,1.0
1,02-01-2020 17:00:00,1.577984e+09,6989.4,0,7,[government confirms crypto profits not taxabl...,[],0,14,76,...,"[23mt4 ea fx , line81, line82, 24mt4 ea fx ...",0,5,162,[government confirms crypto profits not taxabl...,1.525628,2.151473,3.322899,2.0,5.0
2,03-01-2020 17:00:00,1.578071e+09,7367.5,1,8,[ticker tool uses the bch blockchain to provid...,[],0,14,160,...,"[youtube14eamt4 indicator fx , line83]",2,2,278,[ticker tool uses the bch blockchain to provid...,0.790071,0.639061,6.570868,5.0,3.0
3,04-01-2020 17:00:00,1.578157e+09,7369.8,0,6,[crypto fundraising is changing again in 2020 ...,[],0,14,92,...,"[mt4 fx , 19ea, line85, line87, youtube17mt4 ...",1,7,169,[crypto fundraising is changing again in 2020 ...,1.064533,1.109780,3.825687,5.0,1.0
4,05-01-2020 17:00:00,1.578244e+09,7364.6,0,5,[subhash chandra garg on the future of crypto ...,[],0,14,35,...,"[, line93, macdmt4 fx , line91]",1,4,86,[subhash chandra garg on the future of crypto ...,1.266352,0.183858,3.549790,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,23-08-2020 17:00:00,1.598202e+09,11663.0,0,0,[],[],0,14,0,...,[],1,0,15,[you gotta have some fortitude to handle the s...,0.000000,0.000000,0.000000,0.0,0.0
232,25-08-2020 17:00:00,1.598375e+09,11341.0,0,0,[],[xrps active wallets hit 7day high despite 92 ...,1,15,0,...,[],0,0,16,[xrps active wallets hit 7day high despite 92 ...,0.000000,0.000000,0.000000,0.0,0.0
233,26-08-2020 17:00:00,1.598461e+09,11471.0,1,0,[],[the irs is sending tax warning letters to cry...,3,15,16,...,[defi pieformance],0,1,48,"[defi pieformance, brother of suzanne morphew ...",0.000000,0.000000,0.000000,0.0,0.0
234,27-08-2020 17:00:00,1.598548e+09,11343.0,0,0,[],[brazils central bank considering the developm...,4,15,0,...,[],0,0,19,[brazils central bank considering the developm...,0.000000,0.000000,0.000000,0.0,0.0


In [88]:
# Correlation of the per-row positive/negative counts with `target`,
# on rows where all three columns are present.
# `features` is kept as a notebook-level name because a later cell reuses it.
features = ['target', 'res_pos_count', 'res_neg_count']
complete_rows = df.loc[:, features].dropna()
complete_rows.corr().loc[:, 'target']

target           1.000000
res_pos_count   -0.019491
res_neg_count   -0.014140
Name: target, dtype: float64

In [96]:
# Correlation with `target` restricted to rows where BOTH counts are non-zero
# (uses the `features` list defined in an earlier cell).
# NOTE(review): `&` also drops rows that have only positive or only negative
# news; if the goal is just to exclude rows with no news at all, `|` may be
# what was intended - confirm.
has_both = df['res_pos_count'].ne(0) & df['res_neg_count'].ne(0)
df.loc[has_both, features].corr()['target']

target           1.000000
res_pos_count    0.005539
res_neg_count    0.015031
Name: target, dtype: float64

### full (уберу нейтральные, так как они не имеют смысла: как правило, новость всё равно будет нейтральная)

In [97]:
%%time
from tqdm import tqdm

df['res_pos_full_count'] = pd.Series()
df['res_neg_full_count'] = pd.Series()
df['res_pos_full_prob'] = pd.Series()
df['res_neg_full_prob'] = pd.Series()
# df['res_neu_full_count'] = pd.Series()
for i in tqdm(range(236)):
    print(i)
    sub_df = df.loc[i, :]#['BITCOIN_NE.text_last1days']
    text_length = len(sub_df['text_full'])
    res_pos = 0
    res_pos_p = 0
    res_neg = 0
    res_neg_p = 0
    for j in range(text_length):
        try:
            q = query(sub_df['text_full'][j])
            current_res = q[0]
            current_res_pos = current_res[0]['score']
            current_res_neg = current_res[1]['score']
            if current_res_pos>=current_res_neg:
                res_pos += 1
                res_pos_p += current_res_pos
            else:
                res_neg += 1
                res_neg_p += current_res_neg
        except Exception as e:
            continue
    df.loc[i, 'res_pos_full_count'] = res_pos
    df.loc[i, 'res_neg_full_count'] = res_neg
    df.loc[i, 'res_pos_full_prob'] = res_pos_p
    df.loc[i, 'res_neg_full_prob'] = res_neg_p
df

  0% 0/236 [00:00<?, ?it/s]

0


  0% 1/236 [01:54<7:29:35, 114.79s/it]

1


  1% 2/236 [03:40<7:05:53, 109.20s/it]

2


  1% 3/236 [06:56<9:38:22, 148.94s/it]

3


  2% 4/236 [08:50<8:43:36, 135.42s/it]

4


  2% 5/236 [09:44<6:47:08, 105.75s/it]

5


  3% 6/236 [11:42<7:01:17, 109.90s/it]

6


  3% 7/236 [14:55<8:44:15, 137.36s/it]

7


  3% 8/236 [18:34<10:20:36, 163.32s/it]

8


  4% 9/236 [22:41<11:56:21, 189.35s/it]

9


  4% 10/236 [26:18<12:24:52, 197.75s/it]

10


  5% 11/236 [29:07<9:55:53, 158.90s/it] 

11





KeyboardInterrupt: 

In [98]:
# Correlation of the full-text counts and score sums with `target`.
# dropna() keeps only rows that have all five values, i.e. rows the scoring
# loop has actually filled in.
features = [
    'target',
    'res_pos_full_count',
    'res_neg_full_count',
    "res_pos_full_prob",
    "res_neg_full_prob",
]
scored_rows = df.loc[:, features].dropna()
scored_rows.corr().loc[:, 'target']

target                1.000000
res_pos_full_count    0.231061
res_neg_full_count    0.064298
res_pos_full_prob     0.277992
res_neg_full_prob     0.055759
Name: target, dtype: float64