# Loading data

In [1]:
import pandas as pd
df = pd.read_csv('.../TrainTestTOT.csv', delimiter=',', skiprows=0, lineterminator='\n', low_memory=False)
df

Unnamed: 0.1,Unnamed: 0,date,user,timestamp,clean,pos,neg,neu,compound,value,week\r
0,0,2019-05-10,IdeasPrime,2019-05-10 21:42:01+00,btc usd btcusd btc usd btc fibs long short wcx,0.0000,0.000,1.000,0.000,bull\r\r,19
1,4,2019-05-10,harbington,2019-05-10 13:20:10+00,eth also possible ethereum pump time bitcoin d...,0.5859,0.081,0.704,0.215,bull\r\r,19
2,5,2019-05-12,ojo_azul,2019-05-12 15:37:12+00,weenzee invest arbitrage bitcoin risk antihack...,-0.2732,0.110,0.890,0.000,bull\r\r,19
3,7,2019-05-12,crypto_pump_1,2019-05-12 19:50:41+00,binance update fun aion rlc sc join telegram c...,0.6705,0.000,0.744,0.256,bull\r\r,19
4,9,2019-05-10,DavidWa20090980,2019-05-10 18:09:06+00,could bitcoin pin looking,0.0000,0.000,1.000,0.000,bull\r\r,19
...,...,...,...,...,...,...,...,...,...,...,...
599995,59995,2019-11-03,fairfieldcurren,2019-11-03 05:21:52+00,btc capital management inc million holdings ma...,0.0000,0.000,1.000,0.000,bear\r\r,44
599996,59996,2019-11-03,leboncoincrypto,2019-11-03 11:04:17+00,blockchain crypto bitcoin ethereum trading fir...,-0.7184,0.316,0.684,0.000,bear\r\r,44
599997,59997,2019-11-03,ao_coin,2019-11-03 05:00:27+00,algorithm mtlusdt desable trade btc volatiliy,0.0000,0.000,1.000,0.000,bear\r\r,44
599998,59998,2019-11-03,theautomatski,2019-11-03 15:35:16+00,investments wavesplatform cybersecurity detroi...,0.5423,0.000,0.877,0.123,bear\r\r,44


# Inizio di doc2vec

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#tokenize e tag il testo dei tweets
tweet_document = [TaggedDocument(doc.split(' '), [i]) 
             for i, doc in enumerate(df.clean)]
#visualizza i tweet taggati
tweet_document[:2]



[TaggedDocument(words=['btc', 'usd', 'btcusd', 'btc', 'usd', 'btc', 'fibs', 'long', 'short', 'wcx'], tags=[0]),
 TaggedDocument(words=['eth', 'also', 'possible', 'ethereum', 'pump', 'time', 'bitcoin', 'drops', 'saying', 'going', 'happen', 'looking', 'eth', 'btc', 'chart', 'definitely', 'cards', 'kind', 'tension', 'field', 'right', 'btc', 'crypto'], tags=[1])]

# Modello

In [3]:
#instanzia il modello
model = Doc2Vec(vector_size=64, window=2, min_count=1, workers=8, epochs = 10)
#crea il vocabolario
model.build_vocab(tweet_document)
#addestra il modello
model.train(tweet_document, total_examples=model.corpus_count, epochs=model.epochs)

In [4]:
model.wv.key_to_index 

{'bitcoin': 0,
 'btc': 1,
 'crypto': 2,
 'cryptocurrency': 3,
 'blockchain': 4,
 '’': 5,
 'eth': 6,
 'price': 7,
 'ethereum': 8,
 'xrp': 9,
 'buy': 10,
 'market': 11,
 'money': 12,
 'trading': 13,
 'k': 14,
 'time': 15,
 'like': 16,
 'get': 17,
 'news': 18,
 'new': 19,
 'amp': 20,
 'one': 21,
 'via': 22,
 'ltc': 23,
 '️': 24,
 'people': 25,
 'usd': 26,
 'binance': 27,
 'litecoin': 28,
 'gt': 29,
 'top': 30,
 'would': 31,
 'see': 32,
 'free': 33,
 'gold': 34,
 'good': 35,
 'jobs': 36,
 'us': 37,
 'hiring': 38,
 'careers': 39,
 'cryptocurrencies': 40,
 'think': 41,
 'last': 42,
 'exchange': 43,
 'go': 44,
 'long': 45,
 'still': 46,
 'going': 47,
 'use': 48,
 'coin': 49,
 'bch': 50,
 'ripple': 51,
 'trade': 52,
 'value': 53,
 'make': 54,
 'know': 55,
 'info': 56,
 'mining': 57,
 'back': 58,
 'ico': 59,
 'trx': 60,
 'today': 61,
 '🚀': 62,
 'could': 63,
 'world': 64,
 'twitter': 65,
 'next': 66,
 'first': 67,
 'h': 68,
 'day': 69,
 'altcoins': 70,
 'altcoin': 71,
 '“': 72,
 'investment': 73

In [5]:
model.wv.most_similar("btc")

[('bitcoin', 0.9114128947257996),
 ('btc\r', 0.8768987655639648),
 ('xrp', 0.871158242225647),
 ('crypto', 0.84275883436203),
 ('ltc', 0.8397266864776611),
 ('eth', 0.8348677754402161),
 ('today', 0.8314163684844971),
 ('think', 0.829985499382019),
 ('still', 0.8124943971633911),
 ('usd', 0.8081998825073242)]

In [6]:
#genera i vettori
tweet2vec = [model.infer_vector((df['clean'][i].split(' '))) 
            for i in range(0,len(df['clean']))]
tweet2vec[:2]

[array([ 0.08962892, -0.01478323,  0.05245907,  0.05461497, -0.0188525 ,
        -0.04777892,  0.0092875 ,  0.01618084, -0.10712815, -0.09093402,
         0.03352208, -0.07498326, -0.06781733,  0.00666242, -0.02457455,
         0.03793296, -0.11685763, -0.05576555, -0.05398546,  0.03973123,
         0.0720048 ,  0.008313  ,  0.05347347, -0.13134374, -0.00815444,
         0.04633424, -0.04227548, -0.11533497,  0.00181395,  0.00830524,
        -0.01065396,  0.04598576, -0.11587417, -0.07234548,  0.01375864,
         0.04261308,  0.02706897, -0.05922289,  0.07462559, -0.0369884 ,
        -0.03731005, -0.03903158, -0.04075321,  0.02992823,  0.02780966,
        -0.04794156, -0.04341198, -0.0423797 , -0.01117093,  0.05919836,
         0.13711953, -0.04248716,  0.05999739,  0.02866414,  0.07604454,
         0.06306117,  0.04247242, -0.01627171, -0.00887814, -0.00320528,
        -0.0187212 , -0.11424013,  0.05072112, -0.02217848], dtype=float32),
 array([ 0.16117923,  0.06269649, -0.13682964, 

In [8]:
import numpy as np
#Crea una lista di liste
dtv= np.array(tweet2vec).tolist()
#inserisce la list tra le colonne del dataframe
df['tweet2vec'] = dtv
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,user,timestamp,clean,pos,neg,neu,compound,value,week\r,tweet2vec
0,0,2019-05-10,IdeasPrime,2019-05-10 21:42:01+00,btc usd btcusd btc usd btc fibs long short wcx,0.0,0.0,1.0,0.0,bull\r\r,19,"[0.12421877682209015, -0.01608162932097912, 0...."
1,4,2019-05-10,harbington,2019-05-10 13:20:10+00,eth also possible ethereum pump time bitcoin d...,0.5859,0.081,0.704,0.215,bull\r\r,19,"[-0.08004117757081985, -0.06177661940455437, -..."


In [9]:
#trova i documenti più simili
model.dv.most_similar(1)

[(563075, 0.6433509588241577),
 (573362, 0.6373852491378784),
 (426576, 0.636681318283081),
 (13223, 0.6200230717658997),
 (570068, 0.6102948784828186),
 (179431, 0.6058738827705383),
 (222672, 0.605529248714447),
 (34164, 0.6035187244415283),
 (194992, 0.5991600155830383),
 (171509, 0.595057487487793)]

In [26]:
test = "ath coming".split()
print(model.dv.most_similar([model.infer_vector(test)]))

[(91267, 0.7745657563209534), (472710, 0.7718918323516846), (54320, 0.7630526423454285), (476990, 0.7587276101112366), (72216, 0.7586725354194641), (26872, 0.7573115825653076), (420509, 0.7537076473236084), (99013, 0.7533555626869202), (539318, 0.7508599758148193), (381479, 0.7502236366271973)]


In [43]:
print(df.loc[91267])

Unnamed: 0                                                12939
date                                                 2019-06-25
user                                                     CamBTC
timestamp                                2019-06-25 23:57:33+00
clean                         expecting btc go ath alts recover
pos                                                           0
neg                                                           0
neu                                                           1
compound                                                      0
value                                                  bull\r\r
week\r                                                       26
tweet2vec     [0.04444393143057823, 0.011828224174678326, 0....
Name: 91267, dtype: object


In [37]:
print(df.loc[72216])

Unnamed: 0                                                45842
date                                                 2019-06-08
user                                                 SARL_SAGER
timestamp                                2019-06-08 03:35:11+00
clean                                       bitcoin moon boys\r
pos                                                           0
neg                                                           0
neu                                                           1
compound                                                      0
value                                                  bull\r\r
week\r                                                       23
tweet2vec     [0.020176200196146965, -0.019622448831796646, ...
Name: 72216, dtype: object


In [27]:
print(df.loc[472710])

Unnamed: 0                                                28994
date                                                 2019-06-26
user                                                RipzHustles
timestamp                                2019-06-26 20:24:53+00
clean                                      bitcoin ath soooooon
pos                                                           0
neg                                                           0
neu                                                           1
compound                                                      0
value                                                  bull\r\r
week\r                                                       26
tweet2vec     [0.08409928530454636, 0.017843421548604965, 0....
Name: 472710, dtype: object


In [30]:
test = "bitcoin value going up".split()
print(model.dv.most_similar([model.infer_vector(test)]))

[(449141, 0.7363398671150208), (10874, 0.7312799096107483), (21303, 0.695338785648346), (576569, 0.6950658559799194), (532008, 0.6943750381469727), (459119, 0.693573534488678), (473521, 0.6920057535171509), (267829, 0.6856286525726318), (419211, 0.6828756928443909), (40677, 0.6744663119316101)]


In [31]:
print(df.loc[449141])

Unnamed: 0                                                12492
date                                                 2019-06-04
user                                              theGoddessDri
timestamp                                2019-06-04 16:43:49+00
clean                       btc value going knew would happen\r
pos                                                       0.324
neg                                                           0
neu                                                       0.676
compound                                                   0.34
value                                                  bear\r\r
week\r                                                       23
tweet2vec     [0.10508862882852554, 0.01932787522673607, 0.0...
Name: 449141, dtype: object


In [51]:
#Verifica del tweet prima della pulizia del testo
df1 = pd.read_csv('.../drop2-4.csv', delimiter=',', skiprows=0, lineterminator='\n', low_memory=False)
df1

Unnamed: 0.1,Unnamed: 0,date,user,timestamp,text,compound,neg,neu,pos\r
0,2000001,2019-05-13,GigaBitcoin,2019-05-13 18:05:18+00,Bitcoin at the Races: Overall Market Cap of #1...,0.0000,0.000,1.000,0.000
1,2000002,2019-05-13,cocainum_,2019-05-13 18:05:18+00,@BitBase_es A few days ago I saw a dude tradi...,-0.4871,0.083,0.917,0.000
2,2000003,2019-05-13,ojo_azul,2019-05-13 18:05:18+00,"Arbistar 2.0, the Spanish version of bitcoin a...",0.0000,0.000,1.000,0.000
3,2000004,2019-05-13,ojo_azul,2019-05-13 18:05:14+00,"Arbistar 2.0, the Spanish version of bitcoin a...",0.0000,0.000,1.000,0.000
4,2000005,2019-05-13,Excellion,2019-05-13 18:05:15+00,@hivedotone @WhalePanda @TraceMayer @bendavenp...,0.4215,0.000,0.781,0.219
...,...,...,...,...,...,...,...,...,...
1999994,3999995,2019-06-18,joshgnosis,2019-06-18 11:08:20+00,@swearyanthony @crankynick I just remember it ...,0.0000,0.000,1.000,0.000
1999995,3999996,2019-06-18,Vimalth82226643,2019-06-18 11:08:19+00,@cryptoastblog These Three Reasons Are Pushing...,0.3182,0.000,0.929,0.071
1999996,3999997,2019-06-18,FixedFloat,2019-06-18 11:08:17+00,Exchange rates for today💚\n\n#BTC #ETH https:/...,0.0000,0.000,1.000,0.000
1999997,3999998,2019-06-18,gaborgurbacs,2019-06-18 11:07:53+00,@TheStalwart Yes. The Libra-killer was unveile...,0.4019,0.000,0.803,0.197


In [52]:
pd.set_option('display.max_colwidth',700)

In [53]:
df1[df1.user== 'theGoddessDri']

Unnamed: 0.1,Unnamed: 0,date,user,timestamp,text,compound,neg,neu,pos\r
1338444,3338445,2019-06-04,theGoddessDri,2019-06-04 16:43:49+00,"Btc value is going up again, I knew that would happen.\r",0.34,0.0,0.789,0.211
