In [2]:
import os

import pandas as pd
from elasticsearch import Elasticsearch, helpers
from os.path import join

es = Elasticsearch()
project_dir = join(os.getcwd(), os.pardir)
process_dir = join(project_dir, 'data', 'processed')

In [2]:
i = 0
resp = es.search(
    index="twitter",
     body={
        "size":10000,
        "query":{
            "bool":{
                "must":{
                    "term":{
                        "is_retweet":False
                    }
                }
            }
        }
    },
    scroll='3s'
)
data = []
old_scroll_id = resp['_scroll_id']
data = data + [{"text":x["_source"]["full_text_processed"] , "id":x["_id"]} for x in resp['hits']['hits']]

    
# use a 'while' iterator to loop over document 'hits'
while len(resp['hits']['hits']):

    # make a request using the Scroll API
    resp = es.scroll(
        scroll_id = old_scroll_id,
        scroll = '2s' # length of time to keep search context
    )

    # check if there's a new scroll ID
    if old_scroll_id != resp['_scroll_id']:
        print ("NEW SCROLL ID:", resp['_scroll_id'])

    # keep track of pass scroll _id
    old_scroll_id = resp['_scroll_id']
    i += 1
    data = data + [{"text":x["_source"]["full_text_trans"] , "id":x["_id"]} for x in resp['hits']['hits']]
        
len(data)

113342

In [7]:
df = pd.DataFrame(data)
df.to_csv(os.path.join(process_dir,'text_translated.csv'))

In [14]:
df = pd.read_csv(os.path.join(process_dir,'text_translated.csv'), lineterminator='\n')
for idx,row in df.iterrows():
    print(row['id'])
    body ={
        'doc':{
            'full_text_trans':row['text'],
        }
    }
    es.update(
        index='twitter',
        id=row['id'],
        body = body
    )

Unnamed: 0                                                    0
text          agree bengal extreme crisis people facing wate...
id                                          1263637941055819776
Name: 0, dtype: object


In [4]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc["text"], [doc["id"]]) for doc in data]

2020-08-25 01:04:01,739 - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
2020-08-25 01:04:01,740 - gensim.corpora.dictionary - INFO - built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [5]:
documents[0:5]

[TaggedDocument(words='cyclone amphan uprooted tree block road odishas bhadrak district read', tags=['1262984574683865088']),
 TaggedDocument(words='energyodisha damage caused cyclone amphan marshaghai atdarabachha', tags=['1262984551665672192']),
 TaggedDocument(words='mahuamoitra cyclone amphan 3 day notice let u see preparedness lack mr naveen patnaik limited casualty le 100 person cyclone fani let u see post mortem report devastation ha taken place continue blame fm also', tags=['1262984547693518848']),
 TaggedDocument(words='derekobrienmp cyclone amphan dealing prepared usual going blame centre devastation ha taken place going say centre gave 4 hour notice preparing cyclone', tags=['1262981202975223808']),
 TaggedDocument(words='cyclone amphan live update hour landfall million brace bangladesh india', tags=['1262984537123897344'])]

In [6]:
model = Doc2Vec(documents, vector_size=200, window=2, min_count=1, epochs=50)

2020-08-25 01:04:13,606 - gensim.models.doc2vec - INFO - collecting all words and their counts
2020-08-25 01:04:13,608 - gensim.models.doc2vec - INFO - PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-08-25 01:04:13,700 - gensim.models.doc2vec - INFO - PROGRESS: at example #10000, processed 1098617 words (12082008/s), 613 word types, 10000 tags
2020-08-25 01:04:13,785 - gensim.models.doc2vec - INFO - PROGRESS: at example #20000, processed 2157859 words (12536520/s), 830 word types, 20000 tags
2020-08-25 01:04:13,867 - gensim.models.doc2vec - INFO - PROGRESS: at example #30000, processed 3215347 words (13110926/s), 1055 word types, 30000 tags
2020-08-25 01:04:13,951 - gensim.models.doc2vec - INFO - PROGRESS: at example #40000, processed 4239033 words (12487859/s), 1196 word types, 40000 tags
2020-08-25 01:04:14,034 - gensim.models.doc2vec - INFO - PROGRESS: at example #50000, processed 5273689 words (12627966/s), 1419 word types, 50000 tags
2020-08-25 01:04:14

2020-08-25 01:05:00,822 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 15.45% examples, 187212 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:05:01,826 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 23.29% examples, 185689 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:05:02,846 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 31.35% examples, 185387 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:05:03,852 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 38.70% examples, 181542 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:05:04,856 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 45.83% examples, 180384 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:05:05,869 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 53.23% examples, 179412 words/s, in_qsize 6, out_qsize 1
2020-08-25 01:05:06,891 - gensim.models.base_any2vec - INFO - EPOCH 3 - PROGRESS: at 60.52% examples, 181244 words/s, in_qsize 6, out_qsize 0
2020-0

2020-08-25 01:05:49,681 - gensim.models.base_any2vec - INFO - EPOCH 6 - PROGRESS: at 86.09% examples, 190404 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:05:50,684 - gensim.models.base_any2vec - INFO - EPOCH 6 - PROGRESS: at 93.59% examples, 192118 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:05:51,502 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-08-25 01:05:51,513 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-08-25 01:05:51,515 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-08-25 01:05:51,516 - gensim.models.base_any2vec - INFO - EPOCH - 6 : training on 12534904 raw words (2493774 effective words) took 12.9s, 193145 effective words/s
2020-08-25 01:05:52,523 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 7.76% examples, 193477 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:05:53,530 - gensim.models.base_an

2020-08-25 01:06:36,557 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 45.27% examples, 177315 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:06:37,560 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 52.61% examples, 177322 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:06:38,561 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 60.07% examples, 180413 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:06:39,564 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 67.88% examples, 181951 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:06:40,578 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 75.72% examples, 182858 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:06:41,583 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 83.65% examples, 185153 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:06:42,599 - gensim.models.base_any2vec - INFO - EPOCH 10 - PROGRESS: at 91.54% examples, 186832 words/s, in_qsize 6, out_qsize 0

2020-08-25 01:07:22,750 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-08-25 01:07:22,751 - gensim.models.base_any2vec - INFO - EPOCH - 13 : training on 12534904 raw words (2492699 effective words) took 12.9s, 193617 effective words/s
2020-08-25 01:07:23,771 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 7.82% examples, 192166 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:07:24,772 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 15.71% examples, 189477 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:07:25,786 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 23.46% examples, 186061 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:07:26,791 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 31.35% examples, 185394 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:07:27,805 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 39.32% examples, 184070 words/s, in_qsize 5, out_qsize

2020-08-25 01:08:11,001 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 67.39% examples, 179872 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:08:12,016 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 75.21% examples, 180778 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:08:13,020 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 82.66% examples, 182336 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:08:14,021 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 90.22% examples, 183572 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:08:15,030 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 98.02% examples, 185992 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:08:15,275 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-08-25 01:08:15,277 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-08-25 01:08:15,281 - gensim.models.base_

2020-08-25 01:08:56,553 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 15.71% examples, 190047 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:08:57,556 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 23.55% examples, 187847 words/s, in_qsize 5, out_qsize 1
2020-08-25 01:08:58,583 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 31.62% examples, 186678 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:08:59,591 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 39.49% examples, 184736 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:09:00,596 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 46.83% examples, 185500 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:09:01,608 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 54.78% examples, 184957 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:09:02,632 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 62.31% examples, 186737 words/s, in_qsize 6, out_qsize 0

2020-08-25 01:09:45,966 - gensim.models.base_any2vec - INFO - EPOCH 24 - PROGRESS: at 91.14% examples, 185425 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:09:46,968 - gensim.models.base_any2vec - INFO - EPOCH 24 - PROGRESS: at 98.82% examples, 187449 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:09:47,113 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-08-25 01:09:47,115 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-08-25 01:09:47,117 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-08-25 01:09:47,118 - gensim.models.base_any2vec - INFO - EPOCH - 24 : training on 12534904 raw words (2491003 effective words) took 13.3s, 187790 effective words/s
2020-08-25 01:09:48,124 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 7.60% examples, 188897 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:09:49,127 - gensim.models.bas

2020-08-25 01:10:31,306 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 39.25% examples, 183767 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:10:32,308 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 46.97% examples, 186229 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:10:33,317 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 54.37% examples, 183645 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:10:34,320 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 61.51% examples, 185023 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:10:35,321 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 68.57% examples, 183718 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:10:36,333 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 75.89% examples, 183132 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:10:37,346 - gensim.models.base_any2vec - INFO - EPOCH 28 - PROGRESS: at 83.32% examples, 184165 words/s, in_qsize 6, out_qsize 0

2020-08-25 01:11:18,662 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-08-25 01:11:18,665 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-08-25 01:11:18,666 - gensim.models.base_any2vec - INFO - EPOCH - 31 : training on 12534904 raw words (2493651 effective words) took 13.2s, 189448 effective words/s
2020-08-25 01:11:19,674 - gensim.models.base_any2vec - INFO - EPOCH 32 - PROGRESS: at 7.82% examples, 194831 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:11:20,686 - gensim.models.base_any2vec - INFO - EPOCH 32 - PROGRESS: at 15.62% examples, 188922 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:11:21,698 - gensim.models.base_any2vec - INFO - EPOCH 32 - PROGRESS: at 23.64% examples, 187616 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:11:22,699 - gensim.models.base_any2vec - INFO - EPOCH 32 - PROGRESS: at 31.25% examples, 185217 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:11:23,

2020-08-25 01:12:05,777 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 58.55% examples, 173629 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:12:06,783 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 65.64% examples, 174605 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:07,786 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 72.69% examples, 174913 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:08,794 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 80.04% examples, 175974 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:09,808 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 87.90% examples, 177825 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:10,811 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 95.47% examples, 180676 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:11,397 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-08-25 01:12:11,39

2020-08-25 01:12:51,496 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 7.47% examples, 185028 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:12:52,503 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 15.43% examples, 187292 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:53,514 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 23.46% examples, 186658 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:54,534 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 31.62% examples, 186440 words/s, in_qsize 5, out_qsize 0
2020-08-25 01:12:55,539 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 39.43% examples, 184382 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:56,539 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 47.05% examples, 186393 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:12:57,542 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 54.59% examples, 184567 words/s, in_qsize 5, out_qsize 0


2020-08-25 01:13:40,692 - gensim.models.base_any2vec - INFO - EPOCH 42 - PROGRESS: at 83.63% examples, 184815 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:13:41,693 - gensim.models.base_any2vec - INFO - EPOCH 42 - PROGRESS: at 91.40% examples, 186438 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:13:42,698 - gensim.models.base_any2vec - INFO - EPOCH 42 - PROGRESS: at 99.13% examples, 188400 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:13:42,794 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-08-25 01:13:42,798 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-08-25 01:13:42,799 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-08-25 01:13:42,800 - gensim.models.base_any2vec - INFO - EPOCH - 42 : training on 12534904 raw words (2492733 effective words) took 13.2s, 188840 effective words/s
2020-08-25 01:13:43,845 - gensim.models.ba

2020-08-25 01:14:26,218 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 30.71% examples, 182961 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:14:27,235 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 38.70% examples, 181875 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:14:28,240 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 46.60% examples, 185099 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:14:29,246 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 54.23% examples, 183396 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:14:30,249 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 61.65% examples, 185779 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:14:31,252 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 69.38% examples, 186951 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:14:32,255 - gensim.models.base_any2vec - INFO - EPOCH 46 - PROGRESS: at 77.03% examples, 186454 words/s, in_qsize 6, out_qsize 0

2020-08-25 01:15:14,489 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-08-25 01:15:14,492 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-08-25 01:15:14,492 - gensim.models.base_any2vec - INFO - EPOCH - 49 : training on 12534904 raw words (2491932 effective words) took 13.0s, 191417 effective words/s
2020-08-25 01:15:15,505 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 7.53% examples, 185429 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:15:16,515 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 15.27% examples, 184380 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:15:17,521 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 23.37% examples, 185723 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:15:18,532 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 31.25% examples, 184792 words/s, in_qsize 6, out_qsize 0
2020-08-25 01:15:19,

In [7]:
model.save(join(project_dir, 'models', 'tweet2VecJared.model'))

2020-08-25 01:15:27,392 - gensim.utils - INFO - saving Doc2Vec object under /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model, separately None
2020-08-25 01:15:27,393 - gensim.utils - INFO - storing np array 'vectors_docs' to /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model.docvecs.vectors_docs.npy
2020-08-25 01:15:27,748 - gensim.utils - INFO - saved /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model


In [8]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model = Doc2Vec.load(join(project_dir, 'models', 'tweet2VecJared.model'))
list(model.docvecs.doctags.keys())

2020-08-25 01:15:27,758 - gensim.utils - INFO - loading Doc2Vec object from /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model
2020-08-25 01:15:27,881 - gensim.utils - INFO - loading vocabulary recursively from /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model.vocabulary.* with mmap=None
2020-08-25 01:15:27,883 - gensim.utils - INFO - loading trainables recursively from /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model.trainables.* with mmap=None
2020-08-25 01:15:27,883 - gensim.utils - INFO - loading wv recursively from /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model.wv.* with mmap=None
2020-08-25 01:15:27,884 - gensim.utils - INFO - loading docvecs recursively from /home/jaredross/personal/solve-iwmi/notebooks/../models/tweet2VecJared.model.docvecs.* with mmap=None
2020-08-25 01:15:27,885 - gensim.utils - INFO - loading vectors_docs from /home/jaredross/personal/solve-iwmi/not

['1262984574683865088',
 '1262984551665672192',
 '1262984547693518848',
 '1262981202975223808',
 '1262984537123897344',
 '1262968409828012032',
 '1262957704982781952',
 '1262957248432812032',
 '1262948704132218880',
 '1262934318113947648',
 '1262930543215329280',
 '1262942946896347136',
 '1262967878065819648',
 '1262931388988133376',
 '1262984510507016192',
 '1262976549692887040',
 '1262954298520014848',
 '1262946567486607360',
 '1262937944136069120',
 '1262984454450016256',
 '1262984448439508992',
 '1262984443263885312',
 '1262981515148849152',
 '1262984408795144192',
 '1262959317629755392',
 '1262936308818862080',
 '1262984408019218432',
 '1262972713548894208',
 '1262942198372413440',
 '1262938239435866112',
 '1262984407553638400',
 '1262984403426385920',
 '1262984376779976704',
 '1262969639329574912',
 '1262961907729936384',
 '1262984327526318080',
 '1262984321545183232',
 '1262965567943774208',
 '1262951145162641408',
 '1262984280617242624',
 '1262984265538600960',
 '12629791184356

In [9]:
len(list(model.docvecs.doctags.keys()))

113342

In [10]:
import pickle
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

if 1==1:
    for i in range(0,int(len(list(model.docvecs.doctags.keys()))/10000)):
        start = (i*10000)
        end = (i*10000)+10000
        print(f'{start} to {end}')
        doc_tags = list(model.docvecs.doctags.keys())
        X = model[doc_tags[start:end]]

        tsne = TSNE()
        X_tsne = tsne.fit_transform(X)
        df = pd.DataFrame(X_tsne, index=doc_tags[start:end], columns=['x', 'y'])
        with open(join(project_dir, 'models', f'tnse_{i}.model'), 'wb') as f:
            pickle.dump(tsne, f)

        pd.np.save(join(project_dir, 'models', f'tnse_vectors_{i}.npy'), X_tsne)
        
with open(join(project_dir, 'models', f'tnse_0.model'),'rb') as f:
    tsne = pickle.load(f)
    
df = pd.np.load(join(project_dir, 'models', f'tnse_vectors_0.npy'))

df = pd.DataFrame(df,
                index=pd.Index(list(model.docvecs.doctags.keys())[0:10000]),
                columns=[u'x', u'y'])

0 to 10000




10000 to 20000




20000 to 30000




30000 to 40000




40000 to 50000




50000 to 60000




60000 to 70000




70000 to 80000




80000 to 90000




90000 to 100000




100000 to 110000




In [11]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()
df[u'tweet_id'] = df.index
df.head()

Unnamed: 0,x,y,tweet_id
1262984574683865088,11.493409,44.003239,1262984574683865088
1262984551665672192,-19.993698,3.520919,1262984551665672192
1262984547693518848,-15.968758,-15.525149,1262984547693518848
1262981202975223808,33.556671,16.178394,1262981202975223808
1262984537123897344,-60.729088,23.102188,1262984537123897344


In [12]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(df)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@tweet_id') )

# draw the words as circles on the plot
tsne_plot.circle(u'x', u'y', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);
