# Analysing Tweet Vectors

In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
import nltk
import eland as ed
import matplotlib.pyplot as plt
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA

# Show every column and every row whenever a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Project root is the parent of the current working directory;
# trained models live in its 'models' sub-folder.
project_dir = os.path.join(os.getcwd(), os.pardir)
models_dir = os.path.join(project_dir, 'models')

## Import data from Elasticsearch

In [9]:
# Fields to pull from the 'twitter' index on the 'localhost' Elasticsearch host.
tweet_columns = [
    'full_text_processed',
    'user_id',
    'verified',
    'name',
    'location',
    'entities.hashtags.text',
    'entities.user_mentions.name',
]
ed_df = ed.DataFrame('localhost', 'twitter', columns=tweet_columns)

# Bool query: keep only documents whose is_retweet term equals "true".
# NOTE(review): the name `query_unique` suggests unique (non-retweet) tweets, but the
# term filter below selects is_retweet == "true" and there is no is_quote_status
# clause — confirm which filter is actually intended.
query_unique = {
    "bool": {
        "must": {
            "term": {"is_retweet": "true"},
        }
    }
}

# Run the query through Eland, then materialise the matches as a pandas DataFrame.
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [10]:
# Keep only the first occurrence of each distinct processed tweet text.
df_tweets = df_tweets.drop_duplicates(subset=['full_text_processed'], keep='first')

In [11]:
# Count whitespace-separated tokens per tweet. The vectorised .str accessor yields
# NaN for missing text instead of raising AttributeError; such rows count as 0 tokens.
token_counts = (
    df_tweets['full_text_processed']
    .str.split()
    .str.len()
    .fillna(0)
    .astype(int)
)
# assign() returns a new frame, so the filtered frame is not mutated in place
# (avoids SettingWithCopyWarning and keeps the cell safe to re-run).
df_tweets = df_tweets.assign(length=token_counts)
# Keep only tweets with more than 4 tokens.
df_tweets = df_tweets[df_tweets['length'] > 4]

In [12]:
# (rows, columns) of the tweet frame at this point in the notebook.
df_tweets.shape

(22326, 8)

In [13]:
# Preview the first rows of the tweet frame.
df_tweets.head()

Unnamed: 0,full_text_processed,user_id,verified,name,location,entities.hashtags.text,entities.user_mentions.name,length
1264160611157643264,yesterday hon’ble shri ji made announcement r ...,999599078043365377,False,Adrija Banerjee,"Kolkata, India",,"[NSitharamanOffice, PMO India, Narendra Modi]",30
1264114392234430464,amidst people’s protest street kolkata non res...,999599078043365377,False,Adrija Banerjee,"Kolkata, India",,Amit Malviya,30
1264160578932813824,cyclone amphan vicious locust storm fast appro...,858598202,False,حبیب,"Doha, Qatar",coronavirus,Ashok Swain,11
1264160565540388864,speaking situation wake cyclone amphan,973281092,False,Puneet,"Noida, India",,Narendra Modi,5
1264160559752249344,prayer thought amp love affected devastation c...,937394260051034112,False,ᴍʀ.ғᴀɴ,"Chiplun, India",,Shah Rukh Khan,28


## Load the Tweet2Vec Model

In [7]:
## Load the saved tweet2vec (Doc2Vec) model from the models directory
model_path = join(models_dir, 'tweet2VecJared.model')
model = Doc2Vec.load(model_path)
doc_vectors = model.docvecs.vectors_docs    ## Tweet Vectors
doc_tags = list(model.docvecs.doctags)      ## Tweet Ids

In [16]:
# Position of this tweet id inside the doc_tags list (list.index raises ValueError if absent).
doc_tags.index('1264160578932813824')

4442

In [4]:
# Clustering Kmeans
# n_jobs is deliberately not passed: KMeans deprecated it in scikit-learn 0.23 and
# removed it in 1.0, where passing it raises a TypeError.
# n_init is pinned to 10 (the long-standing default) so results do not shift on
# scikit-learn releases where the default became 'auto'.
km_model = KMeans(n_clusters=10, n_init=10, random_state=10)
km_model.fit(doc_vectors)
# Get cluster assignment labels (one label per row of doc_vectors)
labels = km_model.labels_

In [18]:
# Query the 15 nearest tweets by tweet id instead of a hard-coded row offset, so the
# lookup keeps targeting the same tweet even if the model is retrained or re-ordered.
query_tweet_id = '1264160578932813824'
model.docvecs.most_similar(query_tweet_id, topn=15)

[('1264135768747933696', 0.9335355758666992),
 ('1264146417683218432', 0.9327142238616943),
 ('1264152710921637888', 0.9301825165748596),
 ('1264134726249406464', 0.9243108034133911),
 ('1264128163950592000', 0.9156315326690674),
 ('1264145683302494208', 0.9147990942001343),
 ('1264161210678665216', 0.9132680892944336),
 ('1264135483229196288', 0.9080426692962646),
 ('1264150953890062336', 0.9067995548248291),
 ('1264161672693874688', 0.9056249260902405),
 ('1264111689974386688', 0.9009899497032166),
 ('1264134887193276416', 0.8977228403091431),
 ('1264143915776724992', 0.8969724178314209),
 ('1264134220928163840', 0.8965578079223633),
 ('1264137819276095488', 0.896438479423523)]

In [None]:
# Clustering DBScan (library-default parameters)
dbscan_model = DBSCAN()
dbscan_model.fit(doc_vectors)
# Cluster label per document vector.
# NOTE(review): this rebinds `labels`, replacing the KMeans assignments stored under the same name.
labels = dbscan_model.labels_