# Word vector differences across groups

In [36]:
import pandas as pd
from nltk import download, tokenize, word_tokenize 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import numpy as np 

def preprocess_word(doc):
    """Turn a raw text string into a list of cleaned, lowercase word tokens.

    The text is lower-cased and split with ``word_tokenize``; a token is kept
    only if it is purely alphabetic, is not in the module-level ``stop_words``
    list, and is not one of the literal tokens ``'n'`` or ``'br'``.

    Parameters
    ----------
    doc : str
        Raw text (e.g. the text of one tweet).

    Returns
    -------
    list of str
        Surviving tokens, in their original order.
    """
    # A set gives O(1) membership tests; the original scanned the stop-word
    # list for every token and then removed 'n'/'br' with repeated
    # count()/remove() passes, which is quadratic in the number of tokens.
    excluded = set(stop_words)
    # 'n' and 'br' are presumably residue of escaped newlines and <br> tags
    # in the raw text -- TODO confirm against the data source.
    excluded.update(('n', 'br'))
    return [
        token
        for token in word_tokenize(doc.lower())
        if token.isalpha() and token not in excluded
    ]

In [7]:
# Load the labelled/clustered tweet sample and attach a 'TweetTextNew' column
# holding the token list that preprocess_word produces for each 'TweetText'.
sample_tweets_df = (
    pd.read_csv('data/sample_with_label_and_clusters.csv')
    .assign(TweetTextNew=lambda frame: frame['TweetText'].apply(preprocess_word))
)

The extracted word vector files for each cluster can be found via the Google Drive link in the readme.md file. These files must be placed in the data/ directory for the next few cells to work.

In [9]:
import pickle

In [32]:
# Load one pickled word-vector model per distinct value of the 'K5' column.
# `models` follows the order in which labels first appear in the column (the
# order unique() returns), so models[i] pairs with k5_clusters[i] and is NOT
# necessarily the model for cluster id i.
k5_clusters = sample_tweets_df['K5'].unique()
models = []
for k in k5_clusters:
    filename = 'data/wordvectors_from_cluster'+str(k)+'.pkl'
    # The context manager closes the file even if unpickling fails; the
    # original left every handle open.
    # NOTE: pickle.load can execute arbitrary code -- only load files obtained
    # from a trusted source.
    with open(filename, 'rb') as filehandler:
        models.append(pickle.load(filehandler))

In [34]:
# Report, one line per loaded model, the length of its vocabulary's cum_table
# (presumably one entry per vocabulary word -- TODO confirm).
vocab_sizes = [len(loaded.vocabulary.cum_table) for loaded in models]
for size in vocab_sizes:
    print(size)

136296
74257
55574
57354
52265


The vocabulary size differs between the resulting word vector sets, so we restrict the comparison to common words that exist in every set being compared.

In [37]:
# Flatten the per-tweet token lists into one long list of word occurrences.
vocab = [
    token
    for token_list in sample_tweets_df['TweetTextNew'].values
    for token in token_list
]

# Count every distinct word, then keep only the words that occurred more than
# 250 times across the whole sample.
unique_elements, counts_elements = np.unique(vocab, return_counts=True)
vocab_df = pd.DataFrame({'vocab_word': unique_elements, 'count': counts_elements})
is_frequent = vocab_df['count'] > 250
constrained_vocab = vocab_df.loc[is_frequent, 'vocab_word'].values
constrained_vocab[0:5]

array(['able', 'absolute', 'absolutely', 'abt', 'abuse'], dtype=object)

Let's compare between two different word vector groups

In [41]:
# For every frequent word known to BOTH models, record the Euclidean distance
# between the two models' vectors for that word, then show the 15 words whose
# vectors differ the most.
#
# NOTE(review): the column label says clusters "2_and_0", but the code reads
# models[2] and models[4], and `models` is ordered by first appearance of the
# cluster label rather than by cluster id. Confirm which pair is intended; the
# label is left unchanged so later cells that reference it keep working.
model_a = models[2]
model_b = models[4]
Table = []
for word in constrained_vocab:
    try:
        vec_a = model_a.wv[word]
        vec_b = model_b.wv[word]
    except KeyError:
        # The word is missing from at least one vocabulary, so there is no
        # pair to compare. The original `continue` sat inside an inner loop,
        # so it never skipped the word and a stale vector from an earlier
        # word was compared instead.
        continue
    Table.append([word, np.linalg.norm(vec_a - vec_b)])
# Passing `columns=` here (instead of assigning .columns afterwards) also
# works when no word is shared and Table is empty.
wordDiff = pd.DataFrame(Table, columns=['word','vector_diff_between_clusters_2_and_0'])
wordDiff.sort_values(by='vector_diff_between_clusters_2_and_0',ascending=False).head(15)

Unnamed: 0,word,vector_diff_between_clusters_2_and_0
1103,la,14.321951
1602,que,13.393438
475,de,13.338776
621,es,12.883841
381,completa,12.87316
1711,saga,12.834828
606,encasitaconchollometro,12.804353
604,en,12.761955
596,el,12.225507
689,favorito,11.891595


It's surprising to see that, despite seeding the Twitter dataset with English words, when we compare the linguistic characteristics of the mutual-following network clusters by inspecting the largest word vector distances, the main differences seem to come from the clusters actually comprising different languages.