In [44]:
import numpy as np 
import pandas as pd
import re 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import torch 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from re import sub

from tabulate import tabulate
from tqdm import trange 
import random 
import multiprocessing
from sklearn.cluster import KMeans

In [45]:
from time import time 
from unidecode import unidecode
from gensim.models import Word2Vec
from collections import defaultdict
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.models.phrases import Phrases, Phraser

In [2]:
from cleantext import clean

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
tweets = pd.read_csv("elonmusk_2021-11-26.csv", index_col = False)
tweets = tweets.drop(columns="Unnamed: 0")

In [5]:
tweets = tweets.sort_values(by=['Datetime'])


In [19]:
# using clean to remove the emojis from all tweets 
def clean_text(x): 
    x = clean(x, no_emoji=True)
    return x 

def remove_stop_word(text):
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)
    text = text.split()
    word_list = [w for w in text if w not in stop_words] 
    return word_list
        

# tweets.Text = tweets.Text.apply(clean_text)

In [20]:
tweets["Word_List"] = tweets.Text.apply(remove_stop_word)

In [46]:
phrases = Phrases(tweets.Word_List, min_count=1, progress_per=500000)

In [49]:
bigram = Phraser(phrases)

In [58]:
sentences = bigram[tweets.Word_List]

In [60]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

w2v_model.vector_size = 300

start = time()

w2v_model.build_vocab(sentences, progress_per=50000) 

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay = 1)

print("Time to train the model: {} mins".format(round((time() -start)/ 60,2)))

w2v_model.init_sims(replace=True)

Time to train the model: 0.01 mins


  w2v_model.init_sims(replace=True)


In [62]:
temp = tweets.Word_List
temp.apply(lambda x: ' '.join(bigram[x]))

0                                  dropping friends pool
1                                          splish splash
3            dayquil plus nyquil https_//t co/aadssrmjyf
4           blow_whistle tesla ! https_//t co/c86hla0iqk
5      waste money silly apple cloth buy whistle inst...
                             ...                        
924    tesla full self_driving beta available anyone ...
925    might notice small sometimes major improvement...
926    people_spoken amnesty begins next week vox_pop...
927    thanksgiving cuisine delightful symphony flavor !
928                                  think culture war ?
Name: Word_List, Length: 919, dtype: object

In [65]:
word_vectors = w2v_model.wv
model_k = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
positive_cluster_center = model_k.cluster_centers_[0]
negative_cluster_center = model_k.cluster_centers_[1]



In [66]:
model_k.cluster_centers_

array([[-0.01012455,  0.01852418,  0.01101304, -0.00629772, -0.00051504,
        -0.01141429, -0.02486277,  0.01309604, -0.00254958,  0.02233625,
        -0.0176393 , -0.02761401,  0.00590977,  0.01542205,  0.00283082,
         0.00271091,  0.00893688,  0.00238325,  0.04406161,  0.00444396,
         0.00363386, -0.00409851,  0.01738828,  0.00671896,  0.00194588,
         0.01303354,  0.00647307,  0.00278872, -0.00961589,  0.00563234,
         0.01205131, -0.01227917,  0.00764619, -0.00765575, -0.01920667,
        -0.00739408,  0.00921538, -0.00232113,  0.02245195, -0.0094457 ,
        -0.00875925, -0.00252103, -0.01987667, -0.01472505, -0.01915696,
         0.00460049, -0.00573168, -0.00503451,  0.00221537, -0.01944723,
         0.01280442,  0.00319751, -0.01669665, -0.00462169,  0.00611876,
         0.00623939, -0.00799015,  0.00145962, -0.02098984,  0.0066717 ,
        -0.00784816, -0.00961786, -0.01638368,  0.01417754,  0.01959795,
        -0.00733548,  0.0188224 , -0.01913453, -0.0

In [67]:
word_vectors.similar_by_vector(model_k.cluster_centers_[0], topn=10, restrict_vocab=None)

[('anyone', 0.3554791510105133),
 ('raptor_2', 0.3015036880970001),
 ('list', 0.29193463921546936),
 ('great_work', 0.28133854269981384),
 ('spacex', 0.28007304668426514),
 ('tweet', 0.27449506521224976),
 ('rocket', 0.2713889181613922),
 ('almost', 0.2688022553920746),
 ('ukraine', 0.26417335867881775),
 ('voice', 0.2615019679069519)]

## Cleaning Tweets 

In [7]:
tweets = tweets.reset_index(drop=True)
# df = df.drop(df[df.score < 50].index)
tweets = tweets.drop(tweets[tweets.Text == ""].index)


Unnamed: 0,Datetime,Tweet ID,Text,Username
0,2021-11-29 23:52:29+00:00,1465468742570229767,just dropping some friends off at the pool,elonmusk
1,2021-11-30 00:13:51+00:00,1465474116622946307,splish splash,elonmusk
3,2021-11-30 22:59:18+00:00,1465817742632792065,dayquil + nyquil https://t.co/aadssrmjyf,elonmusk
4,2021-12-01 00:23:05+00:00,1465838829370228737,blow the whistle on tesla!\nhttps://t.co/c86hl...,elonmusk
5,2021-12-01 00:36:14+00:00,1465842137392680963,don't waste your money on that silly apple clo...,elonmusk


# Deep NN 

In [9]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [10]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


ValueError: The first argument to `Layer.call` must always be passed.