In [39]:
import numpy as np 
import pandas as pd
import re 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import torch 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from re import sub

from tabulate import tabulate
from tqdm import trange 
import random 
import multiprocessing
from sklearn.cluster import KMeans

In [40]:
from time import time 
from unidecode import unidecode
from gensim.models import Word2Vec
from collections import defaultdict
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.models.phrases import Phrases, Phraser

In [41]:
from cleantext import clean

In [42]:
stop_words = set(stopwords.words('english'))

In [43]:
tweets = pd.read_csv("elonmusk_2021-11-26.csv", index_col = False)
tweets = tweets.drop(columns="Unnamed: 0")

In [44]:
tweets = tweets.sort_values(by=['Datetime'])


In [45]:
# using clean to remove the emojis from all tweets 
def clean_text(x): 
    x = clean(x, no_emoji=True)
    return x 

def remove_stop_word(text):
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)
    text = text.split()
    word_list = [w for w in text if w not in stop_words] 
    return word_list
        

# tweets.Text = tweets.Text.apply(clean_text)

In [46]:
tweets["Word_List"] = tweets.Text.apply(remove_stop_word)

In [47]:
phrases = Phrases(tweets.Word_List, min_count=1, progress_per=500000)

In [48]:
bigram = Phraser(phrases)

In [49]:
sentences = bigram[tweets.Word_List]

In [50]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

w2v_model.vector_size = 300

start = time()

w2v_model.build_vocab(sentences, progress_per=50000) 

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay = 1)

print("Time to train the model: {} mins".format(round((time() -start)/ 60,2)))

w2v_model.init_sims(replace=True)

Time to train the model: 0.01 mins


  w2v_model.init_sims(replace=True)


In [51]:
temp = tweets.Word_List
temp.apply(lambda x: ' '.join(bigram[x]))

928                                dropping friends pool
927                                        splish splash
926                                                     
925          dayquil plus nyquil https_//t co/aadssrmjyf
924         blow_whistle tesla ! https_//t co/c86hla0iqk
                             ...                        
4      tesla full self_driving beta available anyone ...
3      might notice small sometimes major improvement...
2      people_spoken amnesty begins next week vox_pop...
1      thanksgiving cuisine delightful symphony flavor !
0                                    think culture war ?
Name: Word_List, Length: 929, dtype: object

In [52]:
word_vectors = w2v_model.wv
model_k = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
positive_cluster_center = model_k.cluster_centers_[0]
negative_cluster_center = model_k.cluster_centers_[1]



In [53]:
model_k.cluster_centers_

array([[-0.01146346,  0.01488798,  0.01693169,  0.00333735, -0.00217718,
        -0.00721252, -0.02699738,  0.01446479, -0.00935097,  0.02400909,
        -0.02089914, -0.03029446,  0.00550795,  0.0205628 , -0.00220107,
         0.0030316 ,  0.00514591, -0.00142077,  0.03283321,  0.01246648,
        -0.00125937, -0.00296437,  0.01708045,  0.00562681, -0.00270615,
         0.01371288,  0.00231122,  0.00459115, -0.00660328,  0.01286782,
         0.01556407, -0.00692473,  0.01921166, -0.00997226, -0.02944162,
        -0.00334534,  0.01559372,  0.00052092,  0.0127546 , -0.00779379,
        -0.00309386,  0.00161669, -0.0188576 , -0.02002946, -0.01632452,
         0.00926724,  0.0009747 , -0.01292452,  0.00811902, -0.01221974,
         0.0118126 , -0.00226348, -0.01311588, -0.00454971,  0.00662278,
         0.00983229, -0.01070018, -0.01124404, -0.03175205,  0.00304037,
        -0.00538012, -0.00520442, -0.01326666,  0.01411611,  0.01959212,
        -0.00337276,  0.02670472, -0.01966602, -0.0

In [54]:
word_vectors.similar_by_vector(model_k.cluster_centers_[0], topn=10, restrict_vocab=None)

[('mention', 0.3219534158706665),
 ('use', 0.30631595849990845),
 ('team_making', 0.3022902309894562),
 ('great', 0.2996186912059784),
 ('tesla_ai', 0.29887640476226807),
 ('tesla_make', 0.2945714592933655),
 ('!', 0.29157179594039917),
 ('friend', 0.28394776582717896),
 ('tweets', 0.28337758779525757),
 ('10', 0.28198903799057007)]

In [55]:
words = pd.DataFrame(list(word_vectors.index_to_key))
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model_k.predict(np.array(x).reshape(1,-1)))

In [56]:
words.cluster 

0      [1]
1      [0]
2      [1]
3      [1]
4      [0]
      ... 
436    [1]
437    [1]
438    [0]
439    [0]
440    [0]
Name: cluster, Length: 441, dtype: object

In [57]:
words['cluster_value'] = [1 if i == 0 else -1 for i in words.cluster]

In [58]:
words['closeness_score'] = words.apply(lambda x: 1/(model_k.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [59]:
punctuation = ["!", "?", ">", "<", "-", ".", ":", "@", "%"]

index_list = []

for item in punctuation:
    if len(words.index[words['words'] == item]) != 0:
        index_list.append(words.index[words["words"] == item][0])
index_list

[1, 4]

In [60]:
word_duplicate = words

In [64]:
word_duplicate.drop([1,4], axis=0, inplace=True).reset_index()

KeyError: '[1, 4] not found in axis'

In [None]:
word_duplicate.reset_index(

In [21]:
words.to_csv("word_sentiment_coeff.csv")

## Cleaning Tweets 

In [22]:
tweets = tweets.reset_index(drop=True)
# df = df.drop(df[df.score < 50].index)
tweets = tweets.drop(tweets[tweets.Text == ""].index)


# Deep NN 