In [1]:
from minisom import MiniSom
from sklearn_som.som import SOM
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import warnings
import re
import string

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rao.ans/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rao.ans/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rao.ans/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/rao.ans/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("../data/user_info/popular_tweets.csv.gz")

In [5]:
df.head()

Unnamed: 0,text,created_at,bear_bull_tag,user_name,user_id,message_id,ticker
0,$AB | AllianceBernstein Q4 21 Earnings: \nAdj ...,2022-02-11T11:27:18Z,,LiveSquawk,130351,435511902,AB
1,"Nancy Pelosi Buys Tesla Calls, Stands To Benef...",2021-01-26T19:33:19Z,,Benzinga,7108,277882388,AB
2,3 Big Dividend Stocks Yielding 7% — or More; E...,2020-09-28T13:45:47Z,,TipRanks,217593,246681035,AB
3,Value Investing Is Alive And Well: 5 Picks. $S...,2020-08-11T21:23:36Z,,ZacksResearch,82492,235582933,AB
4,$AB stalking as a potential swing long above 2...,2020-06-18T16:30:28Z,Bullish,ACInvestorBlog,2503,221060450,AB


In [6]:
def clean_text(txt):
    """
    Clean text in the dataset.
    
    :param txt: txt string that is present in the input file.
    :type  txt: str
    :return: string that has been cleaned using Wordnet Lemmatizer, etc. 
    :rtype: str
    """
    
    txt = txt.lower()  # convert to lowercase
    txt = re.sub(r'^https?:\/\/.*[\s]*', '', txt)  # remove links
    words = txt.split(" ")
    # remove tickers
    words_without_tickers = []
    for w in words:
        if w.startswith("$") and len(w) > 1:
            continue
        words_without_tickers += [w]
    non_ticker_text = " ".join(words_without_tickers)
    words = nltk.tokenize.word_tokenize(non_ticker_text)  # tokenize
    words = [word if word.isalpha() else '' for word in words]  # only retains words, not numbers, etc.
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    stop_words = set(nltk.corpus.stopwords.words('english')).union(set(["hey", "http", "https", 
                                                                        "u", "im", "amp"]))
    final_tokens = []
    for w in words:
        w = "".join(["" if c in string.punctuation else c for c in w])  # remove punctutation
        if w != "" and w not in stop_words:  # process only non stopwords
            final_tokens.append(lemmatizer.lemmatize(w))  # using WordNet for lemmatizing
    cleaned_txt = " ".join(final_tokens)
    return cleaned_txt

In [7]:
# removing bots
df = df[~df.user_name.isin(["OpenOutcrier", "briefingcom", "Estimize"])]

In [8]:
df['text'] = df['text'].apply(lambda r: clean_text(r))

### TF-IDF

In [9]:
vectorizer = TfidfVectorizer(stop_words=set(nltk.corpus.stopwords.words('english')), 
                             max_features=1000)
X = vectorizer.fit_transform(df.text)

In [10]:
feature_names = vectorizer.get_feature_names_out()

### MiniSom
https://github.com/JustGlowing/minisom

In [11]:
D = X.todense().tolist()

In [12]:
M = 3  # lattice dimension
som = MiniSom(M, M, X.shape[1])

In [13]:
som.pca_weights_init(D)
som.train(D, 50000, random_order=False, verbose=True)

 [ 50000 / 50000 ] 100% - 0:00:00 left 
 quantization error: 0.9920790629896886


In [14]:
top_keywords = 10

In [15]:
weights = som.get_weights()

In [16]:
cnt = 1
for i in range(M):
    for j in range(M):
        keywords_idx = np.argsort(weights[i,j,:])[-top_keywords:]
        keywords = ' '.join([feature_names[k] for k in keywords_idx])
        print('Topic', cnt, ':', keywords)
        cnt += 1

Topic 1 : trading longs hard going parabolic position profit congrats ran alerted
Topic 2 : study offering stock positive daily fda pulse result biotech surge
Topic 3 : watch longs close level holding overnight congrats parabolic squeeze go
Topic 4 : congrats mover market pre pm pt today part top gainer
Topic 5 : data continuation turn watchlist breaking hard move running partnership starting
Topic 6 : monster looking hot week several setup gapping set ups energy
Topic 7 : parabolic pre higher lower hour part yesterday biggest top mover
Topic 8 : possible trading new stock pro benzinga upside international watch today
Topic 9 : rated rate maintained security capital success stock analyst reiterated buy


### 2. sklearn-som
https://github.com/rileypsmith/sklearn-som

In [17]:
som = SOM(M, M, X.shape[1])
som.fit(X.toarray())

In [18]:
weights = som.weights

In [19]:
cnt = 1
for i in range(M * M):
    keywords_idx = np.argsort(weights[i])[-top_keywords:]
    keywords = ' '.join([feature_names[k] for k in keywords_idx])
    print('Topic', cnt, ':', keywords)
    cnt += 1

Topic 1 : eps million week name sale general morning mover tariff ipo
Topic 2 : stock today read gap good trade etf hour morning mover
Topic 3 : china increase big high mover top near quot today stock
Topic 4 : long week etf new tell time next morning earnings name
Topic 5 : sector real broke move top morning market vehicle stock etf
Topic 6 : analyst new market today dividend etf gainer buy top stock
Topic 7 : trade report amd morning name close say stock week earnings
Topic 8 : amid vaccine testing analyst etf say buy coronavirus stock earnings
Topic 9 : success notable say dip top upgrade maintained analyst stock buy
