# 1.0 Initializations

In [1]:
!python -V

Python 3.7.10


In [2]:
#!pip install gensim

import pandas as pd
import numpy as np
import os
import timeit
import zipfile as zf
import pickle
import json
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary, common_texts
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer, WordNetLemmatizer
#snowball and lancaster stemmers also available
from nltk.tokenize import word_tokenize

Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used
[nltk_data] Downloading package wordnet to /opt/conda/nltk_data...
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /opt/conda/nltk_data...


In [3]:
#Code to unzip files in the Jupyter directories


# files = zf.ZipFile("archive.zip", 'r')
# files.extractall()
# files.close()

os.getcwd()

'/home/jovyan/work/reddit_data'

In [4]:
#FOR PARAMETER TUNING, CREATE SAVEFILE OF DIFFERENT PARAMETERS.

# df_save = pd.DataFrame(columns = ['run_num', 'max_feat', 'min_df', 'max_df', 'ngram_range',
#        'n_components', 'random_state', 'top_terms', 'coherence_type', "num_features",
#        'coherence', 'perplexity', 'cohe_folds', 'perp_folds'])
# BE CAREFUL! IF YOU RUN THE BELOW, YOU MAY OVERWRITE VALUABLE DATA!
# df_save.to_csv('GME_record.csv', index = False)
# del df_save

# 2.0 Definitions

## 2.1 Preprocessing (Lemmatization, Build List of Documents)

In [5]:
#Builds a list of text documents, where "folders" parameter is the directories.
#Removes duplicates inside and between folders

#TAKES THE CSV'S FROM DIFFERENT SUBREDDITS AND COMBINES THEM. DROPS IRRELEVANT FIELDS.
def build_and_simplify_dataframe(folders):
    
    full_df = pd.DataFrame(columns = ['id', 'title', 'selftext'])
    for fold in folders:
        df = pd.read_csv(fold + "/submissions_reddit.csv")
        df = df[['id', 'title', 'selftext']] 
        full_df = full_df.append(df)
    return full_df

#LEMMATIZE BY BREAKING APART AND REBUILDING
def lemmatize_breakup(doc):
    lemmatizer = WordNetLemmatizer()
    toks = word_tokenize(doc)
    new = [lemmatizer.lemmatize(t.lower(), pos = "v") for t in toks]
    new = [lemmatizer.lemmatize(t.lower(), pos = "n") for t in new]
    new = [lemmatizer.lemmatize(t.lower(), pos = "a") for t in new]
    new = [lemmatizer.lemmatize(t.lower(), pos = "r") for t in new]
    new = [lemmatizer.lemmatize(t.lower(), pos = "s") for t in new]
    strang = " ".join(new)
    return strang

#REMOVES IRRELEVANT POSTS, SAVES NEW DF TO FILE, INCORPORATES LEMMATIZER ABOVE. 
def build_and_lemmatize_text_list(df, lemmatize = True, save_string = "GME_df"):
    doc_list = []
    index_list = []
    print("Dateframe size prior to dropping stuff: " + str(len(df)))
    df.dropna(axis = 0, how = 'any', subset = ['selftext'], inplace = True)
    df = df[df["selftext"] != "[deleted]"]
    df = df[df['selftext'] != "[removed]"]
    df.drop_duplicates(subset = 'selftext', keep = "first", inplace = True)
    
    if lemmatize == True:
        
        df['selftext'] = df['selftext'].apply(lemmatize_breakup)
        df.to_csv(save_string, index = False)
        print("New " + save_string + ".csv file saved to directory.")
        
    return df

## 2.2 Vectorization of Word Documents

In [6]:
#Returns the sparse vectors 'v' and the feature names for the sparse vectors. 

def get_vectors(doc_list, max_features = 20000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 50, max_df = 0.90, ngram_range = (1,1), stop_words = 'english'):
    
    model = CountVectorizer(max_features = max_features, strip_accents = strip_accents, 
                            preprocessor = preprocessor, lowercase = lowercase, 
                            min_df = min_df, max_df = max_df, ngram_range = ngram_range, 
                            stop_words = stop_words)
    
    tf_matrix = model.fit_transform(doc_list)
    
    try:
        return {
            'matrix': tf_matrix, 
            'vectorizer': model, 
            'features': model.get_feature_names_out(), 
            'parameters': {'max_features': max_features, 
                           'min_df': min_df, 
                           'max_df': max_df, 
                           'ngram_range': ngram_range}}
    except:
        return {'matrix': tf_matrix, 'vectorizer': model, 'features': model.get_feature_names(), 
                'parameters': {'max_features': max_features, 
                           'min_df': min_df, 
                           'max_df': max_df, 
                           'ngram_range': ngram_range}}

## 2.3 Coherence

In [7]:
#DEFINITIONS

#Gets the 'topics' as input into the coherence algorithm. Is a nested list of top ten terms for each topic.
def get_top_terms(lda_model_components, tf_matrix_words, n_terms = 10):
    top_terms = []
    for index, term_weights in enumerate(lda_model_components):
        topic_top_terms = [tf_matrix_words[i] for i in term_weights.argsort()[::-1][:n_terms]]
#         print(term_weights.argsort()[::-1][:n_terms])
#         print(term_weights[term_weights.argsort()[::-1][:n_terms]])
#         print(term_weights.mean())
#         print(topic_top_terms)
        top_terms.append(topic_top_terms)       
    return top_terms  

#Gets specific dictionary object for use in GenSim - contains indexes of the words. 
def get_gensim_dict(tf_matrix_words):
    word_dict = Dictionary([tf_matrix_words])
    return word_dict
            
#Combines above functions with other measures to acquire coherence model. 
def cohe_score_func(estimator,
                    X,
                    feat_names,
                    #get_macro_score = False, 
                    n_terms = 10, 
                    coherence_type = "u_mass",
                    ):
    
    model_components = estimator.components_
    
    topics = get_top_terms(model_components, feat_names, n_terms = n_terms)
    
    dictionary = get_gensim_dict(feat_names)
    
    #Puts together a bag-of-words in the sparse format readable by GenSim. 
    corpus = Sparse2Corpus(X, documents_columns = False)
    
    cm = CoherenceModel(topics = topics, corpus = corpus, dictionary = dictionary, coherence = coherence_type)

    return cm.get_coherence(), cm.get_coherence_per_topic()

## 2.4 Cross Validation/Perplexity

In [8]:
#Custom score function for use in the cross validation scorer. 
def perp_score_func(estimator, X):
    perplexity = estimator.perplexity(X)
    return perplexity


#Get's the perplexity scores only without acquiring other info of the model folds. 
def lda_cross_val_perplexity(estimator, X, return_fold_scores = False):
    #perplexity = perp_score_func(estimator, X)
    scores = cross_val_score(estimator, X, scoring = perp_score_func)
    mean_score = scores.mean()
    if return_fold_scores == True:
        return (scores, mean_score)
    else:
        return mean_score
    
    
#Returns full stats including perplexity, but not including coherence. Returns estimator for coherence function. 
def lda_cross_val(X = None, feat_names = None, docs = None, return_fold_score = False, n_topics = 2):
    
    if X == None:
        try: 
            vectorizer = get_vectors(docs)
            X = vectorizer[0]
            feat_names = vectorizer[2]
        except:
            print("Need to input document list to acquire term frequency matrix ('doc_list' parameter).\
            \nAlternatively, can also enter matrix ('X' parameter) AND feature names ('feat_names' parameter) directly.")
    
    estimator = LatentDirichletAllocation(n_components = n_topics)
    
    scorer_dict = {"Perplexity": perp_score_func}
   
    stats = cross_validate(estimator, X, scoring = scorer_dict, return_estimator = True)

    return stats


#Gets coherence values for the estimators saved in the cross validation function above. 
def get_cross_val_coherence(estimator_list, matrix, feat_names, n_terms = 3):
    coherence_stats = []
    for est in estimator_list:
        cohe_, cohe_topics = cohe_score_func(est, matrix, feat_names, n_terms = n_terms)
        coherence_stats.append((cohe_, cohe_topics))
    return coherence_stats

## 2.5 Parameter Tuning

In [9]:
def get_parameters_df(p_d, record):
    #DETERMINE IF 'RECORD' ALEADY HAS RUNS LOGGED. DETERMINE RUN NUMBER.
    if np.isnan(record['run_num'].max()):
        run = 1
    else:
        run = record['run_num'].max() + 1
    #CREATE TESTING DATAFRAME
    df_cols = ['run_num', 'max_feat', 'min_df', 'max_df', 
               'ngram_range','n_components', 'random_state', 'top_terms', 'coherence_type']
    df = pd.DataFrame(columns = df_cols)
    for max_f in p_d['max_feat']:
        for mndf in p_d['min_df']:
            for mxdf in p_d['max_df']:
                for gram in p_d['ngram_range']:
                    for comp in p_d['n_components']:
                        for rs in p_d['random_state']:
                            for term in p_d['top_terms']:
                                df.loc[len(df.index)] = [run, max_f, mndf, mxdf, gram, 
                                                         comp, rs, term, p_d['coherence_type']]
    return df



def initialize_vectorizer(doc_input, initial_vectorizer):
    #DETERMINE IF INPUT IS A VECTORIZER OBJECT OR A DOCUMENT LIST. ASSIGN VARIABLES.
    #VECTORIZER OBJECT WILL COME FROM CUSTOM FUNCTION "get_vectorizer". 
    if type(doc_input) is list and type(doc_input[0]) is str:
        doc_list = doc_input
        vectorizer = initial_vectorizer
        try:
            existing_params = list(initial_vectorizer['parameters'].values())
            existing_features = initial_vectorizer['features']
            existing_vector = initial_vectorizer['matrix']
        except:
            print("There is an issue with the inputted initial vectorizer, or no vectorizer has been entered. \
A new vectorizer will be trained.")
            existing_params = []
            existing_features = None
            existing_vector = None
    elif type(doc_input) is dict and str(type(doc_input['matrix'])) == "<class 'scipy.sparse.csr.csr_matrix'>":
        vectorizer = doc_input
        existing_params = list(vectorizer['parameters'].values())
        existing_features = vectorizer['features']
        existing_vector = vectorizer['matrix']
        doc_list = None       
    else:
        print("Parameter 'doc_input' needs to be a vectorizer library or a document list.")
        return None
    
    return doc_list, existing_params, existing_features, existing_vector


def get_new_record_line(param_series, stats, coherence_stats): 
    summation = []
    for i in list(coherence_stats):
        summation.append(i[0])
    #Below is average coherence over the topics and over the folds. 
    cohe = np.array(summation).mean()

    perp = stats['test_Perplexity'].mean()
    cohe_folds = coherence_stats
    perp_folds = list(stats['test_Perplexity'])
    
    param_series['coherence'] = cohe
    param_series['perplexity'] = perp
    param_series['cohe_folds'] = cohe_folds
    param_series['perp_folds'] = perp_folds
    
    return param_series


def find_parameters(doc_input,
                    p_d,
                    #THE BELOW IS A LIBRARY!
                    initial_vectorizer = None, 
                    record_string = 'record.csv'):
    
    #OPENS RECORD CSV TO RECORD RESULTS
    record = pd.read_csv(record_string)
    #CREATE TESTING DATAFRAME
    df = get_parameters_df(p_d, record)

    #DETERMINE IF INPUT IS A VECTORIZER OBJECT OR A DOCUMENT LIST. ASSIGN VARIABLES. 
    doc_list, existing_params, existing_features, existing_vector = initialize_vectorizer(doc_input, initial_vectorizer)
    
    #RUN THROUGH TEST QUEUE.
    for i in list(df.index):
        tic = timeit.default_timer()
        param_series = df.loc[i]
        
        #CHECK TO SEE IF EXISTING VECTORIZER PARAMETERS NEED TO BE CHANGED. SKIP VECTORIZATION IF NOT.
        proposed_params = list(df.loc[i][['max_feat', 'min_df', 'max_df', 'ngram_range']])
        if proposed_params != existing_params:
            try:
                vectorizer = get_vectors(doc_list, 
                                         max_features = proposed_params[0], 
                                         min_df = proposed_params[1], 
                                         max_df = proposed_params[2], 
                                         ngram_range = proposed_params[3])
                existing_params = list(vectorizer['parameters'].values())
                existing_features = vectorizer['features']
                existing_vector = vectorizer['matrix']
            except:
                print("Function is trying to create a new term-frequency matrix, but does not have a document listed inputed.")
        
        #ADD NUMBER OF FEATURES TO SERIES (SO IT'S RECORDED IN THE SAVEFILE). 
        param_series['num_features'] = len(existing_features)
        
        #RUN CROSS VALIDATION LDA
        stats = lda_cross_val(X = existing_vector, 
                              feat_names = existing_features, 
                              docs = None, 
                              return_fold_score = False, 
                              n_topics = param_series['n_components'])
        
        #GET COHERENCE SCORES
        coherence_stats = get_cross_val_coherence(stats['estimator'], 
                                                   existing_vector, 
                                                   existing_features, 
                                                   n_terms = param_series['top_terms'])
        
        #CREATE NEW LINE IN RECORD DATAFRAME, AND SAVE TO DISK.
        param_series = get_new_record_line(param_series, stats, coherence_stats)
        record = record.append(param_series, ignore_index = True)
        record.to_csv(record_string, index = False)
        toc = timeit.default_timer()
        print("New record saved to " + record_string)
        print("Time to process: " + str((toc - tic)/60) + " minutes.")
    
    record.to_csv(('z_GME_backup/' + record_string), index = False)
    return record


In [11]:
pd.read_csv('GME_record.csv').tail()

Unnamed: 0,run_num,max_feat,min_df,max_df,ngram_range,n_components,random_state,top_terms,coherence_type,num_features,coherence,perplexity,cohe_folds,perp_folds
102,29,5000,25,0.9,"(1, 3)",12,,10,u_mass,5000,-1.313875,1269.082053,"[(-1.3826327438814143, [-1.130950762375651, -1...","[1261.9101954802672, 1271.790242408108, 1267.2..."
103,29,5000,25,0.9,"(1, 3)",15,,10,u_mass,5000,-1.428041,1263.52317,"[(-1.3544770449921797, [-0.6017384438101522, -...","[1236.6454153102898, 1265.9256360283305, 1267...."
104,29,5000,25,0.9,"(1, 3)",20,,10,u_mass,5000,-1.456773,1268.237848,"[(-1.4512366231497817, [-1.423856902013662, -1...","[1267.0852759802863, 1280.2341195701515, 1267...."
105,29,5000,25,0.9,"(1, 3)",30,,10,u_mass,5000,-1.631729,1287.696451,"[(-1.6646458271124536, [-1.6244254120172201, -...","[1275.2662086224757, 1289.2456306405647, 1286...."
106,29,5000,25,0.9,"(1, 3)",40,,10,u_mass,5000,-1.695101,1314.825297,"[(-1.7278349322838182, [-1.6245292305320893, -...","[1324.7586559714816, 1301.6655214716618, 1324...."


# 3.0 Clean and Preprocess Data

Inputs: 
    'folder_list' - list of all the subreddit folder to process to create a word list object.
    
Outputs:
    'doc_list' - list of lemmatized documents

Save Files:
    dataframe (.csv, name provided in function) - stored so a new doc_list can be made quickly

In [11]:
#List of folders from which to pull text data - each folder name represents a sub-reddit. 

folder_list = [
#                'finance',
#                'financialindependence',
#                'forex',
               'gme',
#                'investing',
#                'options',
#                'pennystocks',
#                'personalfinance',
#                'robinhood',
#                'robinhoodpennystocks',
#                'securityanalysis',
#                'stockmarket',
#                'stocks',
#                'wallstreetbets'
              ]

In [12]:
#CREATES DF OF PERTINENT POSTS AND LEMMATIZES EACH MAIN BODY OF TEXT. 
#WARNING! LONG RUN TIME! (APPX. 20 MINUTES FOR ALL 300,000+ TEXT FILES)

# tic = timeit.default_timer()

# df = build_and_simplify_dataframe(folder_list)

# toc1 = timeit.default_timer()
# print("DF created, time to process: " + str(toc1 - tic) + " seconds")

# df = build_and_lemmatize_text_list(df, save_string = "GME_df.csv")

# toc2 = timeit.default_timer()
# print("Lists created, time to process: " + str(toc2 - toc1))

# df

In [13]:
df = pd.read_csv('GME_df.csv')
df.head(5)

Unnamed: 0,id,title,selftext
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,after watch this i take a position right away ...
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,this guy explain exactly how to take a positio...
2,krnthg,ICR conference (11th Jan),any speculation or idea on what gamestop might...
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","after some downwards movement , i think everyb..."
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",guy ... we retard have fantasize a long time a...


In [14]:
#CREATES LIST OF TEXT POSTS FROM DATAFRAME

doc_list = list(df['selftext'])
del df

In [15]:
#TEST
print('Number of docs: ')
print(len(doc_list))
doc_list[0]

Number of docs: 
94039


'after watch this i take a position right away 🚀 http : //youtu.be/mjhs9yg8kfe'

# 4.0 Parameter Tuning via Cross Validation

In [16]:
p_d = {"max_feat": [20000], 
       "min_df": [25], 
       "max_df": [0.9], 
       "ngram_range": [(1, 3)], 
       "n_components": [30, 40], 
       "random_state": [None],
       "top_terms": [10],
       "coherence_type": "u_mass", 
      }

In [17]:
record = pd.read_csv('GME_record.csv')
# record.append(series)
record

Unnamed: 0,run_num,max_feat,min_df,max_df,ngram_range,n_components,random_state,top_terms,coherence_type,num_features,coherence,perplexity,cohe_folds,perp_folds
0,1,10000,25,0.9,"(1, 1)",3,,10,u_mass,8818,-1.172181,1528.690373,"[(-1.2391441707465813, [-1.363810344020599, -1...","[1483.379826238543, 1438.6850387721222, 1523.1..."
1,1,10000,25,0.9,"(1, 1)",10,,10,u_mass,8818,-1.385576,1482.933114,"[(-1.3321428129677502, [-1.8733632817399162, -...","[1435.1529394856345, 1436.2164481286475, 1461...."
2,1,10000,25,0.9,"(1, 1)",20,,10,u_mass,8818,-1.563672,1552.835436,"[(-1.5901819655386853, [-1.969168522668718, -1...","[1525.3552503362255, 1528.414094872189, 1545.5..."
3,1,10000,25,0.9,"(1, 1)",30,,10,u_mass,8818,-1.727125,1625.043304,"[(-1.6743555429386885, [-2.830421254356006, -1...","[1626.0165103640497, 1594.8041236529816, 1649...."
4,1,10000,25,0.9,"(1, 1)",40,,10,u_mass,8818,-1.815742,1736.758371,"[(-1.8387481941871464, [-1.3055692832481556, -...","[1739.6433707081894, 1720.7549714181073, 1764...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,29,5000,25,0.9,"(1, 3)",12,,10,u_mass,5000,-1.313875,1269.082053,"[(-1.3826327438814143, [-1.130950762375651, -1...","[1261.9101954802672, 1271.790242408108, 1267.2..."
103,29,5000,25,0.9,"(1, 3)",15,,10,u_mass,5000,-1.428041,1263.523170,"[(-1.3544770449921797, [-0.6017384438101522, -...","[1236.6454153102898, 1265.9256360283305, 1267...."
104,29,5000,25,0.9,"(1, 3)",20,,10,u_mass,5000,-1.456773,1268.237848,"[(-1.4512366231497817, [-1.423856902013662, -1...","[1267.0852759802863, 1280.2341195701515, 1267...."
105,29,5000,25,0.9,"(1, 3)",30,,10,u_mass,5000,-1.631729,1287.696451,"[(-1.6646458271124536, [-1.6244254120172201, -...","[1275.2662086224757, 1289.2456306405647, 1286...."


In [18]:
#find_parameters(doc_list, p_d, record_string = "GME_record.csv")

In [24]:
df = pd.read_csv('GME_record.csv')

In [24]:
df.sort_values('n_components')

Unnamed: 0,run_num,max_feat,min_df,max_df,ngram_range,n_components,random_state,top_terms,coherence_type,num_features,coherence,perplexity,cohe_folds,perp_folds
0,1,10000,25,0.9,"(1, 1)",3,,10,u_mass,8818,-1.172181,1528.690373,"[(-1.2391441707465813, [-1.363810344020599, -1...","[1483.379826238543, 1438.6850387721222, 1523.1..."
15,5,10000,25,0.9,"(1, 2)",4,,10,u_mass,10000,-1.05043,2103.727534,"[(-1.0391821176514395, [-1.646679446023103, -0...","[2096.4966468691937, 2032.5507206959644, 2074...."
9,3,10000,25,0.9,"(1, 1)",5,,10,u_mass,8818,-1.221677,1480.097254,"[(-1.4201418351231556, [-1.1389884083412343, -...","[1385.6762160773076, 1419.7191744450888, 1468...."
16,5,10000,25,0.9,"(1, 2)",6,,10,u_mass,10000,-1.084535,2048.527071,"[(-0.971993963923734, [-0.8140977698945425, -0...","[1990.5583623212665, 2016.3623644603804, 2037...."
10,3,10000,25,0.9,"(1, 1)",6,,10,u_mass,8818,-1.244094,1482.244673,"[(-1.2939134199560203, [-2.100858650614365, -1...","[1410.2733332440187, 1418.3230734357796, 1477...."
11,3,10000,25,0.9,"(1, 1)",7,,10,u_mass,8818,-1.309148,1473.185606,"[(-1.264279204904817, [-2.307885338629059, -1....","[1402.198435690982, 1420.1859106179938, 1455.4..."
17,5,10000,25,0.9,"(1, 2)",8,,10,u_mass,10000,-1.195237,2027.97765,"[(-1.0964963038761097, [-1.524741566720634, -1...","[1966.7154051411824, 2059.875452538132, 2013.6..."
5,2,10000,25,0.9,"(1, 1)",8,,10,u_mass,8818,-1.3207,1462.675878,"[(-1.3554903209090683, [-1.1485249662482713, -...","[1406.7482394160625, 1427.3518778450332, 1451...."
12,3,10000,25,0.9,"(1, 1)",9,,10,u_mass,8818,-1.347258,1465.17776,"[(-1.2611901433901975, [-0.5692034185424099, -...","[1398.3253012799523, 1429.2095769504313, 1445...."
6,2,10000,25,0.9,"(1, 1)",10,,10,u_mass,8818,-1.3561,1467.694896,"[(-1.3354496607017083, [-1.1872173627436728, -...","[1423.3965773847935, 1425.8957347701153, 1455...."


# 4.0 Term Frequency Vectors

Parameter tuning via Cross Validation showed that coherence scores were best when using unigrams, bigrams, and trigrams consecutively. Coherence of the top 3 topics generally improved as more topics were added, but the rate of improvement decreased at about 10 topics. Furthermore, the rate of coherence improvement of the top topic rapidly decreased at about 10 topics. Number of features did not seem to make much of a difference in regards to coherence. 

For perplexity, the higher n-grams seemed to increase the perplexity. Since trigrams were to be used, the appropriate number of features was chosen on the best fit perplexity plots with trigrams at a topic count of 10. This cooresponds to 10,000 and 20,000 features. 

In [18]:
#Runs the vector model to get a tf matrix and associated features. Also outputs Parameters. 
# vectorizer_20k = get_vectors(doc_list, max_features = 20000, min_df = 25, max_df = 0.9, ngram_range = (1, 3))
# vectorizer_10k = get_vectors(doc_list, max_features = 10000, min_df = 25, max_df = 0.9, ngram_range = (1, 3))

In [23]:
#SAVE NEW MODELS

# with open('vectorizer_10k.pkl', 'wb') as f:
#     pickle.dump(vectorizer_10k, f)
    
with open('vectorizer_20k.pkl', 'wb') as f:
    pickle.dump(vectorizer_20k, f)

In [24]:
#LOAD THE TWO VECTORIZER MODELS ALREADY TRAINED
# with open('vectorizer_10k.pkl', 'rb') as f:
#     vectorizer_10k = pickle.load(f)
    
# with open('vectorizer_20k.pkl', 'rb') as f:
#     vectorizer_20k = pickle.load(f)

In [25]:
print("Vectorizer 20K:")
print(vectorizer_20k['matrix'].shape)
print("Number of documents: ", vectorizer_20k['matrix'].shape[0])
print("Number of words: ", vectorizer_20k['matrix'].shape[1])
print("\n")
print("Vectorizer 10K:")
print(vectorizer_10k['matrix'].shape)
print("Number of documents: ", vectorizer_10k['matrix'].shape[0])
print("Number of words: ", vectorizer_10k['matrix'].shape[1])

Vectorizer 20K:
(94039, 20000)
Number of documents:  94039
Number of words:  20000


Vectorizer 10K:
(94039, 10000)
Number of documents:  94039
Number of words:  10000


In [26]:
#TEST
print(vectorizer_20k['matrix'][80, 0:1000])

  (0, 902)	1


In [27]:
#TEST
vectorizer_20k['features'][5000:5010]

['diligence http',
 'diligent',
 'dilute',
 'dilute share',
 'dilution',
 'dime',
 'dimension',
 'dimensional',
 'diminish',
 'ding']

In [28]:
#TEST
print(vectorizer_20k['parameters'])

{'max_features': 20000, 'min_df': 25, 'max_df': 0.9, 'ngram_range': (1, 3)}


# 5.0 Latent Dirichlet Allocation Model

Train and save two models (for 20k and 10k features). 

## 5.1 Feature Count = 10k, Topic Count = 10

In [21]:
#Train the LDA model on the 
#WARING: Long process time 12-20 minutes with 300,000+ documents.

# tic = timeit.default_timer()

# lda = LatentDirichletAllocation(n_components = 10, random_state = None)
# lda.fit(vectorizer_10k['matrix'])

# toc = timeit.default_timer()
# print(str((toc - tic)/60) + " minutes")

5.208473869956409 minutes


In [22]:
#SAVES PICKLE OF ABOVE MODEL

# with open('lda_model_10k.pkl', 'wb') as f:
#     pickle.dump(lda, f)

In [35]:
#OPENS SAVED MODEL FROM PICKLE

del lda
with open('lda_model_10k.pkl', 'rb') as f:
    lda_10k = pickle.load(f)

In [36]:
print(len(lda_10k.components_[0]))
lda_10k.components_.shape

10000


(10, 10000)

## 5.2 Feature Count = 20k, Topic Count = 10

In [29]:
#Train the LDA model on the 
#WARING: Long process time 12-20 minutes with 300,000+ documents.

# tic = timeit.default_timer()

# lda = LatentDirichletAllocation(n_components = 10, random_state = None)
# lda.fit(vectorizer_20k['matrix'])

# toc = timeit.default_timer()
# print(str((toc - tic)/60) + " minutes")

5.450025906460359 minutes


In [31]:
#SAVES PICKLE OF ABOVE MODEL

# with open('lda_model_20k.pkl', 'wb') as f:
#     pickle.dump(lda, f)

In [33]:
with open('lda_model_20k.pkl', 'rb') as f:
    lda_20k = pickle.load(f)

In [34]:
print(len(lda_20k.components_[0]))
lda_20k.components_.shape

20000


(10, 20000)

# 6.0 Coherence

## 6.1 Coherence on 10K Word Model

In [37]:
#TIMED COHERENCE SCORES ON CURRENT MODEL (PER TOPIC):
tic = timeit.default_timer()

cohe_scores_10k = cohe_score_func(lda_10k, vectorizer_10k['matrix'], vectorizer_10k['features'])

toc = timeit.default_timer()
print("Time to process: " + str((toc - tic)/60) + " minutes")

Time to process: 0.03731844290159643 minutes


In [38]:
cohe_scores_10k

(-1.3200397814214684,
 [-1.654842282318023,
  -1.0572889302152175,
  -0.6017384438101522,
  -0.9406684796870864,
  -1.997324013178544,
  -0.9958162531509893,
  -2.1135252474382025,
  -0.3050544912208139,
  -1.940829085303355,
  -1.5933105878923015])

## 6.2 Coherence on 20K Word Model

In [40]:
#TIMED COHERENCE SCORES ON CURRENT MODEL (PER TOPIC):
tic = timeit.default_timer()

cohe_scores_20k = cohe_score_func(lda_20k, vectorizer_20k['matrix'], vectorizer_20k['features'])

toc = timeit.default_timer()
print("Time to process: " + str((toc - tic)/60) + " minutes")

Time to process: 0.03995933444239199 minutes


In [41]:
cohe_scores_20k

(-1.1993934294426005,
 [-1.0741288237920654,
  -1.0976291344830973,
  -1.4576883592867196,
  -1.4113043091836077,
  -2.7087796206458137,
  -1.6760128266425274,
  -1.1237839061285002,
  -0.43078392972505714,
  -0.6266980540902165,
  -0.38712533044840103])

# 7.0 Topic Exploration

In [52]:
#CREATES DFs OF PERTINENT POSTS WITHOUT LEMMATIZATION FOR APPENDING TOPIC INFO. 

df = build_and_simplify_dataframe(['gme'])

df_10k = build_and_lemmatize_text_list(df, lemmatize = False, save_string = None)

df_20k = df_10k

del df

Dateframe size prior to dropping stuff: 273327


## 7.1 Topic Exploration on 10K Model

In [43]:
print(df_10k.shape)
df_10k.head()

(94039, 3)


Unnamed: 0,id,title,selftext
1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...
3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...
5,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...
7,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb..."
9,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...


In [44]:
doc_topic_mat = lda_10k.transform(vectorizer_10k['matrix'])

In [45]:
print(doc_topic_mat.shape)

(94039, 10)


In [47]:
##ADD NEW COLUMNS TO DATAFRAME WITH TOPIC AND SCORES.

# df_10k.reset_index(inplace = True)
# df_10k['prime_topic'] = None
# df_10k['sec_topic'] = None
# df_10k['prime_score'] = None
# df_10k['sec_score'] = None
# df_10k['topic_scores'] = None
# for i in df_10k.index:
#     df_10k['prime_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][0]
#     df_10k['sec_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][1]
#     df_10k['prime_score'][i] = np.sort(doc_topic_mat[i])[::-1][0]
#     df_10k['sec_score'][i] = np.sort(doc_topic_mat[i])[::-1][1]
#     df_10k['topic_scores'][i] = json.dumps(list(doc_topic_mat[i]))
# df_10k.head()

In [5]:
#df_10k.to_csv('df_10k.csv', index = False)
df_10k = pd.read_csv('df_10k.csv')
df_10k

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,6,Social Media Links (Many Reposts),5,0.493700,0.417392,"[0.011115580511175473, 0.011114949546913212, 0..."
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,6,Social Media Links (Many Reposts),9,0.877783,0.090211,"[0.004000481503935025, 0.004001206442958103, 0..."
2,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,8,News and Earnings Reports,3,0.579641,0.196707,"[0.0026319080133597192, 0.0026324819716669226,..."
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",3,"Opinions about Government, 'shills', troll pos...",6,0.459412,0.397844,"[0.004166976999316615, 0.004167743853592299, 0..."
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,8,News and Earnings Reports,1,0.854413,0.137253,"[0.001041753250742906, 0.13725262819231285, 0...."
...,...,...,...,...,...,...,...,...,...
94034,rt0gj1,REVERSE REPO = APES RETIREMENT,This is basically excess cash being deposited ...,9,Trading Rules and Regulations,3,0.359998,0.257014,"[0.04491219609047314, 0.14042660237504867, 0.0..."
94035,rt21tk,"Last of year purchase, another XX at limit ord...",Figured GME is going to close sub-$150 for the...,4,Shorting and Sharing Financial Data,1,0.345670,0.317468,"[0.002941803834197981, 0.31746832378339757, 0...."
94036,rt3e78,"Nancy Pelosee Posts Are This Weekend's FUD, Ch...","Disclaimer: I am smoother than skippy, this is...",3,"Opinions about Government, 'shills', troll pos...",0,0.884826,0.108772,"[0.10877236673539148, 0.0008002279861051584, 0..."
94037,rt4thl,What was the best day for GME hodlers in 2021?...,"Today. December 31, 2021. Because today prov...",1,"'Diamond Hands', Buy and Hold",8,0.442506,0.338910,"[0.0037039429779218334, 0.4425056825872468, 0...."


In [65]:
top_terms = get_top_terms(lda_10k.components_, vectorizer_10k['features'])

In [140]:
topic_num = 0
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['share', 'account', 'transfer', 'broke', 'fidelity', 'drs', 'robinhood', 'gme', 'use', 'trade'] 

TITLES:
0 .      SAXO - (FOP) transfer irregularities + SAXO GME share ownership (or lack off)
1 .      WEBULL to FIDELITY Transfer...PLEASE READ IF YOU ARE ON WEBULL AND CONSIDERING TRANSFERRING
2 .      If you have Robinhood, make sure you have a cash account!
3 .      For all you RH users procrastinating from switching to Fidelity due to FOMO of that MOASS
4 .      The DRS list. For those who wish to direct register, apes salute you!
5 .      Transferring to computershare via DRS out of Drivewealth or other that use the Drivewealth platform.
6 .      Jan Ape finally feeding the Bot, going through another $100+ drop has made me more pissed than ever !
7 .      Everyone talks about „I have time, I could wait another 10 years for MOASS“ but…
8 .      Voting Update for Canadian Apes 🇨🇦
9 .      CALL WEBULL TO VOTE!! They “Already Sent” Proxy Info

TEXTS:
TEXT  0 :
 On September

Unnamed: 0,level_0,index,id,title,selftext,prime_topic,sec_topic,prime_score,sec_score,topic_scores
91666,91666,255710,qb9eg9,SAXO - (FOP) transfer irregularities + SAXO GM...,On September 27th I requested to SAXO a transf...,0,3,0.997187,0.000313,"[0.997186796482052, 0.00031255984182120233, 0...."
82220,82220,195051,n6h25o,WEBULL to FIDELITY Transfer...PLEASE READ IF Y...,**PLEASE READ IF YOU ARE ON WEBULL AND CONSIDE...,0,1,0.997049,0.000328,"[0.997048560630531, 0.0003279755034862047, 0.0..."
4648,4648,13689,lh77lb,"If you have Robinhood, make sure you have a ca...","Robinhood will lend out your shares, if you ha...",0,4,0.996051,0.000439,"[0.9960513781968555, 0.0004387061214401881, 0...."
78202,78202,182005,mvx6d7,For all you RH users procrastinating from swit...,I have put together a little guide on my exper...,0,8,0.995774,0.00047,"[0.995773798648836, 0.0004696147803255549, 0.0..."
90165,90165,245313,pn0n52,The DRS list. For those who wish to direct reg...,Inspired by Apes I'm trying to get a list toge...,0,4,0.995287,0.000524,"[0.9952870772587568, 0.0005236736306835357, 0...."
91144,91144,251463,q0bypm,Transferring to computershare via DRS out of D...,I've seen a few posts asking if it's possible ...,0,5,0.994609,0.000599,"[0.9946094589079846, 0.0005989869929108661, 0...."
93559,93559,270006,rgbn47,"Jan Ape finally feeding the Bot, going through...",This is just a text post to make sure I'm coun...,0,4,0.994116,0.000654,"[0.9941164034418929, 0.0006537252360340678, 0...."
94018,94018,273178,rshxpx,"Everyone talks about „I have time, I could wai...",I don‘t want Kenny to bathe in wealth for anot...,0,8,0.994078,0.000658,"[0.9940783181949642, 0.0006580127674348, 0.000..."
81786,81786,193795,n5kb6r,Voting Update for Canadian Apes 🇨🇦,Hello fellow Canadian Apes!!\n\nI posted a few...,0,3,0.993706,0.000699,"[0.9937055319157078, 0.0006994149293015755, 0...."
80185,80185,188709,n155kp,CALL WEBULL TO VOTE!! They “Already Sent” Prox...,Making this post for awareness. I called Webul...,0,8,0.993477,0.000725,"[0.9934766399966268, 0.0007248536381742045, 0...."


In [112]:
topic_dict = {}
topic_dict[0] = "Trading Accounts"

In [190]:
topic_num = 1
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['ape', 'just', 'hold', 'buy', 'fuck', 'like', 'gme', 'know', 'make', 'want'] 

TITLES:
0 .      when hedgies & cnbc say sell!!!!!!!!
1 .      Fug you all MODS !!! GME TO THE MOON
2 .      Why the head fucks will never understand...
3 .      Ya betta hold, b*tch
4 .      Just got off the phone with E-Trade on hold for 2 hours. GameStop shares on the way to CS!!! Will post an update when they arrive
5 .      Anyone redoing lyrics to songs to fit GME?
6 .      I say Diamond Hands You say Hold!
7 .      Don’t turn into what we’re fighting against
8 .      Ladies and gents, u/oaf_king posted about how we can prepare ourselves to have diamond hands while we watch our portfolios rise to amounts of money we’ve never seen in our lives and it gave me an idea.
9 .      buy, hold, shrug, yawn, sigh, buy, hold, shrug, yawn, sigh

TEXTS:
TEXT  0 :
 🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊


Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
24334,m0sa2x,when hedgies & cnbc say sell!!!!!!!!,🙈🙉🙊\nbuy hold not financial advice\n🙈🙉🙊\nbuy h...,1,"'Diamond Hands', Buy and Hold",5,0.99795,0.000228,"[0.00022779900671706776, 0.997949834275542, 0...."
74575,mkpbtm,Fug you all MODS !!! GME TO THE MOON,GME to the moon 🚀 GME to the moon 🚀 GME to the...,1,"'Diamond Hands', Buy and Hold",4,0.997,0.000333,"[0.0003333585319572217, 0.99699972356817, 0.00..."
62895,me56kd,Why the head fucks will never understand...,Because they cannot put themselves in our shoe...,1,"'Diamond Hands', Buy and Hold",0,0.996896,0.000345,"[0.0003449180642589487, 0.9968963438250638, 0...."
47026,m8v9qi,"Ya betta hold, b*tch",You want a diamond body? \n\nYou want a Bugatt...,1,"'Diamond Hands', Buy and Hold",9,0.996885,0.000346,"[0.0003460548598716875, 0.9968853128828876, 0...."
93222,rbbhoy,Just got off the phone with E-Trade on hold fo...,Diamond hands Diamond hands Diamond hands Diam...,1,"'Diamond Hands', Buy and Hold",5,0.996853,0.00035,"[0.0003496504772256189, 0.9968531303868823, 0...."
64019,meq5w9,Anyone redoing lyrics to songs to fit GME?,So I've recently found myself listening more c...,1,"'Diamond Hands', Buy and Hold",8,0.996564,0.000382,"[0.00038172775149846334, 0.9965641005654201, 0..."
11561,lrn7x7,I say Diamond Hands You say Hold!,DIAMOND HANDS! HOLD!!!! DIAMOND HANDS! HOLD!!!...,1,"'Diamond Hands', Buy and Hold",5,0.996471,0.000392,"[0.00039216042168949686, 0.9964705510829844, 0..."
43879,m82u08,Don’t turn into what we’re fighting against,I asked by sister what she’s gonna do with the...,1,"'Diamond Hands', Buy and Hold",8,0.995774,0.00047,"[0.00046956907896941486, 0.9957737388111695, 0..."
23627,m055lp,"Ladies and gents, u/oaf_king posted about how ...",One part of his post really stood out to me. H...,1,"'Diamond Hands', Buy and Hold",3,0.995754,0.000472,"[0.0004717989275821544, 0.995753916807501, 0.0..."
39516,m6dmsm,"buy, hold, shrug, yawn, sigh, buy, hold, shrug...","buy, hold, shrug, yawn, sigh, buy, hold, shrug...",1,"'Diamond Hands', Buy and Hold",5,0.995544,0.000495,"[0.0004950699325340571, 0.9955444689688675, 0...."


In [122]:
topic_dict[1] = "'Diamond Hands', Buy and Hold"

In [192]:
topic_num = 2
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['com', 'http', 'reddit', 'reddit com', 'www', 'http www', 'www reddit', 'www reddit com', 'http www reddit', 'comments'] 

TITLES:
0 .      My Honest Thoughts on GME
1 .      r/GME Megathread for Sunday - August 15, 2021
2 .      r/GME Megathread for Saturday - August 14, 2021
3 .      r/GME Megathread for Sunday - July 25, 2021
4 .      r/GME Megathread for Friday - August 13, 2021
5 .      r/GME Megathread for Monday - July 26, 2021
6 .      r/GME Megathread for Wednesday - July 28, 2021
7 .      drives me crazy
8 .      r/GME Megathread for Tuesday - May 25, 2021
9 .      r/GME Megathread for Thursday - September 23, 2021

TEXTS:
TEXT  0 :
 I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
63759,melyvr,My Honest Thoughts on GME,I like the stock.\nI like the stock.\nI like t...,2,Reddit MOD Announcement,1,0.999896,1.2e-05,"[1.157723200325687e-05, 1.157856146668522e-05,..."
89298,p4o20e,"r/GME Megathread for Sunday - August 15, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.99905,0.000106,"[0.00010552120978018204, 0.0001055028322966220..."
89275,p42jtt,"r/GME Megathread for Saturday - August 14, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.99905,0.000106,"[0.00010552120978018204, 0.0001055028322966220..."
88561,or64qi,"r/GME Megathread for Sunday - July 25, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,9,0.998953,0.000116,"[0.00011631150167035335, 0.0001163021507138491..."
89245,p3hke8,"r/GME Megathread for Friday - August 13, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,3,0.998917,0.00012,"[0.00012036383730022555, 0.0001203574202903596..."
88589,orsoft,"r/GME Megathread for Monday - July 26, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.998909,0.000121,"[0.00012127401555610394, 0.0001212328343481597..."
88643,ot400q,"r/GME Megathread for Wednesday - July 28, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.998898,0.000122,"[0.00012245874351186385, 0.0001224150558357531..."
9018,lob98y,drives me crazy,I have been trying to draw attention to the fa...,2,Reddit MOD Announcement,0,0.998723,0.000142,"[0.00014188164458064874, 0.0001418622441493064..."
84419,nkikkc,"r/GME Megathread for Tuesday - May 25, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,8,0.99856,0.00016,"[0.00016002143636936126, 0.0001600383470246453..."
90646,ptoude,"r/GME Megathread for Thursday - September 23, ...","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.998555,0.000161,"[0.00016056737365144515, 0.0001605290251453965..."


In [113]:
topic_dict[2] = "Reddit MOD Announcement"

In [194]:
topic_num = 3
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['post', 'people', 'know', 'ape', 'just', 'make', 'like', 'think', 'dd', 'gme'] 

TITLES:
0 .      DD (A.K.A Due Dilligence) information on how to better recognize bot/shill/troll comments and intentions.
1 .      I'm Stepping Down As Moderator & More
2 .      We must learn to encourage our politicians to do the right thing and stand behind them!
3 .      Can we please limit use of the word SHILL to actual shills?
4 .      Please take a moment to read this post. It might just make r/GME an even better place for all!
5 .      Shills have been adapting right in front of our faces and it seems many haven't yet caught on.
6 .      Proposal - PLAN OF COMBAT - SHILLS AND SH*T POSTS
7 .      Guide to attacking a subreddit community
8 .      My Message
9 .      Discussion on Government Intervention

TEXTS:
TEXT  0 :
 Howdy everyone,

I'm sure I don't have to be the one to tell you that we have seen a very large uptick in manipulative activity after Friday. Once again it seems like 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
9100,lofp6k,DD (A.K.A Due Dilligence) information on how t...,"Howdy everyone,\n\nI'm sure I don't have to be...",3,"Opinions about Government, 'shills', troll pos...",0,0.998828,0.00013,"[0.00013024818447640406, 0.0001302390355845810..."
77323,msgp0z,I'm Stepping Down As Moderator & More,I joined GME in January like most of us after ...,3,"Opinions about Government, 'shills', troll pos...",9,0.998582,0.000158,"[0.00015752878589032963, 0.0001575166213045705..."
82192,n6fcab,We must learn to encourage our politicians to ...,LISTEN! For decades our politicians have been...,3,"Opinions about Government, 'shills', troll pos...",0,0.998398,0.000178,"[0.00017802817702353117, 0.0001779745426889831..."
66732,mgcsbx,Can we please limit use of the word SHILL to a...,TL;DR not everyone you disagree with is a shil...,3,"Opinions about Government, 'shills', troll pos...",5,0.998182,0.000202,"[0.00020208002393807506, 0.0002020625900147287..."
65726,mftlgt,Please take a moment to read this post. It mig...,Greetings apes! 🍌\n\nI'd like to start by sayi...,3,"Opinions about Government, 'shills', troll pos...",6,0.997897,0.000234,"[0.0002336972453290241, 0.00023370776056136607..."
19423,lwh29u,Shills have been adapting right in front of ou...,"Hello again everyone,\n\nI honestly thought th...",3,"Opinions about Government, 'shills', troll pos...",1,0.997841,0.00024,"[0.00023985552398235398, 0.0002398767810494489..."
66824,mgeh52,Proposal - PLAN OF COMBAT - SHILLS AND SH*T POSTS,"Dear Fellow Apes,\n\nFirst, I would like to st...",3,"Opinions about Government, 'shills', troll pos...",0,0.997457,0.000283,"[0.0002826436657128661, 0.00028259702257585817..."
75420,mm3c0p,Guide to attacking a subreddit community,# Technique #1 - 'TOPIC DILUTION'\n\n**Aim:**...,3,"Opinions about Government, 'shills', troll pos...",0,0.997289,0.000301,"[0.00030133298182698817, 0.0003012712590386408..."
75329,mlgofd,My Message,Hello everyone.\n\nThis is to clarify my perso...,3,"Opinions about Government, 'shills', troll pos...",1,0.997272,0.000303,"[0.0003030890802241014, 0.00030312837169076096..."
88273,omzvsf,Discussion on Government Intervention,"First of all, I am not a shill. I want this bo...",3,"Opinions about Government, 'shills', troll pos...",9,0.996928,0.000341,"[0.0003413874536410046, 0.0003413604194774321,..."


In [114]:
topic_dict[3] = "Opinions about Government, 'shills', troll posts, etc."

In [196]:
topic_num = 4
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['gme', 'share', 'short', 'volume', '000', 'day', 'etf', '2021', 'data', '10'] 

TITLES:
0 .      Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n kJoin Su p er s to n k
1 .      7/14 After hours after math
2 .      Shortable Stock Availability 3/15
3 .      JACKED TO THE TITS? Relieve some stress with bubble wrap
4 .      3-16-2021 - My wild stab at Short Sells
5 .      A GME FTD Price Model based on T+35
6 .      3-18-2021 -- Possible Pressure for the Big Squeeze
7 .      The short sale volume percent (not short interest) for GME is 66% on Aug 16, 2021 🦍💪🚀💎🙌
8 .      GME and ETF's that contain GME short volume through 3/1/2021.
9 .      Alright Apes make of this info as you will

TEXTS:
TEXT  0 :
 Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n kJoin Su p er s to n k J

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
74820,mktn6a,Join Su p er s to n k Join Su p er s to n k Jo...,Join Su p er s to n k Join Su p er s to n k Jo...,4,Shorting and Sharing Financial Data,3,0.999742,2.9e-05,"[2.8719171316163066e-05, 2.8719417656266848e-0..."
88178,olufb9,7/14 After hours after math,"Hi All, trying to gain a wrinkle here. feel so...",4,Shorting and Sharing Financial Data,1,0.999185,9.1e-05,"[9.058979278362576e-05, 9.063561299762814e-05,..."
38000,m5zs92,Shortable Stock Availability 3/15,"Hi everyone, after today's action in both GME ...",4,Shorting and Sharing Financial Data,0,0.998611,0.000154,"[0.0001544016927238291, 0.00015434011502002203..."
69420,mhuvt0,JACKED TO THE TITS? Relieve some stress with b...,HOW JACKED ARE YOU?\n\n>!pop!< >!pop!< >!pop!<...,4,Shorting and Sharing Financial Data,1,0.998594,0.000156,"[0.00015625081340502528, 0.0001562530897168915..."
40029,m6kb1o,3-16-2021 - My wild stab at Short Sells,"Not sure if this helps anyone, but this does k...",4,Shorting and Sharing Financial Data,0,0.997398,0.000289,"[0.00028912966374745875, 0.0002890760742125976..."
87655,oeqj2n,A GME FTD Price Model based on T+35,**TL;DR**\n\n&nbsp;\n\nFTDs will deliver the t...,4,Shorting and Sharing Financial Data,7,0.996559,0.00311,"[4.136008629031401e-05, 4.1362181745610106e-05..."
43910,m838os,3-18-2021 -- Possible Pressure for the Big Squ...,Yep. Ran the app again. Here are today's numbe...,4,Shorting and Sharing Financial Data,6,0.995813,0.000465,"[0.00046523249427634386, 0.0004651634690636653..."
89352,p5usob,The short sale volume percent (not short inter...,The short sale volume percent (not short inter...,4,Shorting and Sharing Financial Data,5,0.995755,0.000472,"[0.0004717067832894187, 0.0004717060994764702,..."
18498,lvqy8b,GME and ETF's that contain GME short volume th...,"Apes, \n\n\nHope you had a great start to Mar...",4,Shorting and Sharing Financial Data,2,0.995544,0.000495,"[0.0004951142671282871, 0.0004951579111292803,..."
72589,mk4htj,Alright Apes make of this info as you will,So here are the totals for calls and puts prov...,4,Shorting and Sharing Financial Data,1,0.995476,0.000503,"[0.0005026666960183959, 0.0005027114206634459,..."


In [115]:
topic_dict[4] = "Shorting and Sharing Financial Data"

In [198]:
topic_num = 5
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['share', 'price', 'short', 'sell', 'buy', 'stock', 'gme', 'squeeze', 'market', 'just'] 

TITLES:
0 .      BUY STOCKS, NOT OPTIONS
1 .      Correct me if I'm wrong, but I think you're just as likely to get $1B for your shares as you are to get $10K
2 .      Stay relax; there is no way that sudden dip not from short-selling.
3 .      Covering the real shorts to fake low SI
4 .      Retarded Ape helping fellow apes understand options
5 .      Alternate outcome to the 3/19 DD
6 .      AH Price Action Explained
7 .      ⛔ IMPORTANT ⛔ ABOUT SELLING ORDER TYPE
8 .      ELI5: How does a stock get bought back "multiple times"? How could the price go to 1 million?
9 .      What happens when there's a crazy imbalance in bid/ask?

TEXTS:
TEXT  0 :
 Before I start, I want to say that this is not financial advice and im not any kind of advisor. 

This is purely what I will do:

&#x200B;

BUY STOCKS, NOT OPTIONS BUY STOCKS, NOT OPTIONS BUY STOCKS, NOT OPTIONS BUY STOCKS, NOT OPTIONS BUY 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
48295,m9k0mw,"BUY STOCKS, NOT OPTIONS","Before I start, I want to say that this is not...",5,General Advice/Questions about Stocks,1,0.99895,0.000117,"[0.0001166936979066522, 0.00011669922455722537..."
47746,m9al7b,"Correct me if I'm wrong, but I think you're ju...","**Obviously not financial advice, I'm pretty d...",5,General Advice/Questions about Stocks,3,0.997058,0.000327,"[0.0003268582101789805, 0.0003268865819613594,..."
37614,m5uei5,Stay relax; there is no way that sudden dip no...,"Not financial advice, only personal thought on...",5,General Advice/Questions about Stocks,1,0.996808,0.000355,"[0.000354642017525354, 0.0003547585463873848, ..."
32908,m3mnjp,Covering the real shorts to fake low SI,"I am not a ceasar-intellect level ape, but the...",5,General Advice/Questions about Stocks,4,0.995927,0.000453,"[0.0004525542047425369, 0.00045257204873899563..."
8923,lo5z3p,Retarded Ape helping fellow apes understand op...,"So here to a fun week, but I'd like to set the...",5,General Advice/Questions about Stocks,3,0.995693,0.000479,"[0.000478581407461485, 0.0004786356907423595, ..."
17361,lupbze,Alternate outcome to the 3/19 DD,There's an alternative that doesn't involve a ...,5,General Advice/Questions about Stocks,4,0.995609,0.000488,"[0.00048788024042523283, 0.0004879037556376666..."
53129,mboq01,AH Price Action Explained,Not a financial advisor. Do your own due dili...,5,General Advice/Questions about Stocks,1,0.995287,0.000524,"[0.000523652531126545, 0.000523688253456997, 0..."
34135,m49pmv,⛔ IMPORTANT ⛔ ABOUT SELLING ORDER TYPE,I DON'T CARE WHETHER YOU SELL AT 100K OR 500K ...,5,General Advice/Questions about Stocks,4,0.995212,0.000532,"[0.0005320162092596363, 0.0005320098542185532,..."
63411,meg4r9,"ELI5: How does a stock get bought back ""multip...",Let's say that hedge funds are on the hook for...,5,General Advice/Questions about Stocks,1,0.994971,0.000559,"[0.0005587689941135265, 0.0005588393259574535,..."
52322,mbhyho,What happens when there's a crazy imbalance in...,"Okay, so something I'm curious about in unders...",5,General Advice/Questions about Stocks,9,0.994971,0.000559,"[0.0005587835822653561, 0.000558855358635093, ..."


In [116]:
topic_dict[5] = "General Advice/Questions about Stocks"

In [200]:
topic_num = 6
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['http', 'com', 'www', 'http www', 'gamestop', 'html', 'twitter', 'video', 'twitter com', 'youtube'] 

TITLES:
0 .      My totally real positions for the algorithm
1 .      List of all GameStop social pages - One day to market open, time to jump on those social media pages and rate those mobile apps!
2 .      Bored on Saturday with market closed? Pump those social GameStop & Family channels while you wait to get them tendies!
3 .      While waiting for the market to open, make sure to drop into GameStop social channels and leave them likes, comments and buy some merchandise! Links for all the social pages inside
4 .      One day till we're back in game - till then, you can boost GameStop social pages, listed inside!
5 .      It's weekend! Time to boost these GameStop social medias while you wait for Monday to come 🚀
6 .      Great day today! Reminder to drop into GameStop social channels
7 .      WHAT EVERYONE NEEDS TO DO RIGHT NOW, full social media list.
8 .      Support 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
50611,matoph,My totally real positions for the algorithm,"69,420@6969 in $CUM 12345@4958 in $FUK 19479@9...",6,Social Media Links (Many Reposts),4,0.999552,5e-05,"[4.9751259707176797e-05, 4.975203662759475e-05..."
64577,mf45mc,List of all GameStop social pages - One day to...,\n\nWill re-post this once in a while so that...,6,Social Media Links (Many Reposts),8,0.998788,0.000135,"[0.00013462344849898069, 0.0001346167635409512..."
63609,mejo5l,Bored on Saturday with market closed? Pump tho...,Will re-post this once in a while so that all ...,6,Social Media Links (Many Reposts),8,0.998788,0.000135,"[0.00013462344849898069, 0.0001346167635409512..."
36154,m5i2i1,"While waiting for the market to open, make sur...",Will re-post this once in a while so that all...,6,Social Media Links (Many Reposts),0,0.998661,0.000149,"[0.0001488486131805043, 0.00014883714837285174..."
35316,m5052p,"One day till we're back in game - till then, y...",Will re-post this once in a while so that all ...,6,Social Media Links (Many Reposts),0,0.998661,0.000149,"[0.0001488486131805043, 0.00014883714837285174..."
33905,m447w5,It's weekend! Time to boost these GameStop soc...,**The numbers barely moved / apps actually wen...,6,Social Media Links (Many Reposts),0,0.998658,0.000149,"[0.00014908626171832558, 0.0001490711064033014..."
29825,m2bb24,Great day today! Reminder to drop into GameSto...,**The numbers barely moved / apps actually wen...,6,Social Media Links (Many Reposts),0,0.998648,0.00015,"[0.0001502050462391299, 0.00015018971460856927..."
62642,me0f6q,"WHAT EVERYONE NEEDS TO DO RIGHT NOW, full soci...",This list was put together by u/Rabus I am jus...,6,Social Media Links (Many Reposts),0,0.998622,0.000153,"[0.00015318719935441283, 0.0001531750131048230..."
26100,m1k8e3,Support GameStop - drop in their Social media ...,Will re-post this once in a while so that all ...,6,Social Media Links (Many Reposts),0,0.998621,0.000153,"[0.00015320311179929533, 0.0001531772960722894..."
24780,m14qbj,GameStop Social media list / apps,"I see these popping up every now and then, but...",6,Social Media Links (Many Reposts),0,0.998105,0.000211,"[0.00021060252923848512, 0.0002105650636896353..."


In [117]:
topic_dict[6] = "Social Media Links (Many Reposts)"

In [202]:
topic_num = 7
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['png', 'x200b', 'http', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'preview redd'] 

TITLES:
0 .      SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR
1 .      Taken from twitter! DD on the crash yesterday and the coming days/weeks/months
2 .      HOLY SMOKES HOLY MOLY
3 .      When GME hit 100k can we all go to Citadel & Merlin and drink champagne, like these guys.
4 .      PICTURES!!! 💎🙌 ARE NOT SELLING! During the last 10 minutes of the trading day on Thursday, March 4th, this was happening. The matching buy and sell increments shows us that WE are not selling. The price is 100% PSYCHOLOGICAL! 45, 46, 47, 51, 100, 300...this is irregular. This is to drop the price.
5 .      South Korea) I was with you even when it was 40 dollars.
6 .      GME BT DUMP - 3/29
7 .      Look at this cute little thing...
8 .      GME Large Bull Fla

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
30750,m2p7ya,SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SS...,SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SS...,7,Posts of Memes/Images,4,0.998048,0.000217,"[0.00021691973969952176, 0.0002169230798379255..."
59010,md01x9,Taken from twitter! DD on the crash yesterday ...,"not my work, not financial advice, this ape li...",7,Posts of Memes/Images,1,0.996691,0.000368,"[0.0003676585733030681, 0.000367716809812659, ..."
92084,qkijx3,HOLY SMOKES HOLY MOLY,HOLY SMOKES HOLY MOLY HOLY SMOKES HOLY MOLY HO...,7,Posts of Memes/Images,1,0.99313,0.000763,"[0.0007633612660886593, 0.0007634824716446396,..."
24830,m15e9x,When GME hit 100k can we all go to Citadel & M...,&#x200B;\n\nhttps://preview.redd.it/bayo9fwswz...,7,Posts of Memes/Images,1,0.992857,0.000794,"[0.0007936692678393034, 0.0007939562190377511,..."
21722,ly7j3x,PICTURES!!! 💎🙌 ARE NOT SELLING! During the las...,**EDIT: Simplified.**\n\n**🦍🦍🦍's are💎🙌ing the ...,7,Posts of Memes/Images,1,0.992622,0.00082,"[0.0008196979911496042, 0.0008198998229979559,..."
79461,mze7el,South Korea) I was with you even when it was 4...,&#x200B;\n\nhttps://preview.redd.it/d4bpsuialm...,7,Posts of Memes/Images,1,0.992622,0.00082,"[0.0008197418173753819, 0.0008198537506973497,..."
66429,mg5rcq,GME BT DUMP - 3/29,Daily GME BT DUMP. Let me know if you want an...,7,Posts of Memes/Images,3,0.992562,0.000827,"[0.0008264952692626177, 0.0008265084914317405,..."
78601,mwxpbw,Look at this cute little thing...,&#x200B;\n\nhttps://preview.redd.it/kcom89nnxx...,7,Posts of Memes/Images,3,0.992561,0.000827,"[0.0008265030047434702, 0.0008266104560367176,..."
67241,mglc67,GME Large Bull Flag - Possible breakout,Just noticed there was a large bull flag and w...,7,Posts of Memes/Images,3,0.991587,0.000935,"[0.0009347249402753098, 0.0009347545265282976,..."
37197,m5rw2w,Today´s Price Action of GME explained in Memes...,**GME Opening Price: $277.52 / 221.30€**\n\nh...,7,Posts of Memes/Images,2,0.990898,0.005415,"[0.00046083690197170913, 0.0004608356109373631..."


In [118]:
topic_dict[7] = "Posts of Memes/Images"

In [204]:
topic_num = 8
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['gamestop', 'company', 'game', 'cohen', 'year', 'new', 'news', 'board', 'vote', 'ryan'] 

TITLES:
0 .      4 Additional Board Members Expected to leave in June. Possible additional changes to senior executives.
1 .      GameStop Releases 2021 Proxy Statement
2 .      GAMESTOP NEWS RELEASE
3 .      GameStop Appoints Chief Growth Officer Announces Two Additional Executive Hires to Support Transformation
4 .      GameStop appoints Chief Growth Officer Elliot Wilke 30th Mar 2021 plus 2 VPs 🚀🚀
5 .      RYAN COHEN FOR BOARD DIRECTOR! 9TH JUNE ANNUAL MEETING!
6 .      Earnings up on Gamestop Website - NEW ROCKSTAR COO!!!!
7 .      GameStop Provides Corporate Governance Update
8 .      GameStop CEO George Sherman: “Goal: Leading Global Omni-Channel Retailer For All Things Gaming and Entertainment” - March 11, 2021
9 .      GME Q2 Earnings Report

TEXTS:
TEXT  0 :
 [Buried in the 10-K that Gamestop released today, under ITEM 9B Other Information](https://news.gamestop.com/node/1866

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
53849,mbrjug,4 Additional Board Members Expected to leave i...,[Buried in the 10-K that Gamestop released tod...,8,News and Earnings Reports,4,0.997281,0.000302,"[0.00030214216530629294, 0.0003021353628825133..."
78487,mwok26,GameStop Releases 2021 Proxy Statement,GameStop has released their anticipated **2021...,8,News and Earnings Reports,4,0.996326,0.000408,"[0.00040825506513390803, 0.0004082122736642851..."
10625,lqtr5c,GAMESTOP NEWS RELEASE,[https://news.gamestop.com/news-releases/news-...,8,News and Earnings Reports,0,0.995336,0.000518,"[0.0005182607687988424, 0.0005181909883209347,..."
66749,mgd6n4,GameStop Appoints Chief Growth Officer Announc...,Thought I'd share...\n\nGLOBENEWSWIRE 3:45 AM ...,8,News and Earnings Reports,4,0.995134,0.000541,"[0.000540604481157613, 0.0005406445377260532, ..."
66821,mgeggn,GameStop appoints Chief Growth Officer Elliot ...,https://news.gamestop.com/news-releases/news-r...,8,News and Earnings Reports,1,0.994857,0.000572,"[0.0005714711943696559, 0.0005715205213696621,..."
75554,mmprm9,RYAN COHEN FOR BOARD DIRECTOR! 9TH JUNE ANNUAL...,Press release from GameStop Website:\n\n>GRAPE...,8,News and Earnings Reports,1,0.994797,0.000578,"[0.0005780685169041128, 0.0005781880871137295,..."
53060,mboebt,Earnings up on Gamestop Website - NEW ROCKSTAR...,Q4 EPS 1.34 Adjusted\n\nFY2020 EPS (-)2.14 Adj...,8,News and Earnings Reports,6,0.994512,0.00061,"[0.00060976876218356, 0.000609771630302634, 0...."
23785,m0fbta,GameStop Provides Corporate Governance Update,https://gamestop.gcs-web.com/news-releases/new...,8,News and Earnings Reports,6,0.994267,0.000637,"[0.0006369783447214565, 0.0006369983286139198,..."
35035,m4v9lt,GameStop CEO George Sherman: “Goal: Leading Gl...,"\n[CEO’s of GameStop, FansUnite, ESE and Draft...",8,News and Earnings Reports,2,0.993568,0.000716,"[0.0007144421373019484, 0.0007144363168906168,..."
90044,pkiupk,GME Q2 Earnings Report,https://investor.gamestop.com/news-releases/ne...,8,News and Earnings Reports,9,0.992306,0.000855,"[0.0008548626224087271, 0.0008548079534363822,..."


In [119]:
topic_dict[8] = "News and Earnings Reports"

In [206]:
topic_num = 9
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['http', 'market', 'www', 'http www', 'short', 'sec', 'com', 'trade', 'fund', 'security'] 

TITLES:
0 .      It doesn't appear that DTCC has "insurance" (at least in the way we think of it); it has a "loss allocation waterfall"
1 .      Not on the moon, past the moon?
2 .      Self-Regulatory Organizations; Proposed Rule Changes: National Securities Clearing Corp.
3 .      Legality of Turning off the Buy Button
4 .      NSSC-002 and NSSC-801 Update
5 .      Today's hot shit 7/14: New rules SR-DTC-2021-013, SR-DTC-2021-011, SR-DTC-2021-010
6 .      Love it when new SEC filings have to update their Risk and Recover plan during these times
7 .      Ongoing Continuation of DTC’s “Prepare for the worst”, Recovery & Wind-down (R&W) Amendments.
8 .      Revise the Clearing Agency Investment Policy - New Rulings on DTCC?
9 .      New DTC-2021-003 - Can someone ELI5 it ?

TEXTS:
TEXT  0 :
 **TL;DR:** Someone who buys and hodls 💎🤲 *will* get the tendies, from *someone*, if the stock 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
83637,ndytmh,"It doesn't appear that DTCC has ""insurance"" (a...",**TL;DR:** Someone who buys and hodls 💎🤲 *will...,9,Trading Rules and Regulations,0,0.999209,8.8e-05,"[8.791106743943505e-05, 8.789746867588811e-05,..."
52455,mbjeh2,"Not on the moon, past the moon?",Yeah? Yeah? Yeah? Yeah? Yeah? Yeah? Yeah? Yeah...,9,Trading Rules and Regulations,1,0.998831,0.00013,"[0.00012987158159667668, 0.0001298814364913912..."
54535,mbyfw6,Self-Regulatory Organizations; Proposed Rule C...,"Digging around the internet, and found this do...",9,Trading Rules and Regulations,4,0.998797,0.000134,"[0.00013372003142883996, 0.0001337104509609044..."
78267,mw5lhs,Legality of Turning off the Buy Button,**Disclaimer**: This is a post to attempt to i...,9,Trading Rules and Regulations,3,0.997935,0.000229,"[0.00022940716431495728, 0.0002294092130204513..."
52196,mbgn48,NSSC-002 and NSSC-801 Update,I did not see a new post yet on this. \n\nThe ...,9,Trading Rules and Regulations,0,0.99736,0.000293,"[0.0002933878860356113, 0.0002933472107628876,..."
87965,ojxw28,Today's hot shit 7/14: New rules SR-DTC-2021-0...,"[SR-DTC-2021-013, Notice of Filing and Immedia...",9,Trading Rules and Regulations,8,0.996808,0.000355,"[0.0003546452664825956, 0.0003546226234554297,..."
54553,mbyrgm,Love it when new SEC filings have to update th...,Seems like there were some updates to the Reco...,9,Trading Rules and Regulations,1,0.996414,0.000399,"[0.0003984868725969722, 0.00039858352541691136..."
55337,mc76fz,Ongoing Continuation of DTC’s “Prepare for the...,Newly submitted rule filings for larger brains...,9,Trading Rules and Regulations,6,0.996,0.000445,"[0.00044449095235565134, 0.0004444535298959044..."
24246,m0qa9d,Revise the Clearing Agency Investment Policy -...,[https://www.dtcc.com/legal/sec-rule-filings.a...,9,Trading Rules and Regulations,4,0.995714,0.000476,"[0.0004762976917749859, 0.0004762360814171751,..."
40983,m71xnn,New DTC-2021-003 - Can someone ELI5 it ?,Link :[https://www.dtcc.com/legal/sec-rule-fil...,9,Trading Rules and Regulations,6,0.995287,0.000524,"[0.0005236326080836791, 0.0005236071491312681,..."


In [120]:
topic_dict[9] = "Trading Rules and Regulations"

In [123]:
topic_dict

{0: 'Trading Accounts',
 2: 'Reddit MOD Announcement',
 3: "Opinions about Government, 'shills', troll posts, etc.",
 4: 'Shorting and Sharing Financial Data',
 5: 'General Advice/Questions about Stocks',
 6: 'Social Media Links (Many Reposts)',
 7: 'Posts of Memes/Images',
 8: 'News and Earnings Reports',
 9: 'Trading Rules and Regulations',
 1: "'Diamond Hands', Buy and Hold"}

In [220]:
for i, key in enumerate(topic_dict.keys()):
    top = topic_dict[key]
    cohe = cohe_scores_10k[1][i]
    avg = df_10k[df_10k['prime_topic'] == key]['prime_score'].mean()
    count = df_10k[df_10k['prime_topic'] == key]['prime_score'].count()
    print("Topic ", key, " is assigned to ", count, " documents - ", top)
    print("Avg. probability: ", avg, ", Coherence: ", cohe, "\n")

Topic  0  is assigned to  6391  documents -  Trading Accounts
Avg. probability:  0.6752539513960284 , Coherence:  -1.654842282318023 

Topic  1  is assigned to  33728  documents -  'Diamond Hands', Buy and Hold
Avg. probability:  0.7254267229448293 , Coherence:  -1.0572889302152175 

Topic  3  is assigned to  18697  documents -  Opinions about Government, 'shills', troll posts, etc.
Avg. probability:  0.6774471010436743 , Coherence:  -0.6017384438101522 

Topic  4  is assigned to  3457  documents -  Shorting and Sharing Financial Data
Avg. probability:  0.6134465289956622 , Coherence:  -0.9406684796870864 

Topic  5  is assigned to  14331  documents -  General Advice/Questions about Stocks
Avg. probability:  0.6572868122738977 , Coherence:  -1.997324013178544 

Topic  6  is assigned to  3770  documents -  Social Media Links (Many Reposts)
Avg. probability:  0.6906755610958889 , Coherence:  -0.9958162531509893 

Topic  7  is assigned to  4541  documents -  Posts of Memes/Images
Avg. pro

In [221]:
df_10k['topic'] = df_10k['prime_topic'].map(topic_dict)

In [179]:
df_10k = df_10k[['id', 'title', 'selftext', 'prime_topic', 'topic',
       'sec_topic', 'prime_score', 'sec_score', 'topic_scores']]

In [222]:
#df_10k.to_csv('df_10k.csv', index = False)
pd.read_csv('df_10k.csv')

## 7.2 Topic Exploration on 20K Model

In [19]:
print(df_20k.shape)
df_20k.head()

NameError: name 'df_20k' is not defined

In [57]:
doc_topic_mat = lda_20k.transform(vectorizer_20k['matrix'])

In [58]:
print(doc_topic_mat.shape)

(94039, 10)


In [61]:
##ADD NEW COLUMNS TO DATAFRAME WITH TOPIC AND SCORES.

# df_20k.reset_index(inplace = True)
# df_20k['prime_topic'] = None
# df_20k['sec_topic'] = None
# df_20k['prime_score'] = None
# df_20k['sec_score'] = None
# df_20k['topic_scores'] = None
# for i in df_20k.index:
#     df_20k['prime_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][0]
#     df_20k['sec_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][1]
#     df_20k['prime_score'][i] = np.sort(doc_topic_mat[i])[::-1][0]
#     df_20k['sec_score'][i] = np.sort(doc_topic_mat[i])[::-1][1]
#     df_20k['topic_scores'][i] = json.dumps(list(doc_topic_mat[i]))
# df_20k.head()

In [62]:
# df_20k.to_csv('df_20k.csv', index = False)
df_20k = pd.read_csv('df_20k.csv')
df_20k.head()

Unnamed: 0,index,id,title,selftext,prime_topic,sec_topic,prime_score,sec_score,topic_scores
0,1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,0,7,0.408516,0.402199,"[0.4085159025234814, 0.011113989904887907, 0.0..."
1,3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,5,0,0.963988,0.004003,"[0.004002516052168223, 0.004002464292135745, 0..."
2,5,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,0,5,0.396185,0.371816,"[0.39618512853474486, 0.10163854957411877, 0.0..."
3,7,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",7,1,0.497566,0.469094,"[0.0041677727694955, 0.469093585637319, 0.0041..."
4,9,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,5,0,0.793692,0.150852,"[0.15085249345911897, 0.0010311857052622362, 0..."


In [63]:
df_10k.head()

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,6,Social Media Links (Many Reposts),5,0.4937,0.417392,"[0.011115580511175473, 0.011114949546913212, 0..."
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,6,Social Media Links (Many Reposts),9,0.877783,0.090211,"[0.004000481503935025, 0.004001206442958103, 0..."
2,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,8,News and Earnings Reports,3,0.579641,0.196707,"[0.0026319080133597192, 0.0026324819716669226,..."
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",3,"Opinions about Government, 'shills', troll pos...",6,0.459412,0.397844,"[0.004166976999316615, 0.004167743853592299, 0..."
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,8,News and Earnings Reports,1,0.854413,0.137253,"[0.001041753250742906, 0.13725262819231285, 0...."


In [67]:
for l in top_terms:
    print(l)

['share', 'account', 'transfer', 'broke', 'fidelity', 'drs', 'robinhood', 'gme', 'use', 'trade']
['ape', 'just', 'hold', 'buy', 'fuck', 'like', 'gme', 'know', 'make', 'want']
['com', 'http', 'reddit', 'reddit com', 'www', 'http www', 'www reddit', 'www reddit com', 'http www reddit', 'comments']
['post', 'people', 'know', 'ape', 'just', 'make', 'like', 'think', 'dd', 'gme']
['gme', 'share', 'short', 'volume', '000', 'day', 'etf', '2021', 'data', '10']
['share', 'price', 'short', 'sell', 'buy', 'stock', 'gme', 'squeeze', 'market', 'just']
['http', 'com', 'www', 'http www', 'gamestop', 'html', 'twitter', 'video', 'twitter com', 'youtube']
['png', 'x200b', 'http', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'preview redd']
['gamestop', 'company', 'game', 'cohen', 'year', 'new', 'news', 'board', 'vote', 'ryan']
['http', 'market', 'www', 'http www', 'short', 'sec', 'com', 'trade', 'fund', 'security']


In [68]:
for l in get_top_terms(lda_20k.components_, vectorizer_20k['features']):
    print(l)

['ape', 'just', 'like', 'money', 'make', 'fuck', 'hold', 'buy', 'know', 'time']
['just', 'ape', 'know', 'post', 'like', 'gme', 'think', 'people', 'dd', 'make']
['share', 'account', 'transfer', 'broke', 'fidelity', 'gme', 'buy', 'just', 'drs', 'ape']
['short', 'market', 'company', 'sec', 'share', 'fund', 'stock', 'gamestop', 'trade', 'investor']
['http', 'com', 'www', 'http www', 'html', 'poll', 'org', 'amp', 'html http', 'video']
['com', 'http', 'www', 'http www', 'gamestop', 'gme', '000', 'stock', '2021', 'youtube']
['share', 'price', 'short', 'buy', 'sell', 'gme', 'stock', 'squeeze', 'market', 'day']
['png', 'http', 'x200b', 'redd', 'format', 'width', 'png width', 'format png', 'auto', 'png auto webp']
['com', 'http', 'reddit', 'www', 'http www', 'reddit com', 'www reddit', 'www reddit com', 'http www reddit', 'comments']
['http', 'jpg', 'x200b', 'format', 'auto', 'redd', 'width', 'preview', 'webp', 'preview redd']


# 8.0 LDA on train/test split

## 8.1 Prep Data

In [19]:
df = pd.read_csv("gme/submissions_reddit.csv")
print(df.columns)
df = df[['id', 'created', 'edited', 'title', 'selftext']] 
display(df.head())
df['edited'] = pd.to_datetime(df['edited'])
df['created'] = pd.to_datetime(df['created'])
df['datetime'] = df[['created', 'edited']].max(axis = 1)
df.sort_values(by = 'datetime', inplace = True)
df['month'] = df['datetime'].dt.month

Index(['id', 'author', 'created', 'retrieved', 'edited', 'pinned', 'archived',
       'locked', 'removed', 'deleted', 'is_self', 'is_video',
       'is_original_content', 'title', 'link_flair_text', 'upvote_ratio',
       'score', 'gilded', 'total_awards_received', 'num_comments',
       'num_crossposts', 'selftext', 'thumbnail', 'shortlink'],
      dtype='object')


Unnamed: 0,id,created,edited,title,selftext
0,ko4pii,2021-01-01 04:08:51,1970-01-01 00:00:00,GME to the moon 🚀🚀,[deleted]
1,kqfajb,2021-01-04 19:02:26,1970-01-01 00:00:00,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...
2,kqjh2t,2021-01-04 22:17:23,1970-01-01 00:00:00,Short Squeeze Incoming 🚀🚀🚀🚀🚀🚀🚀,[deleted]
3,kqvp7l,2021-01-05 10:19:59,1970-01-01 00:00:00,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...
4,krcwch,2021-01-06 01:19:17,1970-01-01 00:00:00,You already know what we must do brothers and ...,[deleted]


In [34]:
df.sample(5)

Unnamed: 0,id,created,edited,title,selftext,datetime,month
80730,m3fkhd,2021-03-12 11:22:44,1970-01-01 00:00:00,Why,WHY Does people act like paper hands because i...,2021-03-12 11:22:44,3
81388,m3jckj,2021-03-12 15:02:34,1970-01-01 00:00:00,I like crushing me some paywalls.,,2021-03-12 15:02:34,3
133765,md0325,2021-03-25 14:54:37,1970-01-01 00:00:00,"Even if you weren’t in my food chain, I would ...",,2021-03-25 14:54:37,3
229284,omnomw,2021-07-18 09:58:46,1970-01-01 00:00:00,"Looks like meat's back on the menu, boys!",[removed],2021-07-18 09:58:46,7
55872,lye6pd,2021-03-05 15:23:56,2021-03-05 15:37:18,"Fuck it, sold all my other positions to go ful...",I originally sold everything and splitted int...,2021-03-05 15:37:18,3


In [35]:
# df = build_and_lemmatize_text_list(df, lemmatize = True, save_string = "GME_df_w_datetime")

Dateframe size prior to dropping stuff: 273327
New GME_df_w_datetime.csv file saved to directory.


In [10]:
# df.to_csv('GME_df_w_datetime', index = False)
df = pd.read_csv('GME_df_w_datetime')

In [11]:
df.sort_values(by = 'datetime', inplace = True)
df.reset_index(inplace = True, drop = True)
df[85325:85335]

Unnamed: 0,id,created,edited,title,selftext,datetime,month
85325,nommqz,2021-05-30 22:48:48,1970-01-01 00:00:00,Worked 6 days for months strait. 54 to 60 hrs ...,i ’ ve pack my lunch every day . i stop smoke ...,2021-05-30 22:48:48,5
85326,non7oj,2021-05-30 23:21:44,1970-01-01 00:00:00,Transferring Robinhood to Fidelity,i ’ m in the process of of transfer my robinho...,2021-05-30 23:21:44,5
85327,noni9a,2021-05-30 23:38:37,1970-01-01 00:00:00,Champagne wishes,right now i ’ m dip my hotdog in mayonnaise bu...,2021-05-30 23:38:37,5
85328,nonse9,2021-05-30 23:55:19,1970-01-01 00:00:00,"So be real with me, what price are you actuall...",there 's a lot of insane number go around . ju...,2021-05-30 23:55:19,5
85329,noowbp,2021-05-31 01:01:30,2021-05-31 01:11:56,GME/Apple partnership,saw on bloomberg tv today apple want to open n...,2021-05-31 01:11:56,5
85330,nooak2,2021-05-31 00:24:45,2021-05-31 01:37:25,This is HOW we can build a better world after ...,**moass near** . the message be clear . the qu...,2021-05-31 01:37:25,5
85331,noqwau,2021-05-31 02:45:04,1970-01-01 00:00:00,EMERALD HANDS!!!! $GME !!!!!,diamond hand be old news ! ! ! emerald hand be...,2021-05-31 02:45:04,5
85332,noqrj7,2021-05-31 02:40:11,2021-05-31 02:51:04,Shit is going down!,**moass get close . ** too much `` coincidence...,2021-05-31 02:51:04,5
85333,not55z,2021-05-31 04:07:41,1970-01-01 00:00:00,Is short selling or naked short selling more p...,"this may be common knowledge at this point , b...",2021-05-31 04:07:41,5
85334,notf61,2021-05-31 04:17:54,1970-01-01 00:00:00,The media has FOMO?!,it ’ s funny to me how the medium be now talk ...,2021-05-31 04:17:54,5


In [12]:
df_train = df.iloc[:85329].dropna(how = 'all')
df_test = df.iloc[85329:].dropna(how = 'all')

In [13]:
df_train.tail()

Unnamed: 0,id,created,edited,title,selftext,datetime,month
85324,nome3s,2021-05-30 22:35:42,1970-01-01 00:00:00,Help an Ape out!,my wife have humor me and trust me invest in g...,2021-05-30 22:35:42,5
85325,nommqz,2021-05-30 22:48:48,1970-01-01 00:00:00,Worked 6 days for months strait. 54 to 60 hrs ...,i ’ ve pack my lunch every day . i stop smoke ...,2021-05-30 22:48:48,5
85326,non7oj,2021-05-30 23:21:44,1970-01-01 00:00:00,Transferring Robinhood to Fidelity,i ’ m in the process of of transfer my robinho...,2021-05-30 23:21:44,5
85327,noni9a,2021-05-30 23:38:37,1970-01-01 00:00:00,Champagne wishes,right now i ’ m dip my hotdog in mayonnaise bu...,2021-05-30 23:38:37,5
85328,nonse9,2021-05-30 23:55:19,1970-01-01 00:00:00,"So be real with me, what price are you actuall...",there 's a lot of insane number go around . ju...,2021-05-30 23:55:19,5


In [14]:
df_test.head()

Unnamed: 0,id,created,edited,title,selftext,datetime,month
85329,noowbp,2021-05-31 01:01:30,2021-05-31 01:11:56,GME/Apple partnership,saw on bloomberg tv today apple want to open n...,2021-05-31 01:11:56,5
85330,nooak2,2021-05-31 00:24:45,2021-05-31 01:37:25,This is HOW we can build a better world after ...,**moass near** . the message be clear . the qu...,2021-05-31 01:37:25,5
85331,noqwau,2021-05-31 02:45:04,1970-01-01 00:00:00,EMERALD HANDS!!!! $GME !!!!!,diamond hand be old news ! ! ! emerald hand be...,2021-05-31 02:45:04,5
85332,noqrj7,2021-05-31 02:40:11,2021-05-31 02:51:04,Shit is going down!,**moass get close . ** too much `` coincidence...,2021-05-31 02:51:04,5
85333,not55z,2021-05-31 04:07:41,1970-01-01 00:00:00,Is short selling or naked short selling more p...,"this may be common knowledge at this point , b...",2021-05-31 04:07:41,5


## 8.2 Create doc list and original vectorizer

In [15]:
doc_list = list(df_train['selftext'])
doc_list_test = list(df_test['selftext'])

In [16]:
vectorizer = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.90, ngram_range = (1,3), stop_words = 'english')

In [17]:
#SAVES PICKLE OF ABOVE MODEL

# with open('vectorizer_train.pkl', 'wb') as f:
#     pickle.dump(vectorizer, f)

#OPENS SAVED MODEL FROM PICKLE
# with open('vectorizer_train', 'rb') as f:
#     vectorizer = pickle.load(f)

In [18]:
print(vectorizer.keys())

dict_keys(['matrix', 'vectorizer', 'features', 'parameters'])


## 8.3 LDA modeling

In [19]:
def prepare_for_topic_naming(vectorizer, df_train, df_test, doc_list_train, doc_list_test):    
    tic = timeit.default_timer()

    lda = LatentDirichletAllocation(n_components = 10, random_state = 0)
    lda.fit(vectorizer['matrix'])

    toc = timeit.default_timer()
    print("Time to train LDA: " + str((toc - tic)/60) + " minutes")
    
    tf_matrix_test = vectorizer['vectorizer'].transform(doc_list_test)
    doc_weights_train = lda.transform(vectorizer['matrix'])
    doc_weights_test = lda.transform(tf_matrix_test)
    
    print("\nDoc weight matrix stats:")
    print("Train Shape: ", doc_weights_train.shape)
    print("Example of weights from train doc 1: ", doc_weights_train[0], "\n")
    print("Test Shape: ", doc_weights_test.shape)
    print("Example of weights from test doc 1: ", doc_weights_test[-1], "\n")
    
    print("Now populating topic data into dataframes...")
    df_train.reset_index(inplace = True, drop = True)
    df_train['prime_topic'] = None
    df_train['sec_topic'] = None
    df_train['prime_score'] = None
    df_train['sec_score'] = None
    df_train['topic_scores'] = None
    for i in df_train.index:
        df_train['prime_topic'][i] = np.argsort(doc_weights_train[i])[::-1][0]
        df_train['sec_topic'][i] = np.argsort(doc_weights_train[i])[::-1][1]
        df_train['prime_score'][i] = np.sort(doc_weights_train[i])[::-1][0]
        df_train['sec_score'][i] = np.sort(doc_weights_train[i])[::-1][1]
        df_train['topic_scores'][i] = json.dumps(list(doc_weights_train[i]))
    
    print("Finished Train dataframe, now working on Test dataframe...")
    df_test.reset_index(inplace = True, drop = True)
    df_test['prime_topic'] = None
    df_test['sec_topic'] = None
    df_test['prime_score'] = None
    df_test['sec_score'] = None
    df_test['topic_scores'] = None
    for i in df_test.index:
        df_test['prime_topic'][i] = np.argsort(doc_weights_test[i])[::-1][0]
        df_test['sec_topic'][i] = np.argsort(doc_weights_test[i])[::-1][1]
        df_test['prime_score'][i] = np.sort(doc_weights_test[i])[::-1][0]
        df_test['sec_score'][i] = np.sort(doc_weights_test[i])[::-1][1]
        df_test['topic_scores'][i] = json.dumps(list(doc_weights_test[i]))
        
    toc2 = timeit.default_timer()
    print("Time to finish test/train dataframes: " + str((toc2 - tic)/60) + " minutes")

    df_temp = build_and_simplify_dataframe(['gme'])
    df_temp = build_and_lemmatize_text_list(df_temp, lemmatize = False, save_string = None)
    df_temp[['id', 'selftext']]
    
    df = pd.concat([df_train, df_test])
    df = df.rename({'selftext': 'selftext_lemmatized'}, axis = 1)
    df = pd.merge(df, df_temp, on = "id")
    
    df.rename({'title_x': "title"}, axis = 1, inplace = True)
    df = df[['id', 'title', 'selftext', 'datetime', 'prime_topic', 
         'sec_topic', 'prime_score', 'sec_score', 'topic_scores']]
    
    print("Order of output objects: lda model, dataframe with train and test data combined")
    return lda, df


In [38]:
#SAVES PICKLE OF ABOVE MODEL

# with open('lda_train.pkl', 'wb') as f:
#     pickle.dump(lda, f)

#OPENS SAVED MODEL FROM PICKLE
# with open('lda_train', 'rb') as f:
#     lda = pickle.load(f)

### 8.3.1 Max Doc Frequency = 0.9

In [112]:
vectorizer_1 = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.90, ngram_range = (1,3), stop_words = 'english')

lda_1, df_1 = prepare_for_topic_naming(vectorizer_1, df_train, df_test, doc_list, doc_list_test)

Time to train LDA: 5.484388238082951 minutes
Doc weight matrix stats:
Train Shape:  (85329, 10)
Example of weights from train doc 1:  [0.01111668 0.01111465 0.01111362 0.01111231 0.01111514 0.14023141
 0.0111125  0.39890918 0.38306091 0.0111136 ] 

Test Shape:  (8710, 10)
Example of weights from test doc 1:  [0.00090124 0.06974434 0.00090127 0.00090095 0.00090119 0.29276439
 0.000901   0.57761735 0.00090106 0.05446721] 

Now populating topic data into dataframes...
Finished Train dataframe, now working on Test dataframe...
Time to finish test/train dataframes: 12.02119467216544 minutes
Dateframe size prior to dropping stuff: 273327
Order of output objects: lda model, dataframe with train and test data combined


In [119]:
cohe_score_func(lda_1, vectorizer_1['matrix'], vectorizer_1['features'])

(-1.1585778697460793,
 [-1.836654055116084,
  -1.2009414948737098,
  -1.503770357701922,
  -0.7463482378860898,
  -1.2567015732635611,
  -1.186732743050535,
  -0.33669922064323565,
  -1.185688341090434,
  -0.8651699461805268,
  -1.467072727654694])

In [113]:
top_terms = get_top_terms(lda_1.components_, vectorizer_1['features'])
for terms in top_terms:
    print(terms)

['http', 'com', 'www', 'http www', 'gamestop', 'html', '2021', 'youtube', 'watch', 'youtube com']
['buy', 'hold', 'ape', 'just', 'share', 'gme', 'sell', 'like', 'dip', 'stock']
['price', 'day', 'sell', 'buy', 'option', 'volume', 'stock', 'market', 'trade', 'order']
['www', 'http www', 'com', 'http', 'reddit', 'www reddit com', 'www reddit', 'http www reddit', 'reddit com', 'poll']
['share', 'short', '000', 'sell', 'buy', 'price', 'million', 'gme', 'stock', '100']
['short', 'gme', 'squeeze', 'stock', 'just', 'happen', 'know', 'think', 'ape', 'hedge']
['png', 'http', 'x200b', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'auto webp']
['ape', 'just', 'make', 'know', 'like', 'people', 'think', 'fuck', 'post', 'good']
['http', 'com', 'gme', 'reddit', 'www', 'reddit com', 'http www', 'www reddit', 'www reddit com', 'comments']
['market', 'sec', 'trade', 'security', 'http', 'short', 'citadel', 'rule', 'stock', 'fund']


In [114]:
topic_num = 1
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_1[df_1['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
# print('\nTEXTS:')
# for i, text in enumerate(df_temp['selftext']):
#     print("TEXT ", i, ":\n", text, "\n\n\n")
# df_temp

Top Terms:  ['buy', 'hold', 'ape', 'just', 'share', 'gme', 'sell', 'like', 'dip', 'stock'] 

TITLES:
0 .      My totally real positions for the algorithm
1 .      when hedgies & cnbc say sell!!!!!!!!
2 .      Fug you all MODS !!! GME TO THE MOON
3 .      Posting this across the subs to remind all the Apes that this an international community and apes have different backgrounds and opinions. So please keep an open mind and keep it simple. Buy and Hodl
4 .      Don’t listen to anyone with specific advise
5 .      HOLD
6 .      If the floor is $60,000,000 then gme is on a 99.999997% discount today
7 .      Does anyone know if there is a counterfactual wallet coming?!! I have no way of finding out and I can’t seem to find any posts about it! /s
8 .      Ok you fucking crayon eating retards. Listen very fucking closely. I. Will. Say. It. Slowly....
9 .      OK Hear me out!


### 8.3.2 Max Doc Frequency = 0.5

In [115]:
vectorizer_2 = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.50, ngram_range = (1,3), stop_words = 'english')

lda_2, df_2 = prepare_for_topic_naming(vectorizer_2, df_train, df_test, doc_list, doc_list_test)

Time to train LDA: 5.054723053084065 minutes
Doc weight matrix stats:
Train Shape:  (85329, 10)
Example of weights from train doc 1:  [0.01111668 0.01111465 0.01111362 0.01111231 0.01111514 0.14023141
 0.0111125  0.39890918 0.38306091 0.0111136 ] 

Test Shape:  (8710, 10)
Example of weights from test doc 1:  [0.00090124 0.06974434 0.00090127 0.00090095 0.00090119 0.29276439
 0.000901   0.57761735 0.00090106 0.05446721] 

Now populating topic data into dataframes...
Finished Train dataframe, now working on Test dataframe...
Time to finish test/train dataframes: 11.653124084323645 minutes
Dateframe size prior to dropping stuff: 273327
Order of output objects: lda model, dataframe with train and test data combined


In [116]:
cohe_score_func(lda_2, vectorizer_2['matrix'], vectorizer_2['features'])

(-1.1585778697460793,
 [-1.836654055116084,
  -1.2009414948737098,
  -1.503770357701922,
  -0.7463482378860898,
  -1.2567015732635611,
  -1.186732743050535,
  -0.33669922064323565,
  -1.185688341090434,
  -0.8651699461805268,
  -1.467072727654694])

In [117]:
top_terms = get_top_terms(lda_2.components_, vectorizer_2['features'])
for terms in top_terms:
    print(terms)

['http', 'com', 'www', 'http www', 'gamestop', 'html', '2021', 'youtube', 'watch', 'youtube com']
['buy', 'hold', 'ape', 'just', 'share', 'gme', 'sell', 'like', 'dip', 'stock']
['price', 'day', 'sell', 'buy', 'option', 'volume', 'stock', 'market', 'trade', 'order']
['www', 'http www', 'com', 'http', 'reddit', 'www reddit com', 'www reddit', 'http www reddit', 'reddit com', 'poll']
['share', 'short', '000', 'sell', 'buy', 'price', 'million', 'gme', 'stock', '100']
['short', 'gme', 'squeeze', 'stock', 'just', 'happen', 'know', 'think', 'ape', 'hedge']
['png', 'http', 'x200b', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'auto webp']
['ape', 'just', 'make', 'know', 'like', 'people', 'think', 'fuck', 'post', 'good']
['http', 'com', 'gme', 'reddit', 'www', 'reddit com', 'http www', 'www reddit', 'www reddit com', 'comments']
['market', 'sec', 'trade', 'security', 'http', 'short', 'citadel', 'rule', 'stock', 'fund']


In [118]:
topic_num = 1
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_2[df_2['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
# print('\nTEXTS:')
# for i, text in enumerate(df_temp['selftext']):
#     print("TEXT ", i, ":\n", text, "\n\n\n")
# df_temp

Top Terms:  ['buy', 'hold', 'ape', 'just', 'share', 'gme', 'sell', 'like', 'dip', 'stock'] 

TITLES:
0 .      My totally real positions for the algorithm
1 .      when hedgies & cnbc say sell!!!!!!!!
2 .      Fug you all MODS !!! GME TO THE MOON
3 .      Posting this across the subs to remind all the Apes that this an international community and apes have different backgrounds and opinions. So please keep an open mind and keep it simple. Buy and Hodl
4 .      Don’t listen to anyone with specific advise
5 .      HOLD
6 .      If the floor is $60,000,000 then gme is on a 99.999997% discount today
7 .      Does anyone know if there is a counterfactual wallet coming?!! I have no way of finding out and I can’t seem to find any posts about it! /s
8 .      Ok you fucking crayon eating retards. Listen very fucking closely. I. Will. Say. It. Slowly....
9 .      OK Hear me out!


### 8.3.3 Max Doc Frequency = 0.75

In [25]:
vectorizer_3 = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.75, ngram_range = (1,3), stop_words = 'english')

lda_3, df_3 = prepare_for_topic_naming(vectorizer_3, df_train, df_test, doc_list, doc_list_test)

Time to train LDA: 4.979224369007473 minutes

Doc weight matrix stats:
Train Shape:  (85329, 10)
Example of weights from train doc 1:  [0.01111668 0.01111465 0.01111362 0.01111231 0.01111513 0.14021322
 0.0111125  0.39895582 0.38303246 0.0111136 ] 

Test Shape:  (8710, 10)
Example of weights from test doc 1:  [0.00090124 0.06979842 0.00090127 0.00090095 0.00090119 0.2926495
 0.000901   0.57766661 0.00090106 0.05447876] 

Now populating topic data into dataframes...
Finished Train dataframe, now working on Test dataframe...
Time to finish test/train dataframes: 11.349977501761169 minutes
Dateframe size prior to dropping stuff: 273327
Order of output objects: lda model, dataframe with train and test data combined


In [26]:
cohe_score_func(lda_3, vectorizer_3['matrix'], vectorizer_3['features'])

(-1.1585778697460793,
 [-1.836654055116084,
  -1.2009414948737098,
  -1.503770357701922,
  -0.7463482378860898,
  -1.2567015732635611,
  -1.186732743050535,
  -0.33669922064323565,
  -1.185688341090434,
  -0.8651699461805268,
  -1.467072727654694])

In [27]:
top_terms = get_top_terms(lda_3.components_, vectorizer_3['features'])
for terms in top_terms:
    print(terms)

['http', 'com', 'www', 'http www', 'gamestop', 'html', '2021', 'youtube', 'watch', 'youtube com']
['buy', 'hold', 'ape', 'just', 'share', 'gme', 'sell', 'like', 'dip', 'stock']
['price', 'day', 'sell', 'buy', 'option', 'volume', 'stock', 'market', 'trade', 'order']
['www', 'http www', 'com', 'http', 'reddit', 'www reddit com', 'www reddit', 'http www reddit', 'reddit com', 'poll']
['share', 'short', '000', 'sell', 'buy', 'price', 'million', 'gme', 'stock', '100']
['short', 'gme', 'squeeze', 'stock', 'just', 'happen', 'know', 'think', 'ape', 'hedge']
['png', 'http', 'x200b', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'auto webp']
['ape', 'just', 'make', 'know', 'like', 'people', 'think', 'fuck', 'post', 'good']
['http', 'com', 'gme', 'reddit', 'www', 'reddit com', 'http www', 'www reddit', 'www reddit com', 'comments']
['market', 'sec', 'trade', 'security', 'http', 'short', 'citadel', 'rule', 'stock', 'fund']


In [83]:
topic_num = 1
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_3[df_3['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
# print('\nTEXTS:')
# for i, text in enumerate(df_temp['selftext']):
#     print("TEXT ", i, ":\n", text, "\n\n\n")
# df_temp

Top Terms:  ['buy', 'hold', 'ape', 'just', 'share', 'gme', 'sell', 'like', 'dip', 'stock'] 

TITLES:
0 .      My totally real positions for the algorithm
1 .      when hedgies & cnbc say sell!!!!!!!!
2 .      Fug you all MODS !!! GME TO THE MOON
3 .      Posting this across the subs to remind all the Apes that this an international community and apes have different backgrounds and opinions. So please keep an open mind and keep it simple. Buy and Hodl
4 .      Don’t listen to anyone with specific advise
5 .      HOLD
6 .      If the floor is $60,000,000 then gme is on a 99.999997% discount today
7 .      Does anyone know if there is a counterfactual wallet coming?!! I have no way of finding out and I can’t seem to find any posts about it! /s
8 .      Ok you fucking crayon eating retards. Listen very fucking closely. I. Will. Say. It. Slowly....
9 .      OK Hear me out!


### 8.3.4 Max Doc Frequency = 0.75 and extra stop words

In [20]:
swrds = set(ENGLISH_STOP_WORDS).union({"www", "http", "https", "com", "html"})

##Free up some memory
#del df_1, df_2, df_3, lda_1, lda_2, lda_3, vectorizer_1, vectorizer_2, vectorizer_3

In [21]:
vectorizer_4 = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.75, ngram_range = (1,3), stop_words = swrds)

lda_4, df_4 = prepare_for_topic_naming(vectorizer_4, df_train, df_test, doc_list, doc_list_test)

Time to train LDA: 5.0076861899501335 minutes

Doc weight matrix stats:
Train Shape:  (85329, 10)
Example of weights from train doc 1:  [0.30464959 0.01428647 0.01428855 0.01428704 0.01428645 0.01428808
 0.01428771 0.01429102 0.01429353 0.58104158] 

Test Shape:  (8710, 10)
Example of weights from test doc 1:  [0.12458363 0.00089306 0.09146661 0.09389466 0.00089293 0.14412965
 0.08791497 0.00089312 0.00089307 0.4544383 ] 

Now populating topic data into dataframes...
Finished Train dataframe, now working on Test dataframe...
Time to finish test/train dataframes: 11.560699962001914 minutes
Dateframe size prior to dropping stuff: 273327
Order of output objects: lda model, dataframe with train and test data combined


In [22]:
cohe_score_func(lda_4, vectorizer_4['matrix'], vectorizer_4['features'])

(-1.5247413730931416,
 [-2.8594325327854366,
  -1.7481955068476402,
  -1.4666208635086697,
  -1.8943320940468609,
  -0.1491350681085174,
  -0.9986518438049935,
  -1.7966048097977156,
  -2.0940599775792585,
  -1.3430657896752944,
  -0.8973152447770275])

In [55]:
top_terms = get_top_terms(lda_4.components_, vectorizer_4['features'])
for terms in top_terms:
    print(terms)

['000', 'youtube', 'watch', 'data', 'youtube watch', 'twitter', 'gme', 'join', 'er', 'volume']
['reddit', 'comments', 'gme', 'reddit gme', 'gme comments', 'reddit gme comments', 'share', 'amp', 'video', 'utm_source']
['buy', 'hold', 'gme', 'day', 'just', 'price', 'dip', 'ape', 'today', 'hand']
['gamestop', 'stock', 'company', 'like', 'gme', 'game', '2021', 'news', 'like stock', 'cohen']
['png', 'x200b', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'preview redd', 'auto webp']
['post', 'just', 'dd', 'like', 'know', 'ape', 'people', 'think', 'financial', 'gme']
['ape', 'just', 'want', 'share', 'moon', 'gme', 'hodl', 'love', 'like', 'transfer']
['account', 'trade', 'vote', 'market', 'broke', 'security', 'use', 'sec', 'rule', 'tax']
['share', 'short', 'price', 'sell', 'buy', 'stock', 'gme', 'market', 'option', 'cover']
['make', 'money', 'people', 'fuck', 'just', 'know', 'think', 'like', 'time', 'gme']


In [74]:
topic_num = 9
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_4[df_4['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['make', 'money', 'people', 'fuck', 'just', 'know', 'think', 'like', 'time', 'gme'] 

TITLES:
0 .      "We all apes here. No difference" "Money don't care about gender"?
1 .      Clowns are the original Apes...you have much to learn.
2 .      The End of it All
3 .      How Jim Cramer COULD become a legend
4 .      Charity is to also ensure you can be charitable later.
5 .      Charity is not just in the moment
6 .      I'm Holding GameStop to Save the World
7 .      John Parr - St. Elmo's Fire (Man in Motion)
8 .      GME apes it is not about stonks!
9 .      Just now I was reviewing my notes from church this morning, and I thought y’all might like them also.

TEXTS:
TEXT  0 :
 None of us should need to have our difference erased as a price of being a part of a community.

Some people say this is "injecting" race, gender, sexuality, religion, etc., "into" our community, to "divide" it by such notions, when economic interest to bring down HF rigging of the markets, and desir

Unnamed: 0,id,title,selftext,datetime,prime_topic,sec_topic,prime_score,sec_score,topic_scores
44058,m84wvn,"""We all apes here. No difference"" ""Money don't...",None of us should need to have our difference ...,2021-03-19 00:37:14,9,3,0.998725,0.000142,"[0.0001416780930950893, 0.0001416740190074061,..."
71178,mixirp,Clowns are the original Apes...you have much t...,I’m not talking suburban birthday clowns which...,2021-04-03 00:28:26,9,6,0.996917,0.000343,"[0.0003425367461504183, 0.0003424968674162758,..."
84039,nh3qpw,The End of it All,The End of it All\n\n\n \nWhat was it like in ...,2021-05-20 15:50:38,9,4,0.996852,0.00035,"[0.000349758984958592, 0.00034968825905934176,..."
19969,lwyswq,How Jim Cramer COULD become a legend,Note: PLEASE be polite. This is not a thread t...,2021-03-03 17:05:05,9,5,0.995693,0.000479,"[0.0004785593415097094, 0.0004785573003091831,..."
78862,mxnvti,Charity is to also ensure you can be charitabl...,There once was a Rich Man who decided to wonde...,2021-04-24 16:53:32,9,6,0.995186,0.000535,"[0.0005348346940429459, 0.0005347809171031052,..."
79248,mz2kd0,Charity is not just in the moment,\nThere once was a Rich Man who decided to won...,2021-04-26 17:08:55,9,6,0.995186,0.000535,"[0.0005348346940429459, 0.0005347809171031052,..."
55439,mc7zzt,I'm Holding GameStop to Save the World,"I wanna get this straight, I don't consider my...",2021-03-24 15:03:57,9,6,0.994914,0.000565,"[0.0005650124216223938, 0.0005650728179923077,..."
65117,mfhi1s,John Parr - St. Elmo's Fire (Man in Motion),Anyone else got any GME appropriate music to k...,2021-03-29 04:17:29,9,2,0.994478,0.000614,"[0.0006135494863350947, 0.0006135290113283251,..."
44547,m8c0ul,GME apes it is not about stonks!,"It's about people telling us, that we stay poo...",2021-03-19 07:39:59,9,6,0.994444,0.000618,"[0.0006173398842194828, 0.0006173254972320462,..."
48931,ma155j,Just now I was reviewing my notes from church ...,Disclaimer: this is cut/paste from my own serm...,2021-03-21 17:10:49,9,0,0.994302,0.000633,"[0.000633452494631692, 0.0006330226503653952, ..."


In [77]:
topics_4[9] = "Posts Against Hedge Funds and Power by Wealth"

In [78]:
topics_4

{0: 'Unclear Topic - SuperStonk and General GME Data',
 1: 'Posts of URLs with UTM Parameters',
 2: 'Diamond Hands/Hold on GME',
 3: 'News and Press Releases',
 4: 'Meme/Photo Posts',
 5: 'Complaints/Tension about Other Posts/Users',
 6: 'Mix - HODL Nonsense and User Personal Stories',
 7: 'Rules and Regulations',
 8: 'Strategies and Mechanics of Shorting',
 9: 'Posts Against Hedge Funds and Power by Wealth'}

{0: 'Unclear Topic - SuperStonk and General GME Data',
 1: 'Posts of URLs with UTM Parameters',
 2: 'Diamond Hands/Hold on GME',
 3: 'News and Press Releases',
 4: 'Meme/Photo Posts',
 5: 'Complaints/Tension about Other Posts/Users',
 6: 'Mix - HODL Nonsense and User Personal Stories',
 7: 'Rules and Regulations',
 8: 'Strategies and Mechanics of Shorting',
 9: 'Posts Against Hedge Funds and Power by Wealth'}

In [201]:
topics_5

{0: 'Gamestop as a Business/Store',
 1: 'MOD (moderator) Announcements',
 2: 'The GME Short Squeeze',
 3: 'Unclear Topic - Some posts about holding, FINRA, and a lot of external links',
 4: 'Brokerage Accounts',
 5: 'Regulatory Matters (Direct and Indirect)',
 6: 'Mix - GME Price Movements and Nonsense Posts',
 7: "Unity Amoung 'Apes'",
 8: 'General Posts/Knowledge About Understanding Stocks',
 9: 'Meme & Photo Posts'}

### 8.3.5 Max Doc Frequency = 0.75 and even more stop words

In [28]:
swrds = set(ENGLISH_STOP_WORDS).union({"www", "http", "https", "com", "html", "gme", "reddit"})

##Free up some memory
#del df_1, df_2, lda_1, lda_2, vectorizer_1, vectorizer_2

In [29]:
vectorizer_5 = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.75, ngram_range = (1,3), stop_words = swrds)

lda_5, df_5 = prepare_for_topic_naming(vectorizer_5, df_train, df_test, doc_list, doc_list_test)

Time to train LDA: 4.937119523544485 minutes

Doc weight matrix stats:
Train Shape:  (85329, 10)
Example of weights from train doc 1:  [0.01429115 0.01428783 0.44297868 0.01428679 0.01429039 0.01428851
 0.44271058 0.01429144 0.01428833 0.0142863 ] 

Test Shape:  (8710, 10)
Example of weights from test doc 1:  [0.7852872  0.00090931 0.05582975 0.00090955 0.00090944 0.12063742
 0.03278938 0.00090939 0.0009094  0.00090916] 

Now populating topic data into dataframes...
Finished Train dataframe, now working on Test dataframe...
Time to finish test/train dataframes: 11.403484235921253 minutes
Dateframe size prior to dropping stuff: 273327
Order of output objects: lda model, dataframe with train and test data combined


In [30]:
cohe_scores = cohe_score_func(lda_5, vectorizer_5['matrix'], vectorizer_5['features'])
cohe_scores

(-1.6890403270607088,
 [-1.3215131953607304,
  -2.8277339562557864,
  -1.0282691580559715,
  -2.860302209654857,
  -1.6172970296768714,
  -1.7615869762856624,
  -2.4598393171299437,
  -1.2404946818445266,
  -1.6242316782342208,
  -0.1491350681085174])

In [31]:
top_terms = get_top_terms(lda_5.components_, vectorizer_5['features'])
for terms in top_terms:
    print(terms)

['just', 'like', 'think', 'make', 'game', 'know', 'fuck', 'gamestop', 'time', 'people']
['comments', 'share', 'vote', 'post', 'utm_source', 'utm_medium', 'utm_source share', 'join', 'er', 'shareholder']
['share', 'buy', 'price', 'sell', 'short', 'stock', 'hold', 'squeeze', 'just', 'think']
['ape', 'share', 'short', 'poll', 'borrow', 'amp', 'brain', 'video', 'smooth', 'hold']
['share', 'account', 'order', 'transfer', 'trade', 'broke', 'sell', 'fidelity', 'robinhood', 'just']
['market', 'gamestop', 'short', 'company', 'stock', 'fund', 'sec', 'investor', 'citadel', 'news']
['000', 'stock', 'watch', 'like', 'youtube', 'youtube watch', 'like stock', 'tax', '10', '000 000']
['ape', 'just', 'hold', 'know', 'make', 'like', 'people', 'want', 'fuck', 'dd']
['short', 'price', 'day', 'volume', 'option', 'market', 'trade', 'data', 'etf', 'stock']
['png', 'x200b', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'preview redd', 'auto webp']


In [33]:
topic_num = 6
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_5[df_5['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(20)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['000', 'stock', 'watch', 'like', 'youtube', 'youtube watch', 'like stock', 'tax', '10', '000 000'] 

TITLES:
0 .      My Honest Thoughts on GME
1 .      What if we filled the sub with stupidity for bots to feed their algos with shit
2 .      My totally real positions for the algorithm
3 .      JACKED TO THE TITS? Relieve some stress with bubble wrap
4 .      Some bubble wrap to get you through the weekend
5 .      It's gonna be a long and tough week, so have some bubble wrap to relieve the stress
6 .      Mod drama leaves you nervous?
7 .      I can't do DD, so here's some bubble wrap to relive your pre-squeeze stress
8 .      Diamantenhände 💎👐 German market is open! 🇩🇪
9 .      Diamantenhände 💎👐 German market is open 🇩🇪
10 .      She said: how $CUM do you have so much $CUM
11 .      Diamantenhände 💎👐 German market is open! 🇩🇪 Let's Go!
12 .      List of users who posted on r/FreeKarma4U in the past month
13 .      I LIKE THE STOCK
14 .      💎🙌HAPPY FRIDAY 💎🙌🚀HODL TIGHT🚀ST

Unnamed: 0,id,title,selftext,datetime,prime_topic,sec_topic,prime_score,sec_score,topic_scores
63744,melyvr,My Honest Thoughts on GME,I like the stock.\nI like the stock.\nI like t...,2021-03-27 20:29:38,6,0,0.999896,1.2e-05,"[1.1577273603376813e-05, 1.1576787072409193e-0..."
43916,m83h6y,What if we filled the sub with stupidity for b...,Imagine posting your positions like\n\n90@350 ...,2021-03-18 23:14:31,6,2,0.999598,4.5e-05,"[4.468376110568092e-05, 4.4683136475747796e-05..."
50599,matoph,My totally real positions for the algorithm,"69,420@6969 in $CUM 12345@4958 in $FUK 19479@9...",2021-03-22 17:55:44,6,7,0.999552,5e-05,"[4.9751265453959966e-05, 4.9751288853376966e-0..."
69411,mhuvt0,JACKED TO THE TITS? Relieve some stress with b...,HOW JACKED ARE YOU?\n\n>!pop!< >!pop!< >!pop!<...,2021-04-01 13:05:59,6,0,0.998594,0.000156,"[0.00015625207044282297, 0.000156250339488982,..."
63698,melfcp,Some bubble wrap to get you through the weekend,I've hidden a gme share for you since you can'...,2021-03-27 20:02:22,6,3,0.994674,0.000592,"[0.0005917873268629313, 0.0005917448389077283,..."
48101,m9gbyr,"It's gonna be a long and tough week, so have s...",>!pop!< >!pop!< >!pop!< >!pop!< >!pop!< >!pop!...,2021-03-20 21:54:22,6,8,0.994643,0.000595,"[0.0005952792498309672, 0.0005952711938519302,..."
73551,mkht8y,Mod drama leaves you nervous?,Release the tension with some bubble wrap 🤗\n\...,2021-04-05 10:49:06,6,5,0.994512,0.00061,"[0.0006097910194207467, 0.0006097707529695353,..."
50626,matzwx,"I can't do DD, so here's some bubble wrap to r...",And one gme share :p\n\n>!pop!< >!pop!< >!pop!...,2021-03-22 18:09:03,6,0,0.99441,0.000621,"[0.0006211275066162883, 0.0006211239055944567,..."
17759,lv2jb1,Diamantenhände 💎👐 German market is open! 🇩🇪,"Good morning, evening, night everybody!\nLet's...",2021-03-01 08:57:06,6,7,0.990321,0.001076,"[0.0010755178657565728, 0.0010753094869559426,..."
14811,lss7ee,Diamantenhände 💎👐 German market is open 🇩🇪,I will try to update you until the US premarke...,2021-02-26 08:58:26,6,7,0.990215,0.001087,"[0.0010872708206868976, 0.0010870314581921897,..."


In [178]:
#topics_5[1] = "MOD (moderator) Announcements & Related Posts"

{0: 'Gamestop as a Business/Store',
 1: 'MOD (moderator) Announcements',
 2: 'The GME Short Squeeze',
 3: 'Unclear Topic - Some posts about holding, FINRA, and a lot of external links',
 4: 'Brokerage Accounts',
 5: 'Regulatory Matters (Direct and Indirect)',
 6: 'Mix - GME Price Movements and Nonsense Posts',
 7: "Unity Amoung 'Apes'",
 8: 'General Posts/Knowledge About Understanding Stocks',
 9: 'Meme & Photo Posts'}

In [179]:
topics_5

{0: 'Gamestop as a Business/Store',
 1: 'MOD (moderator) Announcements',
 2: 'The GME Short Squeeze',
 3: 'Unclear Topic - Some posts about holding, FINRA, and a lot of external links',
 4: 'Brokerage Accounts',
 5: 'Regulatory Matters (Direct and Indirect)',
 6: 'Mix - GME Price Movements and Nonsense Posts',
 7: "Unity Amoung 'Apes'",
 8: 'General Posts/Knowledge About Understanding Stocks',
 9: 'Meme & Photo Posts'}

In [180]:
for i, key in enumerate(topics_5.keys()):
    top = topics_5[key]
    cohe = cohe_scores[1][i]
    avg = df_5[df_5['prime_topic'] == key]['prime_score'].mean()
    count = df_5[df_5['prime_topic'] == key]['prime_score'].count()
    print("Topic ", key, " is assigned to ", count, " documents - ", top)
    print("Avg. probability: ", avg, ", Coherence: ", cohe, "\n")

Topic  0  is assigned to  18939  documents -  Gamestop as a Business/Store
Avg. probability:  0.6787104416858851 , Coherence:  -1.3215131953607304 

Topic  1  is assigned to  1675  documents -  MOD (moderator) Announcements
Avg. probability:  0.6583250734008729 , Coherence:  -2.8277339562557864 

Topic  2  is assigned to  16475  documents -  The GME Short Squeeze
Avg. probability:  0.6575262822715986 , Coherence:  -1.0282691580559715 

Topic  3  is assigned to  2738  documents -  Unclear Topic - Some posts about holding, FINRA, and a lot of external links
Avg. probability:  0.7002950393364135 , Coherence:  -2.860302209654857 

Topic  4  is assigned to  8159  documents -  Brokerage Accounts
Avg. probability:  0.6888710644153886 , Coherence:  -1.6172970296768714 

Topic  5  is assigned to  5104  documents -  Regulatory Matters (Direct and Indirect)
Avg. probability:  0.6141212755824617 , Coherence:  -1.7615869762856624 

Topic  6  is assigned to  2196  documents -  Mix - GME Price Moveme

In [187]:
df_5['topic'] = df_5['prime_topic'].map(topics_5)

In [193]:
df_5 = df_5[['id', 'datetime', 'topic', 'prime_topic', 'sec_topic',
       'prime_score', 'sec_score', 'topic_scores']]
df_5.to_csv('df_test_train_split.csv', index = False)


In [195]:
df_train = df_5[:85329]
df_test = df_5[85329:]
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')

In [196]:
display(df_train.tail())
display(df_test.head())


Unnamed: 0,id,datetime,topic,prime_topic,sec_topic,prime_score,sec_score,topic_scores
85324,nome3s,2021-05-30 22:35:42,Unity Amoung 'Apes',7,0,0.980848,0.002128,"[0.002128304684863511, 0.002127781471764919, 0..."
85325,nommqz,2021-05-30 22:48:48,Gamestop as a Business/Store,0,2,0.751451,0.174012,"[0.7514508411864471, 0.002564312278887224, 0.1..."
85326,non7oj,2021-05-30 23:21:44,Brokerage Accounts,4,2,0.949991,0.005558,"[0.005556259124428676, 0.005555875059221204, 0..."
85327,noni9a,2021-05-30 23:38:37,The GME Short Squeeze,2,4,0.819958,0.020016,"[0.02000356492351113, 0.02000177475707879, 0.8..."
85328,nonse9,2021-05-30 23:55:19,Gamestop as a Business/Store,0,4,0.635887,0.297435,"[0.6358869132371086, 0.0083337992761728, 0.008..."


Unnamed: 0,id,datetime,topic,prime_topic,sec_topic,prime_score,sec_score,topic_scores
85329,noowbp,2021-05-31 01:11:56,Gamestop as a Business/Store,0,5,0.769623,0.165582,"[0.7696227622257193, 0.0032261357315304722, 0...."
85330,nooak2,2021-05-31 01:37:25,Unity Amoung 'Apes',7,5,0.681424,0.189137,"[0.00019807799508087112, 0.04658532680763216, ..."
85331,noqwau,2021-05-31 02:45:04,Unity Amoung 'Apes',7,0,0.918168,0.009095,"[0.009095278092563842, 0.009092895037566811, 0..."
85332,noqrj7,2021-05-31 02:51:04,Gamestop as a Business/Store,0,1,0.460921,0.340859,"[0.46092098211601934, 0.3408594296444331, 0.00..."
85333,not55z,2021-05-31 04:07:41,Regulatory Matters (Direct and Indirect),5,4,0.251515,0.208806,"[0.00017246204644892862, 0.015948126777382807,..."


Index(['id', 'title', 'selftext', 'datetime', 'prime_topic', 'sec_topic',
       'prime_score', 'sec_score', 'topic_scores', 'topic'],
      dtype='object')