In [3]:
import os
import csv
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load every per-day CSV export for 2017-2020 into a single frame.
# Each file is read exactly once: the earlier version read 01Jan2017.csv up front
# and then again inside the loop, so that file's rows ended up in `df` twice.
frames = []
for year in range(2017, 2021):
    folder = f'raw_data_{year}'
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible
    for file in sorted(os.listdir(folder)):
        if not file.endswith('.csv'):
            continue
        df_new = pd.read_csv(os.path.join(folder, file)).drop(['Unnamed: 0'], axis=1)
        # overwrite `time` with the file name minus its ".csv" suffix (e.g. "01Jan2017")
        df_new['time'] = file[:-4]
        frames.append(df_new)

# One concat at the end instead of re-copying the growing frame on every iteration
df = pd.concat(frames).reset_index(drop=True)
# Drop posts whose body is the '[deleted]' placeholder
df = df[df['text'] != '[deleted]']
df.head()

Unnamed: 0,text,time,score
0,"I don't know how many of you read it, but a fe...",01Jan2017,6
2,The reason that it occurred to me that I may h...,01Jan2017,1
3,Sometimes I don't really feel like a participa...,01Jan2017,5
6,"Hi, and thanks for reading this post. I'm look...",01Jan2017,1
8,"Loaded question I know, trying to scribble thi...",01Jan2017,7


## Data Cleaning

In [4]:
stopeng = stopwords.words('english')
ps = PorterStemmer()  # used by clean_data only when stem=True

def clean_data(x, stem=False):
    """Lowercase, tokenize and filter a single piece of text.

    Parameters
    ----------
    x : str
        Raw text. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) is treated as empty text.
    stem : bool, default False
        If True, apply Porter stemming (`ps.stem`) to every kept token.

    Returns
    -------
    str
        The kept tokens joined by single spaces. Tokens listed in `stopeng`
        and tokens without any alphanumeric character are dropped.
    """
    if not isinstance(x, str):
        return ''

    # A set gives O(1) membership tests; it is rebuilt on each call so that
    # later changes to the module-level `stopeng` list are still honoured.
    stop_set = set(stopeng)

    # convert to lowercase, then tokenize
    tokens = word_tokenize(x.lower())

    # remove stop words and pure-punctuation tokens in one pass
    kept = [
        tok for tok in tokens
        if tok not in stop_set and any(ch.isalnum() for ch in tok)
    ]

    # optional stemming (off by default)
    if stem:
        kept = [ps.stem(tok) for tok in kept]

    # rejoin text
    return ' '.join(kept)

In [5]:
# Clean only the `text` column, one post at a time. DataFrame.apply would pass
# whole columns (Series) to clean_data, and a Series has no .lower() method.
df = df.assign(text=df['text'].apply(clean_data))

100%|██████████| 24/24 [03:18<00:00,  8.26s/it]


#### Comparison: Clean vs. Raw Data

In [7]:
# Cleaned text: first five rows of the first entry in `data`
# NOTE(review): `data` is not defined in the visible cells - confirm where it is built.
data[0].head(5)

0    everyone I know is either struggling themself ...
1    So I wanna know if the guy who raped me was an...
2                                            [removed]
3    I have come to the conclusion I have way to hi...
4    So I’ve gone to my parents about anxiety and a...
Name: text, dtype: object

In [8]:
# Raw text for comparison: first five posts read straight from the CSV export
pd.read_csv('raw_data/Feb1.csv')['text'].head()

0    everyone I know is either struggling themself ...
1    So I wanna know if the guy who raped me was an...
2                                            [removed]
3    I have come to the conclusion I have way to hi...
4    So I’ve gone to my parents about anxiety and a...
Name: text, dtype: object

## Latent Dirichlet Allocation 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis

In [10]:
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model
        Fitted model exposing `components_`, one row of per-word weights per topic.
    count_vectorizer
        Fitted vectorizer whose feature names are indexed like the columns
        of `model.components_`.
    n_top_words : int
        Number of words to print per topic, in descending weight order.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only for older installations.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to get the top weights
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(f"\nTopic {topic_idx + 1}:")
        print(" ".join(words[i] for i in top_indices))

  and should_run_async(code)


In [15]:
# Tweak the two parameters below
num_topics = 6
num_words = 12

# 'like' is added as an extra stop word. The guard keeps the cell idempotent:
# re-running it no longer appends another copy to the shared `stopeng` list.
if 'like' not in stopeng:
    stopeng.append('like')

# Make sure the output folder exists before any HTML report is written into it
os.makedirs('figures/LDA', exist_ok=True)

# One LDA model per (documents, label) pair.
# NOTE(review): `data` and `file_names` are not defined in the visible cells - confirm their origin.
# The loop variable is `texts` rather than `df` so the DataFrame named `df` is not overwritten.
for texts, date in tqdm(zip(data, file_names), total=min(len(data), len(file_names))):
    count_vectorizer = CountVectorizer(stop_words=stopeng)
    count_data = count_vectorizer.fit_transform(texts)

    # fixed random_state keeps the topics reproducible between runs
    lda = LDA(n_components=num_topics, random_state=42, n_jobs=-1)
    lda.fit(count_data)

    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    pyLDAvis.save_html(LDAvis_prepared, f'figures/LDA/{date}_LDA.html')
    # The former bare pyLDAvis.display(...) call was dropped: inside a loop its
    # return value was discarded, so nothing was rendered.

# After the loop, `lda`, `count_vectorizer`, `count_data` and `LDAvis_prepared`
# hold the objects from the final iteration.

  and should_run_async(code)
100%|██████████| 24/24 [03:28<00:00,  8.70s/it]


In [16]:
# Show the top `num_words` terms of each topic for the current `lda` model
# (the one left over from the last iteration of the fitting loop)
print("\nTopics found via LDA - self text =============================")
print_topics(model=lda, count_vectorizer=count_vectorizer, n_top_words=num_words)



Topic 1:
feel people want know even life help time get need much think

Topic 2:
get time want one know feel never go life even years really

Topic 3:
feel know even really people life time want get help mental something

Topic 4:
feel know really get im even anxiety would want something things day

Topic 5:
https health mental com www youtube feeling time youtu sandy work us

Topic 6:
feel know time get people really would want life think things always


  and should_run_async(code)
