In [143]:
import os
import csv
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

In [194]:
subreddit, year = 'Depression_Help', 2020

In [195]:
df = pd.read_csv(f'{subreddit}_data/raw_data_{year}/01Jan{year}.csv').drop(['Unnamed: 0'], axis=1)
for file in tqdm(os.listdir(f'{subreddit}_data/raw_data_{year}')):
    if file[-3:] == 'csv':
        df_new = pd.read_csv(f'{subreddit}_data/raw_data_{year}/{file}').drop(['Unnamed: 0'], axis=1)
        df_new.time = df_new.time.apply(lambda x: file[:-4])
        df = pd.concat([df, df_new]).reset_index(drop=True)
df = df[~df['text'].isin(['[deleted]', '[removed]'])]
df.head()

100%|██████████| 183/183 [00:00<00:00, 245.06it/s]


Unnamed: 0,text,time,score
0,I’d like some advice. One of my best friends i...,01Jan2020,1
1,i desperately need help.\n\nhow do i feel bett...,01Jan2020,1
2,"I'm not at any urgent risk, I just feel like I...",01Jan2020,1
3,For the past year I’ve had thoughts of death a...,01Jan2020,1
4,My girlfriend for 7 months broke up with me an...,01Jan2020,1


## Data Cleaning

In [196]:
stopeng = stopwords.words('english')
ps = PorterStemmer()

def clean_data(x):
    # convert to lowercase
    x = x.lower()
    
    # tokenize sentence
    x = word_tokenize(x)
#     x = x.split(' ')
    
    # remove stop words
    x = [word for word in x if word not in stopeng]

    # remove punctuation
    x = [word for word in x if any(letter.isalnum() for letter in word)]
    
    # stemming
#     x = [ps.stem(word) for word in x] 
    
    # rejoin text
    x = ' '.join(x)
    
    return x

In [197]:
df.text = df.text.apply(clean_data)

#### Comparison: Clean vs. Raw Data

In [198]:
# clean data
df.text.iloc[0]

'like advice one best friends 10 states away struggling depression suicidal thoughts want helpful show love tough far away texting/calling advice help thought sending gift cards days struggles get bed worry seem insensitive do/avoid thanks'

In [199]:
# raw data
pd.read_csv(f'{subreddit}_data/raw_data_{year}/01Jan{year}.csv').text.iloc[0]

'I’d like some advice. One of my best friends is 10 states away and struggling with depression and suicidal thoughts. I want to be helpful and show love but it’s tough to be so far away. \n\nOther than texting/calling more do you have any advice on how I can help? \n\nI thought of sending gift cards but some days he struggles to get out of bed and I worry that will seem insensitive. \n\nWhat should I do/avoid doing? \n\nThanks'

## Latent Dirichlet Allocation 

In [200]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis

In [201]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [202]:
# Tweak the two parameters below
num_topics = 6
num_words = 12
stopeng.extend(['like', "'s", "'re", '\n'])

count_vectorizer = CountVectorizer(stop_words=stopeng)
count_data = count_vectorizer.fit_transform(df.text)
    
lda = LDA(n_components=num_topics, random_state=42, n_jobs=-1)
lda.fit(count_data)
    
LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
pyLDAvis.save_html(LDAvis_prepared, f'figures/LDA/{subreddit}_{year}_LDA.html')
pyLDAvis.display(LDAvis_prepared)

In [203]:
# Print the topics found by the LDA model
print("\nTopics found via LDA - self text =============================")
print_topics(lda, count_vectorizer, num_words)



Topic 1:
know feel want get really time would even friends one life years

Topic 2:
im dont want help know need feel talk really get someone cant

Topic 3:
depression https would know com help anxiety get people www amp could

Topic 4:
feel life even know want people things ca one never anything time

Topic 5:
depression get feel time know really help things work anxiety years need

Topic 6:
get job life people going work time school help college really good
