**Import libraries and packages**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

**Read the data from the file**

In [2]:
### read data ###
# Load the tweet dataset from a CSV file located next to the notebook.
DATA_PATH = 'depression.csv'
data = pd.read_csv(DATA_PATH)

An example of what a tweet looks like:

In [3]:
# Show the raw text of the tweet whose index label is 4444.
data['text'][4444]

"New research out finds that #acupuncture reduces #depression &amp; enhances SSRI performance https://t.co/TIBa8O94vm \r\r\nWe've always known how beneficial it is, so its good to have the evidence to back it up. Acupuncture can really help!\r\r\n\r\r\n#mentalhealth https://t.co/OtkjCKTKX4"

A sneak-peek at the data:

In [4]:
# Preview the first rows of the raw data to see which columns are available.
data.head()

Unnamed: 0.1,Unnamed: 0,user_ID,followers_count,user_location,user_description,date,text,hashtags
0,0,1390286708755808256,53,"Barcelona, Spain","For 34 years, Psych Congress has served as a u...",2021-10-27 11:45:02,Extended abstract submission deadline is TODAY...,
1,1,2350451402,30626,"Sacramento, California",Believer in creating a healthy game plan to fe...,2021-10-27 11:39:41,"üåüThis Fall and if you can, intentionally spend...",['anxiety']
2,2,3345539973,829,"Pittsburgh, PA","Founder Motivation Champs, Publisher üìö, Screen...",2021-10-27 11:38:22,Proud to welcome Stephanie Kunkel to the Motiv...,
3,3,11740932,3089,"Toronto, Canada",Self healing journey from ‚úùÔ∏è trauma to spiritu...,2021-10-27 11:35:33,How much energy do you put into keeping up app...,['travelblogger']
4,4,4008473487,417,"London, England",rTMS treatment across the UK and Ireland.\r\r\...,2021-10-27 11:35:03,Did you know that we treat seasonal affective ...,"['seasonalaffectivedisorder', 'SAD']"


**Data Cleaning**

1. Remove links
2. Remove '@' and usernames
3. Remove retweets
4. Remove English words that do not contribute to the meaning of the sentence (and, or, while, etc.)
5. Remove words that appear in almost every tweet ('depression', 'depress', etc.)
6. Remove punctuation
7. Remove double-spacing
8. Remove numbers
9. Keep only the stem of each word (e.g., depressive, depression, depressing, and depressed are all reduced to the same stem, 'depress')
10. Check for words that appear together

In [5]:
### data cleaning ###
def remove_links(tweet):
    '''Takes a string and removes web links from it.

    Removes, in order: anything starting with ``http`` up to the next
    whitespace, ``bit.ly/...`` short links, and every literal ``[link]``
    placeholder. No other characters are touched.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with the links removed.
    '''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    # escape the dot so that only a real "bit.ly/" prefix matches
    tweet = re.sub(r'bit\.ly/\S+', '', tweet) # remove bitly links
    # str.strip('[link]') would strip any of the characters "[", "l", "i",
    # "n", "k", "]" from both ends of the tweet (eating real words);
    # replace the literal placeholder instead.
    tweet = tweet.replace('[link]', '') # remove [link] placeholders
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information.

    Removes ``RT @handle`` retweet markers, then any remaining ``@handle``
    mention, then every ``#`` character (the hashtag word itself is kept).

    Handles are matched as ``@`` followed by one or more ASCII letters,
    digits, underscores or hyphens, so handles that start with a digit or an
    underscore, or that are a single character long, are removed as well.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text without retweet markers, mentions and ``#`` signs.
    '''
    handle = r'@[A-Za-z0-9_-]+'
    tweet = re.sub(r'RT\s' + handle, '', tweet) # remove retweet
    tweet = re.sub(handle, '', tweet) # remove tweeted at
    tweet = tweet.replace('#', '') # keep the hashtag word, drop the sign
    return tweet

# Corpus-specific tokens to drop in addition to NLTK's English stop words.
# NOTE(review): clean_tweet removes '#' and '&' and filters stop words *before*
# stemming, so only '', 'amp' and a literal 'depress' token can ever match;
# words such as 'depression' are not removed by this list.
repeated_words = ['','depress', '#depression', '#depressio', '&amp', 'amp']
my_stopwords = stopwords.words('english') + repeated_words

# Bound ``stem`` method of the stemmer, used to reduce each token to its root.
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem

# Characters treated as punctuation; clean_tweet places this string verbatim
# inside a regex character class.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`‚Äô{|}~‚Ä¢@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    '''Normalise one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: remove retweet markers, @mentions and '#' signs; remove
    links; lower-case; replace punctuation with spaces; collapse whitespace;
    delete digits; drop stop words; stem the remaining tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) yields an empty string instead of raising inside ``re``.
    bigrams : bool, default False
        When True, every pair of adjacent tokens joined by '_' is appended
        after the single tokens.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    '''
    if not isinstance(tweet, str):
        return ''
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('[' + my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet) # remove double spacing
    tweet = re.sub(r'[0-9]+', '', tweet) # remove numbers
    # Splitting on a single space can yield empty tokens (leading/trailing
    # space, or a number that was just deleted); '' is part of my_stopwords,
    # so those are filtered out here together with the real stop words.
    tweet_token_list = [word for word in tweet.split(' ')
                        if word not in my_stopwords] # remove stopwords

    # tokens that still carry a '#' are kept verbatim, everything else is stemmed
    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list + [
            tweet_token_list[i] + '_' + tweet_token_list[i + 1]
            for i in range(len(tweet_token_list) - 1)
        ]
    return ' '.join(tweet_token_list)

Save the original tweets + the cleaned up tweets.

In [6]:
# Do not pass `True` positionally here: the second positional parameter of
# Series.apply is `convert_dtype`, not an argument forwarded to clean_tweet, so
# it never switched bigrams on (and that positional form is deprecated in
# recent pandas). This call produces the same unigram tokens explicitly; use
# `data['text'].apply(clean_tweet, bigrams=True)` if bigram tokens are wanted.
data['clean_tweet'] = data['text'].apply(clean_tweet)

In [7]:
# Drop the leftover index column from the CSV. errors="ignore" keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

Turn the words into vectors so that machine learning algorithms can process them.

In [8]:
### Vectorize ###

# the vectorizer object will be used to transform text to vector form
# max_df=0.9 ignores terms found in more than 90% of the tweets;
# min_df=25 ignores terms found in fewer than 25 tweets.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# apply transformation (dense array: one row per tweet, one column per term)
tf = vectorizer.fit_transform(data['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); support both so the cell runs on either version.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = vectorizer.get_feature_names()

Apply the machine learning algorithm (NMF) to find the main 5 topics in all tweets:

In [9]:
### Apply NMF topic modeling ###
# Factorise the term-count matrix into `number_of_topics` topics.
# NOTE(review): the `alpha` argument is deprecated in newer scikit-learn
# releases (replaced by `alpha_W` / `alpha_H`, which are scaled differently);
# confirm the installed version before upgrading, as results may change.
number_of_topics = 5
model_nmf = NMF(
    n_components=number_of_topics,
    random_state=0,  # fixed seed so the topics are reproducible
    alpha=.1,
    l1_ratio=.5,
)
model_nmf.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=5, random_state=0)

In [10]:
### show topic modeling ###
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight array per topic.
    feature_names : sequence
        ``feature_names[i]`` is the word for weight column ``i``.
    no_top_words : int
        How many words to list per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``"Topic <k> words"``, with the words
        ordered from highest to lowest weight.
    """
    columns = {}
    for k, weights in enumerate(model.components_):
        # indices sorted by ascending weight, reversed, then truncated
        best_first = weights.argsort()[::-1][:no_top_words]
        columns["Topic %d words" % k] = [f'{feature_names[j]}' for j in best_first]
    return pd.DataFrame(columns)

Show what words appear most often in the 5 classified topics:

In [11]:
# Number of highest-weighted words to list for each topic.
no_top_words =  10
display_topics(model_nmf, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 1 words,Topic 2 words,Topic 3 words,Topic 4 words
0,il,acupunctur,mn,anxieti,live
1,depressio,mentalhealth,anxieti,skype,quot
2,trauma,help,mentalhealthmatt,onlinetherapi,podcast
3,chang,find,disord,therapist,lgbtq
4,affect,new,studi,onlinecounsel,nba
5,post,good,new,onlinepsychotherapi,radio
6,head,realli,pandem,onlin,iheart
7,loss,research,world,mentalhealth,kyri
8,associ,back,increas,therapi,freepj
9,traumat,alway,million,see,irv


From here, we infer the name of each topic/category based on the 10 top words that the algorithm has identified for it.

- **Topic 0 (Experience/Symptom)**: Given its most frequent words, like il (most likely a root for the word illness), affect, change, etc., it seems to be describing the symptoms or experience of depression. 
- **Topic 1 (Treatment tips)**: Given the words like find, good, acupuncture, etc., this topic seems to be describing the treatment options or treatment advice for depression.
- **Topic 2 (News/Publications)**: Given the words like study, new, world, etc., this topic seems to be describing the news or publications about depression.
- **Topic 3 ((Online) Counseling)**: Given the words like online, counseling, therapist, etc., this topic seems to be describing therapy/counseling options, especially online.
- **Topic 4 (Irrelevant)**: This topic seems to be gathering a lot of irrelevant keywords, which we will categorize as simply 'irrelevant'.

In [12]:
# category 0: Experience/Symptom
# category 1: Treatment tips
# category 2: News/Publications
# category 3: (Online) Counseling
# category 4: Irrelevant

We add the labels of the topics in our original data:

In [13]:
# Topic weights for every tweet: row n holds the weights of tweet n,
# with one entry per topic of the fitted NMF model.
doc_topic = model_nmf.transform(tf)

In [14]:
# Human-readable label for each topic; position i is the label for topic i.
cat_name = ['Experience/Symptom', 'Treatment tips', 'News/Publications', '(Online) Counseling', 'Irrelevant']

In [15]:
# For every tweet pick the topic with the largest weight (ties resolve to the
# lowest topic index), then look up that topic's label.
# NOTE(review): a tweet whose weights are all zero therefore lands in topic 0.
classes = list(doc_topic.argmax(axis=1))
categories = [cat_name[topic_idx] for topic_idx in classes]

In [16]:
# Attach the inferred topic label and its numeric topic index to each tweet.
data['category'] = categories
data['class'] = classes

In [17]:
# Preview the original text next to the cleaned text and the assigned topic.
data[['user_description', 'text', 'hashtags', 'clean_tweet', 'category', 'class']].head()

Unnamed: 0,user_description,text,hashtags,clean_tweet,category,class
0,"For 34 years, Psych Congress has served as a u...",Extended abstract submission deadline is TODAY...,,extend abstract submiss deadlin today‚ùó add fin...,News/Publications,2
1,Believer in creating a healthy game plan to fe...,"üåüThis Fall and if you can, intentionally spend...",['anxiety'],üåüthi fall intent spend time natur ‚ñ™Ô∏èit help de...,(Online) Counseling,3
2,"Founder Motivation Champs, Publisher üìö, Screen...",Proud to welcome Stephanie Kunkel to the Motiv...,,proud welcom stephani kunkel motiv champ famil...,News/Publications,2
3,Self healing journey from ‚úùÔ∏è trauma to spiritu...,How much energy do you put into keeping up app...,['travelblogger'],much energi put keep appear ‚Å† se jump ‚Å† travel...,(Online) Counseling,3
4,rTMS treatment across the UK and Ireland.\r\r\...,Did you know that we treat seasonal affective ...,"['seasonalaffectivedisorder', 'SAD']",know treat season affect disord seasonalaffect...,News/Publications,2


**An example of Categorization in topics**

We select a random tweet: namely, tweet 5478.

In [18]:
# Raw text of the example tweet (index label 5478).
data.loc[5478, 'text']

'Talk to an Online Therapist for help with #Addiction. To recover fully you must heal the underlying emotional pain that fuels your addiction. See: https://t.co/ga8WHFMkiw #addictionrecovery #AddictionTreatment #alcoholism #depression'

Its category has been selected as (Online) Counseling, which seems to be adequate given the tweet content:

In [19]:
# Topic label assigned to the example tweet (index label 5478).
data.loc[5478, 'category']

'(Online) Counseling'