**Import libraries and packages**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

**Read the data from the file**

In [2]:
### read data ###
data = pd.read_csv('ocd.csv')

One example of how a tweet looks like:

In [3]:
data.text[1000]

'Admitting health anxiety https://t.co/0LJdlfdSjJ \r\r\n\r\r\n#factitiousdisorder #malingering #OCD #hypnotherapy #hypnotherapist  #hypnosis #hypnotist #cardiff https://t.co/3KORilvwlF'

A sneak-peek at the data:

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,user_ID,followers_count,user_location,user_description,date,text,hashtags
0,0,829318199468490752,294,Eora Nation,PhD candidate in gender and cultural studies @...,2021-10-27 11:59:28,New research out finds that #acupuncture reduc...,
1,1,415086641,432,London,Children’s Book Illustrator 🎨 & former @pathwa...,2021-10-27 11:51:28,New research out finds that #acupuncture reduc...,['ocd']
2,2,1311981838026657792,17,,One of the greatest regrets in life is being w...,2021-10-27 11:26:01,New research out finds that #acupuncture reduc...,['oknottobeok']
3,3,89962967,29198,"Boulder CO, USA",Peter Strong - Psychotherapist specializing in...,2021-10-27 11:25:00,New research out finds that #acupuncture reduc...,['OCD']
4,4,885311783795212288,981,Hell,A free thinker and freelance writer\r\r\n\r\r\...,2021-10-27 11:06:55,New research out finds that #acupuncture reduc...,


**Data Cleaning**

1. Remove links
2. Remove '@' and usernames
3. Remove retweets
4. Remove english words that do not contribute to the meaning of the sentence (and, or, while, etc.)
5. Remove words that appear in almost every tweet ('ocd').
6. Remove punctuation
7. Remove double-spacing
8. Remove numbers
9. Keep the stem of the words (i.e therapeutic, therapy, therapist, therapies - will all be recognized as therapy)
10. Check for words that appear together

In [5]:
### data cleaning ###
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    tweet = re.sub ('#', '', tweet)
    return tweet

my_stopwords = nltk.corpus.stopwords.words('english')
repeated_words = ['first','last', 'make', '#ocd', '&amp', 'amp', 'non', 'ever', 
                  'got', 'via', 'best', 'row', 'know', 'want']
my_stopwords.extend(repeated_words)
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`’{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

Save the original tweets + the cleaned up tweets.

In [6]:
data['clean_tweet'] = data.text.apply(clean_tweet, True)

In [7]:
data = data.drop(columns="Unnamed: 0")

Turn words into vectors to allow machine process algorithms to process it.

In [8]:
### Vectorize ###

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(data['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

Apply the machine learning algorithm (NMF) to find the main 5 topics in all tweets:

In [9]:
### Apply NMF topic modeling ###
number_of_topics = 5
model_nmf = NMF(n_components=number_of_topics, random_state=0, alpha=.1, l1_ratio=.5)
model_nmf.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=5, random_state=0)

In [10]:
### show topic modeling ###
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        # topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        # for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

Show what words appear most often in the 5 classified topics:

In [11]:
no_top_words =  10
display_topics(model_nmf, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 1 words,Topic 2 words,Topic 3 words,Topic 4 words
0,onlin,acupunctur,read,time,anxieti
1,mind,help,fiction,night,depress
2,therapi,mentalhealth,hard,manag,mentalhealth
3,see,new,die,issu,bipolar
4,skype,realli,hit,fuck,ptsd
5,ocdrecoveri,depress,stephen,attempt,bpd
6,contact,alway,pray,case,health
7,overcom,back,fabul,stay,adhd
8,ocdawar,find,tough,slight,❤️
9,therapist,research,gold,redo,add


From here, we infer the name of the topic/category based on the 7 most frequent words that the algorithm has identified.

- **Topic 0 (Counseling/Recovery)**: Given its most frequent words, like onlin, skype, therapy, recovery etc., it seems to be describing the counseling options or things related to OCD recovery. 
- **Topic 1 (News/Publication)**: Given the words like find, acupuncture, research, new etc., this topic seems to be describing new findings on the treatment of OCD.
- **Topic 2 (Experience)**: Given the words like manage, issue, attept, etc., this topic seems to be describing the struggles or experiences of OCD.
- **Topic 3 (Reading)**: Given the words like read, fiction, pray, etc., this topic seems to be talking about plots of books, or generally reading, which could also include religious reading.
- **Topic 4 (Co-ocurring Mental Disorders)**: This topic seems to be gathering a lot of mental disorders keywords, which means that it must be showing co-ocurring mental disorders.

In [12]:
# category 0: Counseling/Recovery
# category 1: News/Publication
# category 2: Experience
# category 3: Reading
# category 4: Co-ocurring Mental Disorders

We add the labels of the topics in our original data:

In [13]:
doc_topic = model_nmf.transform(tf)

In [14]:
cat_name = ['Counseling/Recovery',  'News/Publications', 'Experience', 'Reading', 'Co-ocurring Mental Disorders']

In [15]:
classes = []
categories = []
for n in range(doc_topic.shape[0]):
    topic_doc = doc_topic[n].argmax()
    classes.append(topic_doc)
    categories.append(cat_name[topic_doc])
    # print("Document", n+1, "Topic", topic_doc)

In [16]:
data['category'] = categories
data['class'] = classes

In [17]:
data[['user_description', 'text', 'hashtags', 'clean_tweet', 'category', 'class']].tail()

Unnamed: 0,user_description,text,hashtags,clean_tweet,category,class
2858,DBT/mindfulness-based counseling & education; ...,"#therapistsconnect colleagues and others, plea...","['therapistsconnect', 'ocd', 'bpd']",therapistsconnect colleagu other pleas note re...,Co-ocurring Mental Disorders,4
2859,Introverted Extrovert. Drama Teacher @perthaca...,This is on point. #OCD #Anxiety #Exposure #ERP...,"['OCD', 'Anxiety', 'Exposure', 'ERP']",point ocd anxieti exposur erp,Co-ocurring Mental Disorders,4
2860,"Rehabilitation center for #Mentalillness,#drug...",Nearly 2% of the population in the country suf...,,nearli popul countri suffer ocd commonli seen...,Counseling/Recovery,0
2861,"I’m not sarcastic, I’m British",I have to have my house super organized and cl...,"['OCD', 'perfectionist']",hous super organ clean die one judg 🤷🏼‍♀️ ocd ...,Experience,2
2862,AKA Shiftrox\r\r\nBLOG | LIFE | GAMES | FUN 🧠 ...,How Is The Proof Of Brain Community Changing Y...,"['hive', 'ecency', 'proofofbrain']",proof brain commun chang life hive ecenc proof...,Co-ocurring Mental Disorders,4


**An example of Categorization in topics**

We select a random tweet: namely, tweet 111.

In [18]:
data['text'][111]

"New research out finds that #acupuncture reduces #depression &amp; enhances SSRI performance https://t.co/TIBa8O94vm \r\r\nWe've always known how beneficial it is, so its good to have the evidence to back it up. Acupuncture can really help!\r\r\n\r\r\n#mentalhealth https://t.co/OtkjCKTKX4"

Its category has been selected as News/Publications, which seems to be adequate given the tweet content:

In [19]:
data['category'][111]

'News/Publications'