**Import libraries and packages**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

**Read the data from the file**

In [2]:
### read data ###
data = pd.read_csv('data.csv')

One example of how a tweet looks like:

In [3]:
data.text[789]

'Suffering from Obsessive thinking around your health? I recommend Online Mindfulness Therapy via Skype for Health Anxiety and Hypochondria. Visit: https://t.co/a34MmnaE1i #OCD #anxiety #panicattacks #OnlineTherapistForOCD #intrusivethoughts'

A sneak-peek at the data:

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,user_ID,followers_count,user_location,user_description,date,text,hashtags
0,0,829318199468490752,294,Eora Nation,PhD candidate in gender and cultural studies @...,2021-10-27 11:59:28,New research out finds that #acupuncture reduc...,
1,1,415086641,432,London,Childrenâ€™s Book Illustrator ðŸŽ¨ & former @pathwa...,2021-10-27 11:51:28,New research out finds that #acupuncture reduc...,['ocd']
2,2,1311981838026657792,17,,One of the greatest regrets in life is being w...,2021-10-27 11:26:01,New research out finds that #acupuncture reduc...,['oknottobeok']
3,3,89962967,29198,"Boulder CO, USA",Peter Strong - Psychotherapist specializing in...,2021-10-27 11:25:00,New research out finds that #acupuncture reduc...,['OCD']
4,4,885311783795212288,981,Hell,A free thinker and freelance writer\r\r\n\r\r\...,2021-10-27 11:06:55,New research out finds that #acupuncture reduc...,


In [5]:
data.shape # number of datapoints x number of columns

(30398, 8)

**Data Cleaning**

1. Remove links
2. Remove '@' and usernames
3. Remove retweets
4. Remove english words that do not contribute to the meaning of the sentence (and, or, while, etc.)
5. Remove words that appear in almost every tweet ('ocd').
6. Remove punctuation
7. Remove double-spacing
8. Remove numbers
9. Keep the stem of the words (i.e therapeutic, therapy, therapist, therapies - will all be recognized as therapy)
10. Check for words that appear together

In [6]:
### data cleaning ###
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    tweet = re.sub ('#', '', tweet)
    return tweet

def remove_emojis(tweet):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', tweet)

my_stopwords = nltk.corpus.stopwords.words('english')
repeated_words = ['first','last', 'make', '#ocd', '&amp', 'amp', 'non', 'ever', 
                  'got', 'via', 'best', 'row', 'know', 'want', 'depress', '#depression',
                  '#depressio', 'ever', 'start', 'pre', 'add', 'adhd', 'may', 'heard',
                  'know', 'often', 'would','end', 'might', 'xd', 'go', 'wait', 'especially', 
                  'part','current', 'entire', 'think', 'never','listen']
my_stopwords.extend(repeated_words)
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`â€™{|}~â€¢@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet = remove_emojis(tweet)
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

Save the original tweets + the cleaned up tweets.

In [7]:
data['clean_tweet'] = data.text.apply(clean_tweet, True)

In [8]:
data = data.drop(columns="Unnamed: 0")

Turn words into vectors to allow machine process algorithms to process it.

In [9]:
### Vectorize ###

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(data['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

Apply the machine learning algorithm (NMF) to find the main 5 topics in all tweets:

In [10]:
### Apply NMF topic modeling ###
number_of_topics = 10
model_nmf = NMF(n_components=number_of_topics, random_state=0, alpha=.1, l1_ratio=.5)
model_nmf.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=10, random_state=0)

In [11]:
### show topic modeling ###
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        # topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        # for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

Show what words appear most often in the 5 classified topics:

In [12]:
no_top_words =  8
display_topics(model_nmf, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 1 words,Topic 2 words,Topic 3 words,Topic 4 words,Topic 5 words,Topic 6 words,Topic 7 words,Topic 8 words,Topic 9 words
0,il,acupunctur,mn,sleep,depress,depress,learn,ocd,diagnosi,feel
1,depress,depress,depress,parent,anxieti,live,brain,onlin,women,life
2,depressio,mentalhealth,anxieti,find,skype,podcast,adhdawar,mind,decad,diagnos
3,trauma,help,disord,import,onlinetherapi,quot,meet,therapi,childhood,thank
4,post,realli,mentalhealthmatt,start,therapist,lgbtq,teamadhd,anxieti,adhdawarenessmonth,listen
5,chang,new,new,chang,onlinecounsel,nba,frontal,see,treatment,current
6,affect,good,studi,ask,onlinepsychotherapi,radio,lifeinadhd,ocdrecoveri,sign,valid
7,head,find,world,exercis,mentalhealth,kyri,vortex,contact,disord,kitchen
8,loss,research,increas,matter,health,irv,adhdawarenessmonth,overcom,autism,contributor
9,associ,back,pandem,question,help,freepj,planner,mentalhealth,neurodevelopment,mourn


From here, we infer the name of the topic/category based on the 7 most frequent words that the algorithm has identified.

### NO NEW INSIGHTS, THUS USELESS!