**Import libraries and packages**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

**Read the data from the file**

In [2]:
### read data ###
# Load the tweet dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('adhd.csv')

An example of what a tweet looks like:

In [3]:
# Show the text of the tweet stored under index label 4444.
data.loc[4444, 'text']

'Why women may wait decades for an #ADHD diagnosis https://t.co/nu3mAQfVU8'

A sneak-peek at the data:

In [4]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,user_ID,followers_count,user_location,user_description,date,text,hashtags
0,0,1365319367676878849,52,"Sheffield, England","❤Sal / UK\r\r\n🧡Player of games, Master of non...",2021-10-27 12:40:38,Saw GP today and we went through stuff and I t...,
1,1,53638786,1667,UK,27y/o nonbinary creator. They/Them. disabled. ...,2021-10-27 12:40:00,My ADHD Graveyard | Officially Diagnosed and A...,
2,2,1329360990765539329,12,"England, United Kingdom",38. A diagnosis referral is hard in the UK. Se...,2021-10-27 12:37:21,Have there been any studies of adults or child...,"['askADHD', 'ADHD']"
3,3,1242603133450293261,8,"West Hempstead, NY",OrganizeU4Life is making an impact on the live...,2021-10-27 12:35:03,Environmental factors don’t directly cause ADH...,
4,4,1429032961744281602,48,España,"🔞. Mainly NSFW. He/Him. 28. Gay, Single and Lo...",2021-10-27 12:33:08,Here in Spain we celebrate the national day of...,['ADHD']


**Data Cleaning**

1. Remove links
2. Remove '@' and usernames
3. Remove retweet markers ('RT @user')
4. Remove English words that do not contribute to the meaning of the sentence (and, or, while, especially, etc.)
5. Remove words that appear in almost every tweet ('adhd', 'add', etc.)
6. Remove punctuation
7. Remove double-spacing
8. Remove numbers
9. Keep only the stem of each word (e.g. 'awareness' and 'aware' are both reduced to 'awar')
10. Check for words that appear together

In [5]:
### data cleaning ###
def remove_links(tweet):
    '''Remove web links and literal "[link]" placeholders from a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with ``http...`` URLs, ``bit.ly/...`` short links and
        ``[link]`` placeholders removed. Whitespace is left untouched.
    '''
    tweet = re.sub(r'http\S+', '', tweet)  # remove http(s) links
    # The dot is escaped so that only a literal "bit.ly/" prefix matches.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') treats its argument as a set of characters and would
    # eat any of "[", "l", "i", "n", "k", "]" from both ends of the tweet
    # (e.g. 'i like milk' -> ' like m'), so the placeholder is removed as a
    # whole substring instead.
    tweet = tweet.replace('[link]', '')
    return tweet

def remove_users(tweet):
    '''Remove retweet markers, @mentions and '#' symbols from a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text without "RT @user" prefixes and "@user" mentions. Hashtag
        words are kept; only the leading '#' character is dropped.
    '''
    # A handle is any run of letters, digits, underscores or hyphens after
    # '@'. Unlike a pattern that requires a leading letter and two or more
    # characters, this also removes handles such as "@_name", "@9lives", "@a".
    handle = r'@[A-Za-z0-9_-]+'
    tweet = re.sub(r'RT\s' + handle, '', tweet)  # remove retweet marker
    tweet = re.sub(handle, '', tweet)  # remove remaining mentions
    tweet = tweet.replace('#', '')  # drop the '#' symbol, keep the tag word
    return tweet

def remove_emojis(tweet):
    '''Delete every character of `tweet` that falls in the ranges listed below.

    The ranges are deliberately broad: for instance U+24C2..U+1F251 covers far
    more than emoji (CJK text falls inside it and is removed as well).

    Parameters
    ----------
    tweet : str
        Text to filter.

    Returns
    -------
    str
        `tweet` with all matching characters removed.
    '''
    ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # dingbats (repeated)
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # everything outside the BMP
        "\u2640-\u2642",          # gender signs
        "\u2600-\u2B55",
        "\u200d",                 # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",                 # variation selector-16
        "\u3030",
    )
    pattern = re.compile("[" + "".join(ranges) + "]+", re.UNICODE)
    return pattern.sub('', tweet)
# Corpus-specific filler words that are filtered out together with the
# standard English stop words.
repeated_words = ['ever', 'start', 'pre', '&amp', 'amp',
                  'add', 'adhd', 'may', 'heard', 'know',
                  'often', 'would', 'end', 'might', 'xd',
                  'go', 'wait', 'especially', 'part',
                  'current', 'entire', 'think', 'never',
                  'listen']

# Full stop-word list: NLTK's English list followed by the extra words above.
my_stopwords = stopwords.words('english') + repeated_words

# Stemming function applied to each surviving token.
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem

# Characters that are replaced by a space during cleaning.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`’{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    '''Normalise a raw tweet into a space-separated string of stemmed tokens.

    The tweet is stripped of mentions, '#' symbols and links, lower-cased,
    has punctuation, digits and emoji removed, is filtered against
    `my_stopwords` and finally stemmed with `word_rooter`.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    bigrams : bool, default False
        When True, every pair of adjacent tokens is appended to the output as
        an extra "first_second" token.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string when nothing
        survives the filters).
    '''
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + my_punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    tweet = remove_emojis(tweet)

    # split() without an argument splits on any run of whitespace and ignores
    # leading/trailing whitespace. Removing digits and emoji can leave double
    # spaces behind; splitting on a single ' ' would turn those into empty
    # tokens that end up in the output and in the bigrams.
    # NOTE: stop words are matched before stemming, so an inflected form that
    # is not itself listed can still be stemmed into a listed word.
    tokens = [word for word in tweet.split() if word not in my_stopwords]

    # remove_users() already drops '#', so the '#' check is only a guard.
    tokens = [word_rooter(word) if '#' not in word else word for word in tokens]

    if bigrams:
        tokens = tokens + [first + '_' + second
                           for first, second in zip(tokens, tokens[1:])]
    return ' '.join(tokens)

Store the cleaned-up tweets in a new column, alongside the original tweets.

In [6]:
# Add a 'clean_tweet' column with the cleaned version of each tweet; the original 'text' column is kept.
data['clean_tweet'] = data.text.apply(clean_tweet)

In [7]:
# Drop the leftover index column. errors="ignore" keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

Turn the cleaned tweets into word-count vectors so that machine learning algorithms can process them.

In [8]:
### Vectorize ###

# the vectorizer object will be used to transform text to vector form
# max_df=0.9 ignores tokens present in more than 90% of the tweets,
# min_df=25 ignores tokens present in fewer than 25 tweets.
# The token pattern is a raw string so that \w, \d and \S reach the regex
# engine unchanged instead of being parsed as (invalid) string escapes.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(data['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
# get_feature_names() is missing from newer scikit-learn releases, which offer
# get_feature_names_out() instead; use whichever the installed version has.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tf_feature_names = vectorizer.get_feature_names()

Apply the machine learning algorithm (NMF) to find the main 5 topics in all tweets:

In [9]:
### Apply NMF topic modeling ###
number_of_topics = 5
# random_state=0 fixes the seed so that repeated runs give the same factorisation.
# NOTE(review): newer scikit-learn releases appear to replace `alpha` with
# `alpha_W`/`alpha_H` — confirm against the installed version before upgrading.
model_nmf = NMF(n_components=number_of_topics, random_state=0, alpha=.1, l1_ratio=.5)
model_nmf.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=5, random_state=0)

In [10]:
### show topic modeling ###
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing `components_`, iterated as one weight
        vector per topic.
    feature_names : sequence
        Word belonging to each position of a weight vector.
    no_top_words : int
        How many words to list per topic.

    Returns
    -------
    pandas.DataFrame
        One "Topic <i> words" column per topic, with the words ordered from
        the highest to the lowest weight.
    """
    columns = {}
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1]
        best = ranked[:no_top_words]
        columns["Topic %d words" % (topic_idx)] = [str(feature_names[pos]) for pos in best]
    return pd.DataFrame(columns)

Show what words appear most often in the 5 classified topics:

In [11]:
# Number of top-ranked words to list for each topic.
no_top_words =  10
display_topics(model_nmf, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 1 words,Topic 2 words,Topic 3 words,Topic 4 words
0,parent,brain,diagnosi,feel,planner
1,sleep,learn,women,life,adhdawarenessmonth
2,import,adhdawar,decad,diagnos,adhdtwitt
3,find,meet,childhood,thank,mani
4,ask,teamadhd,disord,listen,adhdawar
5,chang,frontal,sign,current,go
6,question,lifeinadhd,neurodevelopment,valid,differ
7,exercis,vortex,treatment,kitchen,adhdperson
8,start,help,option,contributor,start
9,matter,disord,persist,bawl,alreadi


From here, we infer a name for each topic/category based on the 10 highest-weighted words that the algorithm has identified for it.

- **Topic 0 (Lifestyle/Factors)**: Given its top words, such as import (most likely the stem of "important"), exercise, sleep and change, this topic seems to describe contributing factors that we can change or that otherwise impact ADHD.
- **Topic 1 (Scientific Explanation)**: Given the words like brain, learn, frontal, etc., this topic seems to be describing the scientific explanations for ADHD.
- **Topic 2 (Diagnosis)**: Given the words like diagnosis, sign, neurodevelopment, etc., this topic seems to be describing the diagnosis of ADHD.
- **Topic 3 (Experience)**: Given the words like feel, current, thank, etc., this topic seems to be describing people's momentary, personal experiences.
- **Topic 4 (Awareness)**: This topic seems to be gathering a lot of awareness keywords, which we will categorize as simply 'awareness'.

In [12]:
# category 0: Lifestyle/Factors
# category 1: Scientific Explanation
# category 2: Diagnosis
# category 3: Experience
# category 4: Awareness

We add the labels of the topics in our original data:

In [13]:
# Project the count matrix onto the fitted topics; rows are indexed per document and compared across topics below.
doc_topic = model_nmf.transform(tf)

In [14]:
# Human-readable topic names; the entry at position i labels topic index i.
cat_name = ['Lifestyle/Factors', 'Scientific Explanation', 'Diagnosis', 'Experience', 'Awareness']

In [15]:
# Assign every document the topic with the largest weight in its row of
# doc_topic (ties, including all-zero rows, resolve to the lowest index).
classes = list(doc_topic.argmax(axis=1))
# Translate each topic index into its human-readable name.
categories = [cat_name[topic_idx] for topic_idx in classes]

In [16]:
# Attach the topic name ('category') and the topic index ('class') to each tweet.
data['category'] = categories
data['class'] = classes

In [17]:
# Preview the original text next to its cleaned version and the assigned topic.
data[['user_description', 'text', 'hashtags', 'clean_tweet', 'category', 'class']].head()

Unnamed: 0,user_description,text,hashtags,clean_tweet,category,class
0,"❤Sal / UK\r\r\n🧡Player of games, Master of non...",Saw GP today and we went through stuff and I t...,,saw gp today went stuff tick box send referr t...,Diagnosis,2
1,27y/o nonbinary creator. They/Them. disabled. ...,My ADHD Graveyard | Officially Diagnosed and A...,,graveyard offici diagnos hobbi adhdawarenessmo...,Awareness,4
2,38. A diagnosis referral is hard in the UK. Se...,Have there been any studies of adults or child...,"['askADHD', 'ADHD']",studi adult children particip team sport work ...,Awareness,4
3,OrganizeU4Life is making an impact on the live...,Environmental factors don’t directly cause ADH...,,environment factor directli caus least natur a...,Awareness,4
4,"🔞. Mainly NSFW. He/Him. 28. Gay, Single and Lo...",Here in Spain we celebrate the national day of...,['ADHD'],spain celebr nation day peopl bad moment wish ...,Awareness,4


**An example of Categorization in topics**

We pick an arbitrary tweet, namely the one at index 444.

In [18]:
# Original text of the tweet stored under index label 444.
data.loc[444, 'text']

"Really interesting discussion today. But very obvious that so much more needs to be done to raise awareness of #ADHD and the devastating effects it can have on people's Iives. Thank you @ADHDFoundation &amp; @djohnsonmsp https://t.co/SKZPY7KNXj"

It has been assigned to the Awareness category, which seems appropriate given the tweet's content:

In [19]:
# Topic name assigned to the tweet stored under index label 444.
data.loc[444, 'category']

'Awareness'