In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans

from nltk.corpus import stopwords, wordnet

import pandas as pd
import numpy as np

import contractions
import string
import re

## Preprocessing

In [2]:
twitter_df = pd.read_csv('../data/sentiment_tweets3.csv')
twitter_df.rename(columns = {'message to examine': 'text', 'label (depression result)': 'target'}, inplace = True)

In [3]:
twitter_df['depressed'] = np.where(twitter_df['target'] == 1, 'positive', 'negative')

In [4]:
twitter_df.depressed.value_counts(normalize=True)

negative    0.775645
positive    0.224355
Name: depressed, dtype: float64

In [5]:
punc_and_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
replace_amps = lambda x: re.sub(r'&amp;', 'and', x)
fix_contraction = lambda x: contractions.fix(x)

In [6]:
# remove links in text
twitter_df['text_without_links'] = twitter_df.text.map(fix_contraction).replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True).replace(r'pic.twitter\S+', '', regex=True)

In [7]:
# Remove html ampersands
twitter_df.text_without_links = twitter_df.text_without_links.map(replace_amps)

In [8]:
# Remove Mentions and strip extra spaces
twitter_df.text_without_links = twitter_df.text_without_links.replace(r'@\S+', '', regex=True).str.strip()

In [9]:
twitter_df['text_clean'] = twitter_df.text_without_links.map(punc_and_lower)

In [10]:
twitter_df = twitter_df[twitter_df.text_clean != ' ']
twitter_df

Unnamed: 0,Index,text,target,depressed,text_without_links,text_clean
0,106,just had a real good moment. i missssssssss hi...,0,negative,just had a real good moment. i missssssssss hi...,just had a real good moment i missssssssss hi...
1,217,is reading manga http://plurk.com/p/mzp1e,0,negative,is reading manga,is reading manga
3,288,@lapcat Need to send 'em to my accountant tomo...,0,negative,Need to send them to my accountant tomorrow. ...,need to send them to my accountant tomorrow ...
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0,negative,ADD ME ON MYSPACE!!! myspace.com/LookThunder,add me on myspace myspace com lookthunder
5,624,so sleepy. good times tonight though,0,negative,so sleepy. good times tonight though,so sleepy good times tonight though
...,...,...,...,...,...,...
10309,802309,No Depression by G Herbo is my mood from now o...,1,positive,No Depression by G Herbo is my mood from now o...,no depression by g herbo is my mood from now o...
10310,802310,What do you do when depression succumbs the br...,1,positive,What do you do when depression succumbs the br...,what do you do when depression succumbs the br...
10311,802311,Ketamine Nasal Spray Shows Promise Against Dep...,1,positive,Ketamine Nasal Spray Shows Promise Against Dep...,ketamine nasal spray shows promise against dep...
10312,802312,dont mistake a bad day with depression! everyo...,1,positive,do not mistake a bad day with depression! ever...,do not mistake a bad day with depression ever...


In [11]:
# define stop words
stop_words = ENGLISH_STOP_WORDS.union(['tweet']).union(['twitter']).union(['tweeting']).union(stopwords.words('english'))

In [12]:
# for i in np.arange(0.0001, 0.001, 0.00005):
#     vectorizer = TfidfVectorizer(min_df=i)
#     vectorizer.fit(X_train)
#     print(f'min_df of {i} - {len(vectorizer.vocabulary_)}')

In [13]:
vectorizer = TfidfVectorizer(min_df=.00015)
vectorizer.fit_transform(twitter_df.text_clean)
len(vectorizer.vocabulary_)

6071

In [14]:
# for i in np.arange(0.1, 0.001, -0.001):
#     vectorizer = TfidfVectorizer(max_df=i, min_df=.00015)
#     vectorizer.fit(X_train)
#     print(f'max_df of {i} - {len(vectorizer.vocabulary_)}')

In [15]:
vectorizer = TfidfVectorizer(min_df=.00015, max_df=.002, stop_words=stop_words, ngram_range=(1,2))
tfidf = vectorizer.fit_transform(twitter_df.text_clean)
len(vectorizer.vocabulary_)

8999

## Splitting Dataset

In [16]:
# Split the data into X and y data sets
X = twitter_df.text_clean
y = twitter_df.depressed

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf, y, test_size=0.2, random_state=42)

In [17]:
X_train

<8250x8999 sparse matrix of type '<class 'numpy.float64'>'
	with 29148 stored elements in Compressed Sparse Row format>

## NMF

In [18]:
corpus = twitter_df['text_clean'].fillna(value='')

In [19]:
tweet_word_matrix = vectorizer.fit_transform(corpus)
vocab = vectorizer.get_feature_names()

In [20]:
nmf = NMF(n_components=10)
nmf.fit(tweet_word_matrix)



NMF(n_components=10)

#### Tweet/Topic Matrix

In [21]:
tweet_topic_matrix = nmf.transform(tweet_word_matrix)

In [22]:
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')
tweet_topic_matrix_df[['raw_tweets', 'clean_tweets']] = twitter_df[['text', 'text_without_links']]
tweet_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,raw_tweets,clean_tweets
0,2.964109e-73,0.0,1.691991e-07,1.1e-05,7e-05,5.9e-05,0.000128,2.5e-05,0.000381,2.5e-05,just had a real good moment. i missssssssss hi...,just had a real good moment. i missssssssss hi...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,is reading manga http://plurk.com/p/mzp1e,is reading manga
2,0.0,0.0,4.583727e-05,0.000231,0.008414,8e-05,7.3e-05,3.7e-05,0.002048,0.000258,,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@lapcat Need to send 'em to my accountant tomo...,Need to send them to my accountant tomorrow. ...
4,6.397429e-73,1.940263e-07,1.693954e-05,0.000392,0.00074,0.001042,0.005609,0.000869,0.003225,0.000831,ADD ME ON MYSPACE!!! myspace.com/LookThunder,ADD ME ON MYSPACE!!! myspace.com/LookThunder


#### Word/Topic Matrix

In [23]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
word_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
00,5.919006e-74,0.0,0.0002527802,0.000278,0.007205,0.000845,0.001123,0.000271,0.002511,0.000545
000,2.5264560000000003e-75,1.777343e-08,3.146217e-05,0.000192,0.000321,0.000214,0.000418,0.000107,0.001406,0.000654
00pm,0.0,0.0,6.053372e-05,0.000228,0.005968,0.000304,0.0,9.5e-05,0.000236,4.5e-05
03,8.649428e-74,0.0,1.606757e-07,2.2e-05,9e-06,6.4e-05,4e-05,1.4e-05,0.000107,2.5e-05
04,7.139604e-76,0.0,6.224868e-05,0.00029,0.001692,0.000402,0.001366,0.001097,0.001814,0.000228


## Topic Interpretation


In [24]:
def top_tweets(tweet_topic_matrix_df, topic, n_tweets):
    return (tweet_topic_matrix_df.sort_values(by=topic, ascending=False).head(n_tweets)['clean_tweets'].values)

In [25]:
def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df.sort_values(by=topic, ascending=False).head(n_words))[topic]

In [26]:
def check_topics():
    for val in range(tweet_topic_matrix.shape[1]):
        print(top_words(word_topic_matrix_df, f'topic_{val}', 20))

In [27]:
def describe_topic(num):
    print(top_words(word_topic_matrix_df, f'topic_{num}', 10), '\n')

    for tweet in top_tweets(tweet_topic_matrix_df, f'topic_{num}', 40):
        print(tweet)

    print('-----------------------------','\n')

In [28]:
for val in range(2):
    describe_topic(val)

train pay        9.457416e-01
add train        9.457416e-01
using add        9.457416e-01
100 followers    9.457416e-01
pay vip          9.457416e-01
sister           3.647684e-70
mum              6.081316e-71
watchin          5.303918e-71
spending         4.694313e-71
hang             4.481862e-71
Name: topic_0, dtype: float64 

I agree! I am sick of hackers everywhere
must not chase the boys is playing on my ipod right now. no joke.  play was the besttt
just saw hangover funniest movie i have seen in awhile! highly recommened
Yay I got a PT on my Scholar.
have amber give you a little massage...she is taking after her mom
I was just statin facts
Yeah a got quite a bit .. just need a few more bits for the evening like jewellery oh and a need a bag to
wooo hooooo..... Golden Tequila rocks!!!   n so does amrutanjal!!!
going to ikea
Hope Kelly gets better
That album is the business.
- a this pic is cute
Swing Night was great! hahaha  its my last swing night!!  :'(
For with God nothing sha

In [29]:
for val in range(2,4):
    describe_topic(val)

puff                 0.833174
puff cannabis        0.765754
ease                 0.737033
cannabis ease        0.700253
ease depression      0.700253
depression stress    0.476321
stress anxiety       0.437861
anxiety sun          0.181241
easeâ depression     0.140483
cannabis easeâ       0.140483
Name: topic_2, dtype: float64 

#PhysicalActivity protects against #depression onset:
i want to make fun of locals for always talking about post concert depression..... but that shit............. is actually so real
She definitely showed signs of alcoholism, but it was potentially triggered by her suffering from Postpartum Depression.
Exercise lowers risk of depression risk at all ages, researchers find   â¦
Depression and sore throat, halaloya.
Just one puff of cannabis 'could ease depression, stress and anxiety' - The Sun
Depression is no jokePlease help those in need   â¦
Anyways got a date with my baby to watch Infinity War tonight! leggo! kick that depression ass!
math is 60% the beca

In [30]:
for val in range(4,6):
    describe_topic(val)

exercising              0.582118
cuts risk               0.559528
cuts                    0.553408
16                      0.548010
regularly               0.506914
regularly cuts          0.502395
exercising regularly    0.502395
16 study                0.502395
depression 16           0.502395
study suggests          0.460450
Name: topic_4, dtype: float64 

Nobody with depression wants to be depressed and nobody just tunes out what others offer for help, but it is so much harder for them to use the help from being in the state of mind that they are in.   â¦
i need my depression to chill a bit because i havet done any art in over a month wtf
My review for Avengers: Infinity War10/10 epic fight.10/10 great cinematography.10/10 bop sound effects.10/10 great slight comedy.10/10 best one from Marvel.10/10 will not watch it again, prob, myb, bcs for me it is 10/10 traumatic and 10/10 gave me temporary depression :)
i got a mattress topper and now my crippling depression is not the *only* 

In [31]:
for val in range(6,8):
    describe_topic(val)

bored                1.935830
text                 0.279624
crap                 0.272418
number               0.216472
ages                 0.155342
updates              0.152869
wat                  0.144121
need know            0.129620
poppin               0.122123
depression making    0.120407
Name: topic_6, dtype: float64 

yep! that is what i was thinking! good choice! he can promote it, then!
Cocktails + balcony  +scorching day = heaven
haii you go to germany?
does Jackie  have a passport? may i  borrow her?  need to lose those last stubborn 50 pounds of baby weight--
just posted a new blog @  - check it out for news about a big wedding coming up  -- and sales!! GO GO GO!
Well I am glad you are sis persuaded you otherwise. Welcome aboard
Charlie Bucket Leisure is napping...of course.
Hey cashier is cool, I love fastfood! Haha. Any discount para yo? XD. And yes, they better release ZG before the summer tour!
This lovely morning = X-Men  and earring making marathon. Fun, fun, fun

In [32]:
for val in range(8,10):
    describe_topic(val)

loudly                0.835146
emoji loudly          0.832513
loudly crying         0.832513
emoji heavy           0.328914
heavy red             0.319106
red heart             0.319106
agree                 0.281954
heart emoji           0.211757
concert depression    0.178005
post concert          0.170374
Name: topic_8, dtype: float64 

Had a nice enough start to my day and then BAM, my depression has decided it is time for a reminder of how little I am worth and why I should just give up <Emoji: Pensive face>
i use medical marijuana.  i have bipolar disorder.  i still take all my psych meds because cannabis does not address bipolar disorder or depression.  your nephew needs a clue.
2 National headlining Acts Next month..I wish I could go tell my 7th grade self that all the bullying, the sleepless nights the depression etc.  was all worth it and that it gets easier and the growth has been incredible My life is a fucking blessing but NOTHING comes overnight
Fuck depression, it should

## K-Means Clustering

In [33]:
from sklearn.metrics.pairwise import euclidean_distances, pairwise_distances_argmin_min

In [34]:
num_clusters = 10
km = KMeans(n_clusters = num_clusters, random_state = 10, n_init = 10) # n_init, number of times the K-mean algorithm will run
km.fit(tweet_topic_matrix)

closest, prob = pairwise_distances_argmin_min(km.cluster_centers_, tweet_topic_matrix)

for idx in closest:
    print(tweet_topic_matrix_df.iloc[idx]['raw_tweets'])
    print(tweet_topic_matrix_df.iloc[idx])
    print()

Everybody should come out to drink today to drink 
topic_0                                                       0.0
topic_1                                                       0.0
topic_2                                                  0.000007
topic_3                                                  0.000365
topic_4                                                  0.000359
topic_5                                                  0.001126
topic_6                                                  0.000628
topic_7                                                  0.000757
topic_8                                                  0.002924
topic_9                                                    0.0014
raw_tweets      Everybody should come out to drink today to dr...
clean_tweets    Everybody should come out to drink today to drink
Name: 3092, dtype: object

RT @420weedin: Could 'one puff' of cannabis ease depression? #marijuana #cannabis http://bit.ly/2HNEtbSÂ 
topic_0                 

In [35]:
# for idx in closest:
#     print(idx, '-', tweet_topic_matrix_df.iloc[idx]['raw_tweets'])