<a href="https://colab.research.google.com/github/abhilashhn1993/Sentiment_Analysis_of_Tweets/blob/master/AnchoredCorexApproach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install spacy
!pip install pyLDAvis
!pip install corextopic

In [0]:
import re
import numpy as np
import pandas as pd
import spacy
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import logging
# Only emit log records at ERROR level or above, using a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [0]:
from google.colab import files
# Ask the user to upload the input files through Colab. The result is later
# indexed by file name to obtain each file's raw bytes.
uploaded = files.upload()

In [0]:
import io

# File names that must be present among the uploaded files.
pre_csv_name = 'cleanedPreTweets.csv'
post_csv_name = 'cleanedPostTweets.csv'

# Fail early with an actionable message rather than a bare KeyError on lookup.
missing_uploads = [name for name in (pre_csv_name, post_csv_name) if name not in uploaded]
if missing_uploads:
    raise KeyError(
        "Missing uploaded file(s): {}. Uploaded file(s): {}".format(
            ", ".join(missing_uploads),
            ", ".join(sorted(uploaded)) or "<none>"
        )
    )

# Each uploaded entry holds the file's raw bytes; wrap it in an in-memory
# buffer so pandas can parse it like a file.
df_pre = pd.read_csv(io.BytesIO(uploaded[pre_csv_name]))
df_post = pd.read_csv(io.BytesIO(uploaded[post_csv_name]))

In [0]:
# Preview the first rows of the pre-diagnosis frame as a quick sanity check of the load.
df_pre.head()

In [0]:
# Remove every row that has a missing value in any column, for both tweet sets.
df_pre, df_post = (frame.dropna() for frame in (df_pre, df_post))

**PTSD Pre-diagnosis Topics**

**Vectorizing the texts**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# With idf weighting and normalisation disabled and binary=True, every tweet is
# encoded as 0/1 presence indicators over unigrams and bigrams.
pre_vectorizer_options = {
    "max_df": .5,            # drop n-grams present in more than half of the tweets
    "min_df": 10,            # drop n-grams present in fewer than 10 tweets
    "max_features": None,    # no cap on vocabulary size
    "ngram_range": (1, 2),   # unigrams and bigrams
    "norm": None,
    "binary": True,
    "use_idf": False,
    "sublinear_tf": False,
}
vectorizer = TfidfVectorizer(**pre_vectorizer_options)

In [0]:
# Learn the n-gram vocabulary from the pre-diagnosis tweets and encode them in one pass.
tfidf = vectorizer.fit_transform(df_pre['Tweets'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() yields plain Python strings, matching what the old method returned.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))

In [0]:
from corextopic import corextopic as ct

In [0]:
# Unsupervised baseline: CorEx with 4 latent topics and no anchor words.
anchors = []  # intentionally empty; this run passes no anchors to the model
pre_model = ct.Corex(n_hidden=4, seed=42).fit(tfidf, words=vocab)

In [90]:
# Print the top 10 n-grams of each unsupervised pre-diagnosis topic, keeping only
# entries whose second tuple field is positive.
for topic_no, ranked in enumerate(pre_model.get_topics(n_words=10), start=1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print("Topic #{}: {}".format(topic_no, ", ".join(kept)))

Topic #1: people, disabled, say, folk, disabled people, think, straw, travel, mile, understand
Topic #2: time, game, work, take, issue, mental, lot, first, forward, accessibility
Topic #3: feel, chroniclife, bad, pain, good friend, sleep, morning, friend, good, hurt
Topic #4: new, write, ill, week, read, next, book, hopefully, sure, show


In [0]:
# Anchored (semi-supervised) CorEx: one anchor word group per intended topic.
pre_cause_anchors = ["trauma","accident","assault","disabled"]                  # causes
pre_feeling_anchors = ["sad","alone","bad","hurt","feel","good","pain"]         # feelings and emotions
pre_symptom_anchors = ["anxiety", "nightmare","sleep","nervous", "tired"]       # symptoms, physical and mental
pre_mindset_anchors = ["depressed","fuck","shit","lost","mental","die"]         # behaviour and state of mind
anchors1 = [
    pre_cause_anchors,
    pre_feeling_anchors,
    pre_symptom_anchors,
    pre_mindset_anchors,
]

# Same topic count and seed as the unsupervised run, now steered by the anchors.
pre_model2 = ct.Corex(n_hidden=4, seed=42).fit(
    tfidf, words=vocab, anchors=anchors1, anchor_strength=5
)

In [92]:
# Show the top 10 positively scored n-grams for each anchored pre-diagnosis topic.
for number, topic in enumerate(pre_model2.get_topics(n_words=10), 1):
    words = ", ".join(item[0] for item in topic if item[1] > 0)
    print("Topic #{}: {}".format(number, words))

Topic #1: disabled, people, folk, assault, trauma, disabled people, issue, say, accessibility, think
Topic #2: feel, good, bad, pain, sad, hurt, alone, good friend, feel bad, look good
Topic #3: sleep, tired, anxiety, nervous, nightmare, look forward, forward, social, fall asleep, social medium
Topic #4: shit, fuck, die, mental, depressed, illness, mental health, mental illness, health, feel shit


**Creating Dataframe with Topics as features**

In [0]:
# One column per anchored topic, named topic_1 .. topic_4.
pre_topic_columns = ["topic_{}".format(k) for k in range(1, 5)]
# Per-tweet topic labels from the anchored model, stored as floats.
pre_topic_labels = pre_model2.transform(tfidf)
topic_df = pd.DataFrame(pre_topic_labels, columns=pre_topic_columns).astype(float)

In [0]:
# Give the topic rows the same index labels as df_pre so the column-wise concat
# pairs each tweet with its own topic row.
topic_df.index = df_pre.index
df = pd.concat(objs=[df_pre, topic_df], axis="columns")
# Reproducible random sample of 10 rows for inspection.
df.sample(n=10, random_state=123)

In [0]:
# Save the pre-diagnosis tweets with their topic columns to CSV.
# Note: to_csv writes the index as an extra first column by default.
df.to_csv('pre_diagnosis_topicModeled.csv') 

**PTSD Post-diagnosis Topics**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Post-diagnosis vectorizer: binary presence of unigrams/bigrams, with no idf
# weighting and no normalisation.
vectorizer2 = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=10,            # ignore n-grams seen in fewer than 10 tweets
    max_df=.5,            # ignore n-grams seen in more than half of the tweets
    max_features=None,    # keep the full remaining vocabulary
    binary=True,
    use_idf=False,
    sublinear_tf=False,
    norm=None
)

In [0]:
# Learn the n-gram vocabulary from the post-diagnosis tweets and encode them in one pass.
tfidf_post = vectorizer2.fit_transform(df_post['Tweets'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vectorizer2, 'get_feature_names_out'):
    # .tolist() yields plain Python strings, matching what the old method returned.
    vocab_post = vectorizer2.get_feature_names_out().tolist()
else:
    vocab_post = vectorizer2.get_feature_names()

In [0]:
# Unsupervised baseline for the post-diagnosis tweets: 7 latent topics, no anchors.
anchors_post = []  # intentionally empty; this run passes no anchors to the model
post_model = ct.Corex(n_hidden=7, seed=42).fit(tfidf_post, words=vocab_post)

In [117]:
# Print the top 10 n-grams of each unsupervised post-diagnosis topic, skipping
# entries whose second tuple field is not positive.
for position, candidates in enumerate(post_model.get_topics(n_words=10)):
    selected = []
    for candidate in candidates:
        if candidate[1] > 0:
            selected.append(candidate[0])
    print("Topic #{}: {}".format(position + 1, ", ".join(selected)))

Topic #1: accessibility, folk, could, mental, health, help, issue, also, community, medium
Topic #2: people, disabled, forward, next, video, look forward, surf, online, disabled people, surf online
Topic #3: feel, game, first, hard, try, work, first time, well, use, sheet
Topic #4: think, never, say, tell, want, shit, fuck, give, talk, girl
Topic #5: good, time, take, good friend, friend, hour, long, fall, good morning, asleep
Topic #6: home, eat, birthday, night, sleep, tomorrow, happy, tonight, happy birthday, come
Topic #7: life, abuse, ass, live, pain, whole, chronic, attention, reason, ptsd


In [0]:
# Anchor word groups for the post-diagnosis model, one list per intended topic.
post_symptom_anchors = ["no sleep","tired","nightmare","anxiety","chronic"]
post_feeling_anchors = ["sad","lonely","bad","hurt","feel","anger","pain"]
post_medication_anchors = ["drug","pill","antidepressant"]
post_therapy_anchors = ["therapy","community","doctor","support","session"]
post_help_anchors = ["help","need","want","love","share","talk","tell"]
post_improvement_anchors = ["positive","motivate","glad","accept","self","better"]

anchors2 = [
    post_symptom_anchors,       # symptoms
    post_feeling_anchors,       # feelings and emotions
    post_medication_anchors,    # medications and drugs
    post_therapy_anchors,       # therapy and support
    post_help_anchors,          # call for help
    post_improvement_anchors,   # self-improvement
]

In [0]:
post_model2 = ct.Corex(n_hidden=6, seed=42)
post_model2 = post_model2.fit(
    tfidf,
    words=vocab,
    anchors=anchors2,
    anchor_strength=3
)

In [120]:
for i, topic_ngrams in enumerate(post_model2.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: tired, anxiety, chroniclife, chronic, mental, game, sleep, illness, nightmare, social
Topic #2: feel, bad, pain, sad, hurt, lonely, feel bad, anger, feel good, feel well
Topic #3: time, drug, friend, pill, take, good, say, never, good friend, forward
Topic #4: support, people, disabled, doctor, community, therapy, folk, disabled people, issue, accessibility
Topic #5: love, want, need, tell, help, talk, share, thank love, need new, people tell
Topic #6: better, self, glad, accept, positive, lose, come home, weight, wish could, lose weight


**Creating Dataframe with Topics as features**

In [0]:
topic_df2 = pd.DataFrame(
    post_model2.transform(tfidf), columns=["topic_{}".format(i+1) for i in range(6)]
    ).astype(float)

In [128]:
df2 = pd.concat([df_post, topic_df2], axis=1)
df2.sample(10, random_state=123)

Unnamed: 0.1,Unnamed: 0,Username,Tweets,Length of Tweet,date,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
23861,23862.0,ActiDermKimbo,ago tell,131.0,Tue Dec 13 02:02:01 +0000 2016,,,,,,
16000,16001.0,therapyhotline,kid drink water consume energy drink soda conc...,148.0,Fri Sep 07 16:00:20 +0000 2018,0.0,0.0,0.0,0.0,0.0,0.0
30376,30377.0,XoxoModel_B,officially daughter start preschool excited,98.0,Mon Oct 07 19:31:25 +0000 2019,,,,,,
16707,16708.0,sheeranazz,yess omg,8.0,Mon Jul 16 17:53:09 +0000 2018,0.0,0.0,0.0,0.0,0.0,0.0
4427,4428.0,traapcedes,love watch long,38.0,Mon Sep 02 22:32:44 +0000 2019,0.0,0.0,0.0,0.0,0.0,0.0
247,,,,,,0.0,0.0,0.0,0.0,0.0,0.0
13242,,,,,,0.0,0.0,0.0,0.0,0.0,0.0
15794,15795.0,therapyhotline,attracted guy look garden twin,63.0,Tue Oct 30 18:11:31 +0000 2018,0.0,0.0,0.0,0.0,0.0,0.0
8207,8208.0,TherealIntraKit,lithium dude talk,87.0,Tue Apr 09 08:57:52 +0000 2019,0.0,0.0,0.0,0.0,0.0,0.0
1932,1933.0,Daneuntamed,find,36.0,Mon Sep 02 00:11:01 +0000 2019,1.0,1.0,1.0,0.0,0.0,0.0


In [0]:
# Save the post-diagnosis tweets with their topic columns to CSV.
# Note: to_csv writes the index as an extra first column by default.
df2.to_csv('post_diagnosis_topicModeled.csv') 

In [0]:
from google.colab import files

# Download both exported topic-model CSVs from the Colab runtime, pre-diagnosis first.
for exported_csv in ('pre_diagnosis_topicModeled.csv', 'post_diagnosis_topicModeled.csv'):
    files.download(exported_csv)