<a href="https://colab.research.google.com/github/abhilashhn1993/Sentiment_Analysis_of_Tweets/blob/master/AnchoredCorexApproach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install corextopic

In [0]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import logging
# Only log records at ERROR level or above are emitted, each prefixed with
# a timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [0]:
# Colab-only helper: prompts for local files to upload. The result is looked
# up by file name in the next cell, which expects 'cleanedPreTweets.csv' and
# 'cleanedPostTweets.csv' to be among the uploaded files.
from google.colab import files
uploaded = files.upload()

In [0]:
import io

# Parse the two uploaded CSV payloads straight from memory: the
# pre-diagnosis tweets first, then the post-diagnosis tweets.
pre_buffer = io.BytesIO(uploaded['cleanedPreTweets.csv'])
df_pre = pd.read_csv(pre_buffer)

post_buffer = io.BytesIO(uploaded['cleanedPostTweets.csv'])
df_post = pd.read_csv(post_buffer)

In [0]:
# Quick look at the first five rows of the pre-diagnosis tweets.
df_pre.head(n=5)

In [0]:
# Remove every row that has a missing value in any column. The surviving
# rows keep their original index labels (the index is not reset here).
df_pre = df_pre.dropna(axis=0, how="any")
df_post = df_post.dropna(axis=0, how="any")

**PTSD Pre-diagnosis Topics**

**Vectorizing the texts**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Although the class is TfidfVectorizer, idf weighting, normalisation and
# sublinear tf are all switched off and binary=True, so the output is a
# 0/1 "term present" matrix over unigrams and bigrams.
vectorizer = TfidfVectorizer(
    # Vocabulary selection: unigrams + bigrams that occur in at least 10
    # documents and in at most half of them; no cap on vocabulary size.
    ngram_range=(1, 2),
    min_df=10,
    max_df=.5,
    max_features=None,
    # Weighting: presence/absence only.
    binary=True,
    use_idf=False,
    sublinear_tf=False,
    norm=None,
)

In [0]:
# Learn the vocabulary from the pre-diagnosis tweets and encode them in one
# pass (equivalent to fit() followed by transform() on the same data).
tfidf = vectorizer.fit_transform(df_pre['Tweets'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back on older releases.
# `.tolist()` gives a plain list of Python strings, as the old method did.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))

In [0]:
from corextopic import corextopic as ct

In [0]:
# Baseline: CorEx topic model with 4 latent topics and a fixed seed, fitted
# without anchor words.
anchors = []  # not passed to the model in this cell
pre_model = ct.Corex(n_hidden=4, seed=42).fit(tfidf, words=vocab)

In [81]:
# Print up to ten terms per topic, keeping only the entries whose second
# field (the score returned by get_topics) is positive.
for topic_no, entries in enumerate(pre_model.get_topics(n_words=10), start=1):
    kept_terms = []
    for entry in entries:
        if entry[1] > 0:
            kept_terms.append(entry[0])
    print("Topic #{}: {}".format(topic_no, ", ".join(kept_terms)))

Topic #1: disabled, folk, disabled people, game, issue, straw, accessibility, disability, mental, community
Topic #2: people, say, feel, friend, never, talk, good friend, always, tell, shit
Topic #3: time, bad, sleep, home, next, wake, hour, away, forward, week
Topic #4: use, think, work, lot, take, start, care, way, also, may


In [0]:
# Anchored CorEx: each inner list seeds one of the four topics with
# hand-picked terms. A term only acts as an anchor if it is spelled exactly
# like an entry of `vocab`.
# NOTE(review): inflected forms such as "flashbacks" or "eating" may not
# survive the tweet cleaning step; verify them against `vocab`.
anchors1 = [
    # CAUSES
    ["trauma", "accident", "flashbacks", "abuse", "assault", "disabled",
     "break up", "sexual assault", "harassment", "torture", "war", "battle"],
    # FEELINGS AND EMOTIONS
    ["sad", "alone", "bad", "hurt", "feel", "pain", "scared", "angry",
     "fuck", "shit", "yell", "lost"],
    # PHYSICAL SYMPTOMS ("lethargic" was previously misspelled "lathargic")
    ["nightmare", "sleep", "nervous", "tired", "fatigue", "disorder",
     "eating", "nausea", "hunger", "lethargic"],
    # MENTAL SYMPTOMS
    ["depressed", "anxiety", "insomnia", "stress", "anxious", "cry",
     "suicide", "no mood"],
]
# Same topic count and seed as the unanchored model; the anchor lists and
# anchor_strength=5 are the only additions.
pre_model2 = ct.Corex(n_hidden=4, seed=42)
pre_model2 = pre_model2.fit(
    tfidf,
    words=vocab,
    anchors=anchors1,
    anchor_strength=5
)

In [79]:
# List the top terms of each anchored pre-diagnosis topic, dropping entries
# whose score (second field) is not positive.
for topic_no, scored_terms in enumerate(pre_model2.get_topics(n_words=10), start=1):
    positive_terms = [scored[0] for scored in scored_terms if scored[1] > 0]
    header = "Topic #{}: {}"
    print(header.format(topic_no, ", ".join(positive_terms)))

Topic #1: disabled, abuse, people, trauma, assault, folk, disabled people, war, issue, sexual assault
Topic #2: feel, bad, shit, fuck, pain, sad, hurt, alone, angry, scared
Topic #3: sleep, tired, disorder, nervous, nightmare, fall asleep, asleep, eat disorder, fall, want sleep
Topic #4: cry, anxiety, stress, suicide, anxious, depressed, good friend, friend, illness, good


**Creating Dataframe with Topics as features**

In [0]:
# One column per anchored topic (topic_1 .. topic_4), holding the model's
# per-document transform output cast to float.
topic_columns = ["topic_{}".format(k) for k in range(1, 5)]
topic_matrix = pre_model2.transform(tfidf)
topic_df = pd.DataFrame(topic_matrix, columns=topic_columns).astype(float)

In [0]:
# Give the topic features the same row labels as the cleaned tweets so the
# column-wise concat pairs the rows up one-to-one.
topic_df.index = df_pre.index
df = pd.concat([df_pre, topic_df], axis="columns")

# Discard the left-most column.
# NOTE(review): presumably a saved index column from the CSV; confirm.
leading_column = df.columns[0]
df = df.drop(columns=leading_column)

# Reproducible random sample of ten rows for a visual check.
df.sample(n=10, random_state=123)

In [0]:
# Persist the pre-diagnosis tweets plus topic features (the index is written too).
df.to_csv(path_or_buf='pre_diagnosis_topicModeled.csv')

**PTSD Post-diagnosis Topics**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate vectorizer for the post-diagnosis tweets, using the same settings
# as the pre-diagnosis one: binary presence features over unigrams and
# bigrams, with no idf weighting and no normalisation.
vectorizer2 = TfidfVectorizer(
    # Which terms enter the vocabulary.
    ngram_range=(1, 2),
    min_df=10,
    max_df=.5,
    max_features=None,
    # How each term is weighted.
    binary=True,
    use_idf=False,
    sublinear_tf=False,
    norm=None,
)

In [0]:
# Learn the vocabulary from the post-diagnosis tweets and encode them in one
# pass (equivalent to fit() followed by transform() on the same data).
tfidf_post = vectorizer2.fit_transform(df_post['Tweets'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back on older releases.
# `.tolist()` gives a plain list of Python strings, as the old method did.
if hasattr(vectorizer2, "get_feature_names_out"):
    vocab_post = vectorizer2.get_feature_names_out().tolist()
else:
    vocab_post = vectorizer2.get_feature_names()

In [0]:
# Baseline for the post-diagnosis tweets: CorEx with 5 latent topics and a
# fixed seed, fitted without anchor words.
anchors_post = []  # not passed to the model in this cell
post_model = ct.Corex(n_hidden=5, seed=42).fit(tfidf_post, words=vocab_post)

In [80]:
# Print the leading terms of every unanchored post-diagnosis topic; entries
# with a non-positive score (second field) are skipped.
for position, topic_entries in enumerate(post_model.get_topics(n_words=10)):
    shown = ", ".join(item[0] for item in topic_entries if item[1] > 0)
    print("Topic #{}: {}".format(position + 1, shown))

Topic #1: people, think, take, could, try, accessibility, folk, tell, next, want
Topic #2: time, feel, tonight, work, first time, first, ill, home, week, long
Topic #3: good, game, disabled, forward, good friend, online, hard, look forward, surf, friend
Topic #4: shit, ass, look, fuck, girl, old, talk, little, dumb, hair
Topic #5: video, song, music, add, ticket, free


In [0]:
# Anchor terms for the post-diagnosis model, one list per topic. A term
# only acts as an anchor if it is spelled exactly like a vocabulary entry.
# The vectorizer lowercases its input and its default tokenizer splits on
# hyphens, so anchors are written lowercase and bigrams with a space.
# Fixed here: "lathargic" -> "lethargic", "Prazosin" -> "prazosin",
# "anti-depressants" -> "anti depressants".
anchors2 = [
    # SYMPTOMS
    ["no sleep", "flashbacks", "tired", "nightmare", "anxiety", "chronic",
     "fatigue", "hunger", "disorder", "lethargic", "no mood"],
    # FEELINGS AND EMOTIONS
    ["sad", "lonely", "bad", "hurt", "feel", "anger", "pain", "fuck", "shit",
     "lost", "unhappy", "angry", "cry"],
    # THERAPY AND SUPPORT
    ["therapy", "community", "doctor", "support", "session", "drug", "pills",
     "mental health", "care", "medication", "anti depressants", "celexa",
     "prazosin"],
    # COPING MECHANISMS
    ["exercise", "meditate", "art", "feel better", "yoga", "work out", "gym",
     "healthy", "food", "diet", "lose weight", "work", "music"],
]

In [0]:
post_model2 = ct.Corex(n_hidden=4, seed=42)
post_model2 = post_model2.fit(
    tfidf,
    words=vocab,
    anchors=anchors2,
    anchor_strength=5
)

In [82]:
for i, topic_ngrams in enumerate(post_model2.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: tired, anxiety, chronic, disorder, time, good friend, forward, good, look forward, first time
Topic #2: feel, bad, shit, fuck, cry, pain, sad, hurt, angry, lonely
Topic #3: care, support, doctor, community, therapy, people, drug, disabled, mental health, folk
Topic #4: work, food, music, art, healthy, lose weight, feel better, exercise, diet, gym


**Creating Dataframe with Topics as features**

In [0]:
topic_df2 = pd.DataFrame(
    post_model2.transform(tfidf), columns=["topic_{}".format(i+1) for i in range(4)]
    ).astype(float)

In [0]:
df2 = pd.concat([df_post, topic_df2], axis=1)
df2 = df2.drop(df2.columns[0], axis=1)
df2.sample(10, random_state=123)

In [0]:
# Persist the post-diagnosis tweets plus topic features (the index is written too).
df2.to_csv(path_or_buf='post_diagnosis_topicModeled.csv')

In [0]:
from google.colab import files

# Hand both generated CSV files to the browser for download, pre-diagnosis first.
for csv_name in ('pre_diagnosis_topicModeled.csv', 'post_diagnosis_topicModeled.csv'):
    files.download(csv_name)