In [19]:
import sys
import os 
sys.path.append('../../../Mental_Disorder/3_feature_visualization') # get old tweets library
import age_gender_predictor
from collections import defaultdict
import re
from tabulate import tabulate
from datetime import datetime, timedelta
import math
from pymongo import MongoClient
import numpy as np
from scipy.stats import spearmanr
import pandas as pd

## Regular User

In [2]:
def getLangRatio(cursor):
    lang_ratios = {}
    for tweet in cursor:
        lang = 1 if tweet["lang"] == "en" else 0
        user_id = tweet["user"]["id"]
        if user_id in lang_ratios:
            lang_ratios[user_id].append(lang)
        else:
            lang_ratios[user_id] = [lang]
    for user_id, ratio in lang_ratios.items():
        lang_ratios[user_id] = np.sum(ratio) / len(ratio)
    return lang_ratios

def getUsersTweets(dbName,collectionName, en_threshold=0.9):
    cursor = MongoClient("localhost", 27017)[dbName][collectionName].find()
    lang_ratios = getLangRatio(cursor)

    cursor = MongoClient("localhost", 27017)[dbName][collectionName].find()
    usersTweets = {}
    for tweet in cursor:
        userID = tweet["user"]["id"]
        if lang_ratios[userID] < en_threshold:
            continue
        #Processing emotions from Carlos' API
        emotion =  tweet["emotion"]["groups"][0]["name"]
        if len(tweet["emotion"]["groups"]) > 1:
            emotion_2 = tweet["emotion"]["groups"][1]["name"]
            
        ambiguous = True if tweet['emotion']['ambiguous'] == 'yes' else False
       
        if len(tweet["emotion"]["groups"]) > 1:
            emotion_2 = tweet["emotion"]["groups"][1]["name"]    
        else:
            emotion_2 = None
        if tweet["polarity"] == "positive":
            polarity = 1
        elif tweet["polarity"] == "negative":
            polarity = -1
        else:
            polarity = 0

        date = tweet["created_at"]
        
        text = tweet['text']

        if userID not in usersTweets:
            usersTweets[userID] = {}
        if date not in usersTweets[userID]:
            usersTweets[userID][date] = {}
            
        usersTweets[userID][date]['text'] = text
        usersTweets[userID][date]['polarity'] =  polarity
        usersTweets[userID][date]['emotion'] =  emotion
        usersTweets[userID][date]['emotion_2'] =  emotion_2
        usersTweets[userID][date]['ambiguous'] =  ambiguous
    return usersTweets

def timeSeriesTransform(usersEmotions):
    for userID in usersEmotions:
        usersEmotions[userID] = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        usersEmotions[userID]['dt'] = np.zeros(usersEmotions[userID].shape[0],dtype=float)
        usersEmotions[userID].loc[:-1,'dt'] = (usersEmotions[userID].index[1:].values - usersEmotions[userID].index[:-1].values).astype('timedelta64[s]') / np.timedelta64(60, 's')
    return list(usersEmotions.values())

def getHTTPRows(timeSeries):
    count = 0
    patterns = ['http://','https://']
    conditions = timeSeries['text'].str.contains(patterns[0])
    for pattern in patterns[1:]:
        conditions = conditions | timeSeries['text'].str.contains(pattern)

    return conditions.values

def userFilter(group, spam_threshold=0.5,tweets_threshold=100):    #Spam and inactive user filter
    new_group = []
    for timeSeries in group:
        http_rows = getHTTPRows(timeSeries)
        average_http_count = np.sum(http_rows) / timeSeries.shape[0]
        if (average_http_count < spam_threshold) and (timeSeries.shape[0] > tweets_threshold):
            new_group.append(timeSeries)
    return new_group

In [3]:
regular_tweets =  getUsersTweets("eric","regularUser_en_fixed_emotion")


In [4]:
regular_timeSeries = timeSeriesTransform(regular_tweets)
regular_clean = userFilter(regular_timeSeries)

In [5]:
regular_clean[0]

Unnamed: 0,polarity,text,emotion_2,emotion,ambiguous,dt
2012-12-29 18:20:56,0,Still pissed.😒,0,anger,False,82.100000
2012-12-29 19:43:02,0,I'm going to be the bigger person and not talk...,0,disgust,False,539.033333
2012-12-30 04:42:04,-1,Stressing too much..way too much.. 😳😥,fear,disgust,False,48.416667
2012-12-30 05:30:29,1,"Did you ever stop and think ""hey, by me sleepi...",disgust,sadness,False,1333.816667
2012-12-31 03:44:18,-1,I can't take this..,anger,sadness,False,182.216667
2012-12-31 06:46:31,0,Nothings easy anymore. You're either happy or ...,fear,sadness,False,647.016667
2012-12-31 17:33:32,-1,Not drinking tonight.. This will be hard as he...,disgust,anticipation,False,0.516667
2012-12-31 17:34:03,0,You ain't lying. Who does that shit?? #swervee...,0,disgust,False,39.016667
2012-12-31 18:13:04,1,"Party at Cj's tonight, if you need directions ...",fear,sadness,False,316.983333
2012-12-31 23:30:03,1,@iamkaitieb @rachelleanne195 Aweee I love you((:,trust,joy,False,1.783333


## BD

In [6]:
def loadTweets():
    # {username:{int(date):{[(datetime,content,sentiment),...]}}}
    tweets_dict = defaultdict(lambda: defaultdict(lambda:[]))
    with open('../organized/date_sentiment_tweets') as tweets:
        for line in tweets.readlines():
            username, date, datetime, content, sentiment = line.split('\t')
            tweets_dict[username][int(date)].append((datetime, content, sentiment))

    return tweets_dict

In [7]:
def TweetsFormating(tweets_dict, en_threshold=0.9):
    usersTweets = {}
    for user in tweets_dict:
        userID = user
        for date in tweets_dict[user]:
            for tweet_info in tweets_dict[user][date]:
                date, content, polarity = tweet_info
   
            
#         date = tweet["created_at"]
                date = datetime.strptime(str(date), "%Y-%m-%d %H:%M:%S")

                text = content

                if userID not in usersTweets:
                    usersTweets[userID] = {}
                if date not in usersTweets[userID]:
                    usersTweets[userID][date] = {}

                usersTweets[userID][date]['text'] = text
                usersTweets[userID][date]['polarity'] =  int(polarity.strip())
                usersTweets[userID][date]['emotion'] =  None
                usersTweets[userID][date]['emotion_2'] =  None
                usersTweets[userID][date]['ambiguous'] =  True
    return usersTweets

In [8]:
# {username:{int(date):{[(datetime,content,sentiment),...]}}}
bd_tweets_dict = loadTweets()

In [9]:
bd_tweets = TweetsFormating(bd_tweets_dict)


In [10]:
bd_timeSeries = timeSeriesTransform(bd_tweets)

In [11]:

bd_clean = userFilter(bd_timeSeries)

In [34]:
groups = [ regular_clean, bd_clean]
group_names = ["Regular", "Bipolar"]

# Positive/ Negative Ratio

In [42]:
def getFlipsCount(timeSeries, upperbound=60, lowerbound = 0):
    flips = getFlips(timeSeries)
    durations = getFlipsDuration(timeSeries, flips)
    return np.sum((durations > lowerbound) & (durations < upperbound) )



def getFlips(timeSeries, attribute= 'polarity'):
    flips = np.zeros(timeSeries.shape[0],dtype=bool)
    polarity = timeSeries[attribute].values[:-1]
    right_elements = timeSeries[attribute].values[1:]
    flips[:-1] = (polarity * right_elements) < 0
    return flips


def getFlipsDuration(timeSeries, flips):
    filtered_timeSeries = timeSeries['dt'][flips].index.values
    dt = np.zeros(filtered_timeSeries.shape[0],dtype=float)
    dt[:-1] = (filtered_timeSeries[1:] - filtered_timeSeries[:-1]).astype('timedelta64[s]') / np.timedelta64(60, 's')
    return dt

def getCombosCount(timeSeries, matcher = -1, lowerbound = 2):
    combos = comboTracker(timeSeries)
    combos_count = sum([hit for element, hit in combos if element == matcher and hit > lowerbound])
    return combos_count

def comboTracker(timeSeries, attribute= "polarity", lowerbound = 120):
    array = timeSeries[attribute]
    starter = array[0]
    combo = 1
    result = []
    i = 0 
    for cursor in array[1:]:
        i += 1
        if starter == cursor and timeSeries["dt"][i-1] < lowerbound:
            combo += 1
        else:
            if combo > 1:
                result.append((starter, combo))
            starter = cursor
            combo = 1
    if combo > 1:
         result.append((starter, combo))
    return result

def getNegativeRatio(timeSeries):
    total_tweets = timeSeries.shape[0]
    return np.sum(timeSeries["polarity"].values == -1) / float(total_tweets)


def getPositiveRatio(timeSeries):
    total_tweets = timeSeries.shape[0]
    return np.sum(timeSeries["polarity"].values == 1) / float(total_tweets)





def getPolarity(group):
    polarity = {"flips":[],"negative_combos":[],"positive_combos":[], "positive_ratio":[], "negative_ratio":[]}
    for timeSeries in group:
        try:
            tweets_length = float(timeSeries.shape[0])
        except:
            print 'error'
            print timeSeries
            break
        flips_ratio = getFlipsCount(timeSeries) / tweets_length
        negative_combos_ratio = getCombosCount(timeSeries,matcher=-1) / tweets_length
        positive_combos_ratio = getCombosCount(timeSeries,matcher=1) / tweets_length
        positive_ratio = getPositiveRatio(timeSeries)
        negative_ratio = getNegativeRatio(timeSeries)
        
        polarity["flips"].append(flips_ratio)
        polarity["negative_combos"].append(negative_combos_ratio)
        polarity["positive_combos"].append(positive_combos_ratio)
        polarity["positive_ratio"].append(positive_ratio)
        polarity["negative_ratio"].append(negative_ratio)
        
    return polarity

In [48]:
def summaryTable(groups,names, method, style="default", tablefmt = "simple"):
    header = ["category"]
    group_counts = []
    base = method(groups[0])
    base_labels = [0] * len(groups[0])
    contents = []
    for name in names:
        header.append(name + " C")
        header.append(name + " P")
        
        
    for group in groups:
        group_counts.append(method(group))
        
        
        
    if style == "default":
        for category, base_count in base.items():
            content = [category]
            for g,group in enumerate(groups):

                labels = base_labels + ([1]*len(group))
                counts = base_count + group_counts[g][category]
                c, p = spearmanr(labels, counts)

                content.append(c)
                content.append(p)

            contents.append(content)
  
        contents = sorted(contents, key=lambda pair: abs(pair[1]), reverse=True)           
#        
        print(tabulate(contents, headers=header,floatfmt=".2f", tablefmt=tablefmt))
    
    return contents
    

In [49]:
summaryTable(groups, group_names, getPolarity)

headers = ["Group","Positive Ratio Mean","Positive Ratio STD", "Negative Ratio Mean", "Negative Ratio STD"]
contents = []
for i, group in enumerate(groups):
    group_name = group_names[i]
   
    positive_ratios = [getPositiveRatio(timeSeries) for timeSeries in group]
    negative_ratios = [getNegativeRatio(timeSeries) for timeSeries in group]
    positive_ratio_mean = np.mean(positive_ratios)
    negative_ratio_mean = np.mean(negative_ratios)
    positive_ratio_std = np.std(positive_ratios)
    negative_ratio_std = np.std(negative_ratios)

    contents.append([group_name, positive_ratio_mean, positive_ratio_std, negative_ratio_mean, negative_ratio_std])
print("\n=========\n")
print(tabulate(contents, headers=headers))

category           Regular C    Regular P    Bipolar C    Bipolar P
---------------  -----------  -----------  -----------  -----------
negative_combos         0.00         1.00         0.42         0.00
positive_ratio          0.00         1.00        -0.44         0.00
flips                   0.00         1.00         0.12         0.00
positive_combos         0.00         1.00        -0.22         0.00
negative_ratio          0.00         1.00         0.25         0.00


Group      Positive Ratio Mean    Positive Ratio STD    Negative Ratio Mean    Negative Ratio STD
-------  ---------------------  --------------------  ---------------------  --------------------
Regular               0.230047             0.141908                0.10991              0.0701825
Bipolar               0.102002             0.0552154               0.158715             0.0690471
