In [15]:
import sys
import os 
sys.path.append('../../../Mental_Disorder/3_feature_visualization') # get old tweets library
import age_gender_predictor
from collections import defaultdict
import re
from tabulate import tabulate
from datetime import datetime, timedelta
import math
from pymongo import MongoClient
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

## Regular User

In [2]:
def getLangRatio(cursor):
    lang_ratios = {}
    for tweet in cursor:
        lang = 1 if tweet["lang"] == "en" else 0
        user_id = tweet["user"]["id"]
        if user_id in lang_ratios:
            lang_ratios[user_id].append(lang)
        else:
            lang_ratios[user_id] = [lang]
    for user_id, ratio in lang_ratios.items():
        lang_ratios[user_id] = np.sum(ratio) / len(ratio)
    return lang_ratios

def getUsersTweets(dbName,collectionName, en_threshold=0.9):
    cursor = MongoClient("localhost", 27017)[dbName][collectionName].find()
    lang_ratios = getLangRatio(cursor)

    cursor = MongoClient("localhost", 27017)[dbName][collectionName].find()
    usersTweets = {}
    for tweet in cursor:
        userID = tweet["user"]["id"]
        if lang_ratios[userID] < en_threshold:
            continue
        #Processing emotions from Carlos' API
        emotion =  tweet["emotion"]["groups"][0]["name"]
        if len(tweet["emotion"]["groups"]) > 1:
            emotion_2 = tweet["emotion"]["groups"][1]["name"]
            
        ambiguous = True if tweet['emotion']['ambiguous'] == 'yes' else False
       
        if len(tweet["emotion"]["groups"]) > 1:
            emotion_2 = tweet["emotion"]["groups"][1]["name"]    
        else:
            emotion_2 = None
        if tweet["polarity"] == "positive":
            polarity = 1
        elif tweet["polarity"] == "negative":
            polarity = -1
        else:
            polarity = 0
   
            
        date = tweet["created_at"]
        text = tweet['text']

        if userID not in usersTweets:
            usersTweets[userID] = {}
        if date not in usersTweets[userID]:
            usersTweets[userID][date] = {}
            
        usersTweets[userID][date]['text'] = text
        usersTweets[userID][date]['polarity'] =  polarity
        usersTweets[userID][date]['emotion'] =  emotion
        usersTweets[userID][date]['emotion_2'] =  emotion_2
        usersTweets[userID][date]['ambiguous'] =  ambiguous
    return usersTweets

def timeSeriesTransform(usersEmotions):
    for userID in usersEmotions:
        usersEmotions[userID] = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        usersEmotions[userID]['dt'] = np.zeros(usersEmotions[userID].shape[0],dtype=float)
        usersEmotions[userID].loc[:-1,'dt'] = (usersEmotions[userID].index[1:].values - usersEmotions[userID].index[:-1].values).astype('timedelta64[s]') / np.timedelta64(60, 's')
    return list(usersEmotions.values())

def getHTTPRows(timeSeries):
    count = 0
    patterns = ['http://','https://']
    conditions = timeSeries['text'].str.contains(patterns[0])
    for pattern in patterns[1:]:
        conditions = conditions | timeSeries['text'].str.contains(pattern)

    return conditions.values

def userFilter(group, spam_threshold=0.5,tweets_threshold=100, time_filter = False):    #Spam and inactive user filter
#     to restrict date in latest 8 weeks
    if time_filter == True:
        temp_group=[]
        for timeSeries in group:
            eight_week_period = timeSeries.index[-1]- timedelta(weeks=8)
            temp_group.append(timeSeries[timeSeries.index > eight_week_period])
        group = temp_group
            
    new_group = []
    for timeSeries in group:
        http_rows = getHTTPRows(timeSeries)
        average_http_count = np.sum(http_rows) / timeSeries.shape[0]
        if (average_http_count < spam_threshold) and (timeSeries.shape[0] > tweets_threshold):
            new_group.append(timeSeries)
    return new_group

In [3]:
regular_tweets =  getUsersTweets("eric","regularUser_en_fixed_emotion")
regular_timeSeries = timeSeriesTransform(regular_tweets)
regular_clean = userFilter(regular_timeSeries, time_filter = True)

## Bipolar User

In [4]:
def loadTweets():
    # {username:{int(date):{[(datetime,content,sentiment),...]}}}
    tweets_dict = defaultdict(lambda: defaultdict(lambda:[]))
    with open('../organized/date_sentiment_tweets') as tweets:
        for line in tweets.readlines():
            username, date, datetime, content, sentiment = line.split('\t')
            tweets_dict[username][int(date)].append((datetime, content, sentiment))

    return tweets_dict

def TweetsFormating(tweets_dict, en_threshold=0.9):
    usersTweets = {}
    for user in tweets_dict:
        userID = user
        for date in tweets_dict[user]:
            for tweet_info in tweets_dict[user][date]:
                date, content, polarity = tweet_info
   
            
#         date = tweet["created_at"]
                date = datetime.strptime(str(date), "%Y-%m-%d %H:%M:%S")

                text = content

                if userID not in usersTweets:
                    usersTweets[userID] = {}
                if date not in usersTweets[userID]:
                    usersTweets[userID][date] = {}

                usersTweets[userID][date]['text'] = text
                usersTweets[userID][date]['polarity'] =  int(polarity.strip())
                usersTweets[userID][date]['emotion'] =  None
                usersTweets[userID][date]['emotion_2'] =  None
                usersTweets[userID][date]['ambiguous'] =  True
    return usersTweets

In [5]:
# {username:{int(date):{[(datetime,content,sentiment),...]}}}
bd_tweets_dict = loadTweets()
bd_tweets = TweetsFormating(bd_tweets_dict)
bd_timeSeries = timeSeriesTransform(bd_tweets)
bd_clean = userFilter(bd_timeSeries)

# Daily Tweet Frequence (All & Late)

In [6]:
def getTweetRate(timeSeries):
    total_tweets = timeSeries.shape[0]
    delta_time = np.max(timeSeries.index.values) - np.min(timeSeries.index.values)
    totla_duration = (delta_time).astype('timedelta64[h]') / np.timedelta64(24, 'h')
    return total_tweets / float(totla_duration)



def thirdPronuonDetect(words, matcher=re.compile("@[a-z]+")):
    for word in words:
        if word == "@":
            continue
        elif matcher.search(word):
            return True
    return False

def seriesContains(timeSeries):
    match_function = np.vectorize(thirdPronuonDetect)
    return match_function(timeSeries["text"].str.lower().str.split().values)


# Mention Ratio

In [7]:
def getMentioRate(timeSeries):
    total_tweets = timeSeries.shape[0]
    total_mentions = np.sum(seriesContains(timeSeries))
    return total_mentions / float(total_tweets)

# Frequent Mentions

In [8]:
def getFrequentMentions(timeSeries, lowerbound = 3):
    total_tweets = timeSeries.shape[0]
    friends_mentions = {}
    texts = timeSeries["text"].values
    for text in texts:
        terms = text.strip().split()
        for word in terms:
            if word[0] == '@' and len(word) > 1:
                friends_mentions[word] = friends_mentions.get(word, 0) +1
    frequent_frients = [screen_name for screen_name, mentions in friends_mentions.items() if mentions >= lowerbound]
    return len(frequent_frients)
 

## Unique Mentions

In [9]:

def getUniqueMentions(timeSeries):
    total_tweets = timeSeries.shape[0]
    friends_set = set()
    texts = timeSeries["text"].values
    for text in texts:
        terms = text.strip().split()
        for word in terms:
            if word[0] == '@' and len(word) > 1:
                friends_set.add(word)
    return len(friends_set)

## Trigger

In [10]:
def getSocialFeature_group(group):
    social_features = {"tweets_rate": [],"mention_rate": [],"unique_mentions": [],"frequent_mentions": []}
    for timeSeries in group:
        social_features["tweets_rate"].append(getTweetRate(timeSeries))
        social_features["mention_rate"].append(getMentioRate(timeSeries))
        social_features["unique_mentions"].append(getUniqueMentions(timeSeries))
        social_features["frequent_mentions"].append(getFrequentMentions(timeSeries))
    return social_features

def summaryTable(groups,names, method, style="default", tablefmt = "simple"):
    header = ["category"]
    group_counts = []
    base = method(groups[0])
    base_labels = [0] * len(groups[0])
    contents = []
    for name in names:
        header.append(name + " C")
        header.append(name + " P")
        
        
    for group in groups:
        group_counts.append(method(group))
        
        
        
    if style == "default":
        for category, base_count in base.items():
            content = [category]
            for g,group in enumerate(groups):

                labels = base_labels + ([1]*len(group))
                counts = base_count + group_counts[g][category]
                c, p = spearmanr(labels, counts)

                content.append(c)
                content.append(p)

            contents.append(content)
  
        contents = sorted(contents, key=lambda pair: abs(pair[1]), reverse=True)           
#        
        print(tabulate(contents, headers=header,floatfmt=".2f", tablefmt=tablefmt))
    
    return contents
    

In [11]:
groups = [ regular_clean, bd_clean]
group_names = ["Regular", "Bipolar"]

In [16]:
header = ["tweets_rate", "mention_rate", "unique_mentions", "frequent_mentions"]
contents = []
for i, group in enumerate(groups):
   
    group_name = group_names[i]
    content = [group_name]
    tweets_rate = np.mean([getTweetRate(timeSeries) for timeSeries in group])
    mention_rate = np.mean([getMentioRate(timeSeries) for timeSeries in group])
    unique_mentions = np.mean([getUniqueMentions(timeSeries) for timeSeries in group])
    frequent_mention = np.mean([getFrequentMentions(timeSeries) for timSeries in group])    
    content += [tweets_rate, mention_rate, unique_mentions, frequent_mention]
    contents.append(content)
print("\n======Social Features======\n")


print("\n======Mean Distribution======\n")

print(tabulate(contents, headers=header,))
print("\n======Correlation======\n")
summaryTable(groups,group_names,getSocialFeature_group)





           tweets_rate    mention_rate    unique_mentions    frequent_mentions
-------  -------------  --------------  -----------------  -------------------
Regular        3.39794        0.428638            171.801                   14
Bipolar       13.4388         0.410792           1249.87                   248


category             Regular C    Regular P    Bipolar C    Bipolar P
-----------------  -----------  -----------  -----------  -----------
mention_rate              0.00         1.00        -0.02         0.58
tweets_rate               0.00         1.00         0.39         0.00
frequent_mentions         0.00         1.00         0.45         0.00
unique_mentions           0.00         1.00         0.46         0.00


[['mention_rate', 0.0, 1.0, -0.019033248434176153, 0.58330919427396521],
 ['tweets_rate', 0.0, 1.0, 0.38650308886553408, 4.5299133213256252e-31],
 ['frequent_mentions', 0.0, 1.0, 0.45455536064650964, 1.0393704974252236e-43],
 ['unique_mentions', 0.0, 1.0, 0.45595333329949439, 5.3175753971622844e-44]]

In [17]:
headers = [" ","Tweets Rate","Mention Rate", "Unique Mentions", "Frequent Mentions"]
contents = []
for i, timeSeries in enumerate(bd_clean):
    tweets_rate = getTweetRate(timeSeries)
    mention_rate = getMentioRate(timeSeries)
    unique_mentions = getUniqueMentions(timeSeries)
    frequent_mention = getFrequentMentions(timeSeries)
    content = [i, tweets_rate, mention_rate, unique_mentions, frequent_mention]
    contents.append(content)
print(tabulate(contents, headers=headers))

       Tweets Rate    Mention Rate    Unique Mentions    Frequent Mentions
---  -------------  --------------  -----------------  -------------------
  0       6.81164       0.2595                    238                  117
  1       7.0263        0.212496                  702                  161
  2       2.53227       0.323556                  299                  135
  3       1.46817       0.389901                   45                   15
  4      14.1067        0.839319                  529                  223
  5      22.4799        0.244232                  216                   20
  6       4.89036       0.263087                  357                  134
  7      19.396         0.228978                 1073                  455
  8       5.27919       0.390708                  597                  227
  9      11.039         0.759985                23868                 2727
 10      34.8176        0.490107                 4375                 1237
 11       2.15782       0