In [1]:
# imports
import json, re, time, unicodedata, unidecode, itertools, os
from collections import defaultdict
from datetime import date, datetime, timedelta
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from pattern.nl import sentiment, parse, split
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from pprint import pprint

In [2]:
# files
#MWE
#topics = json.load(open('D:\\4. Data\\Amazones_Forum_Export_JSON\\MWE_topic.json'))
#posts = json.load(open('D:\\4. Data\\Amazones_Forum_Export_JSON\\MWE.json'))
#regular
forums = json.load(open('D:\\4. Data\\Amazones_Forum_Export_JSON\\2017-12-07T13-35-45 _amazones_forums_export.json'))
# zorg voor een nieuwe versie van dit bestand; verkeerd opgeslagen dus je mist het kontje!
topics = json.load(open('D:\\4. Data\\Amazones_Forum_Export_JSON\\2017-12-07T13-36-51_amazones_forum_topics_export.json'))
posts = json.load(open('D:\\4. Data\\Amazones_Forum_Export_JSON\\2017-12-07T13-39-20_amazones_forum_posts_export.json'))
users = json.load(open('D:\\4. Data\\Amazones_Forum_Export_JSON\\2017-12-07T13-39-20_amazones_users_export.json'))

In [3]:
def remove_non_ascii(text):
    """ this function expects a string, and removes non-ascii characters from it """
    return unidecode.unidecode(text)

In [4]:
def cleanup(text):
    """ this function expects a string (post from the BVN/Amazones forum), and returns a cleaner version of it """
    # remove all links, images, quotes, and emailaddresses
    text=re.sub('<a.*?>(.*?)</a>','',text) #remove links
    text=re.sub('(http:|www)\S*','',text) #remove links without markup
    text=re.sub('\[\\\/url\]','',text)
    text=re.sub('<img.*?/>', '',text) #remove images
    text=re.sub('<div class="bb-quote">((\s|\S)*?)</div>','',text) #remove quotes
    text=re.sub('<script.*?>([\S\s]*?)</script>','',text) #remove emailaddresses

    # replace all emoticon-icons
    text=re.sub('<img.*?title="(.*?)".*?/>', '(EMO:\\1)',text) #replace emoticons by textual indicators 

    # replace (most) sideways latin emoticons
    text=re.sub('[^>]:-?(\)|\])','(EMO:smiley)',text)
    text=re.sub(u'☺️','(EMO:smiley)',text)
    text=re.sub('[^>]:-?(\(|\[)','(EMO:sad)',text)
    text=re.sub(';-?(\)|\])','(EMO:wink)',text)
    text=re.sub(r'(:|;|x|X)-?(D)+\b','(EMO:laugh)',text)
    text=re.sub(':-?(/|\\\|\|)','(EMO:frown)',text)
    text=re.sub(r'(:|;)-?(p|P)+\b','(EMO:cheeky)',text)
    text=re.sub('(:|;)(\'|\")-?(\(|\[)','(EMO:cry)',text)
    text=re.sub('\<3+','(EMO:heart)',text)
    text=re.sub(u'❤️','(EMO:heart)',text)
    text=re.sub('((\>:-?(\(|\]))|(\>?:-?@))','(EMO:angry)',text)
    text=re.sub('\>:-?(\)|\])','(EMO:evil)',text)
    text=re.sub(r'(:|;)-?(O|o|0)+\b','(EMO:shock)',text)
    text=re.sub('(:|;)-?(K|k|x|X)','(EMO:kiss)',text)
    # :s
    # :x is eigenlijk geen kus, geloof ik...

    #other important adjustments:
    text=re.sub('m\'?n\s','mijn ',text) # replacing m'n and mn with mijn, so it gets parsed correctly.
    text=re.sub('z\'?n\s','zijn ',text) #replacing z'n and zn with zijn
    text=re.sub('d\'?r\s','haar ',text) #replacing d'r and dr with zijn (only if followed by space, so dr. stays dr.)

    # replace all emoticons (and other things) written between double colons
    text=re.sub(':([a-zA-Z]+):','(EMO:\\1)',text)

    # remove remaining markup
    text=re.sub('</?(ol|style|b|p|em|u|i|strong|br|span|div|blockquote|li)(.*?)/?>','',text)
    text=re.sub('(\[|\]|\{|\})', '',text)

    # separate text from punctuation (may cause double/triple spaces - does not matter at this point)
    text = re.sub('(\.{2,}|/|\)|,|!|\?)','\\1 ',text) # space behind
    text=re.sub('(/|\()',' \\1',text) # space in front
    text=re.sub('(\w{2,})(\.|,)','\\1 \\2 ',text) #space 'between'

    return(remove_non_ascii(text))

In [5]:
def make_P_T_and_D(topics,posts):
    """ this function takes the .json files containing the thread starts and responses, and returns three things:
    [0]: a dictionary with the user-ID as key, and the post as value;
    [1]: a dictionary with the user-ID as key, and the time of posting as a value;
    [2]: a list of all datetimes present in the data (sorted by date, because the .json was already sorted) """
    P = defaultdict(list)
    T = defaultdict(list)
    D = []

    with tqdm(total=len(topics)) as pbar:
        for t in reversed(topics):
            pbar.update(1)
            P[t['Author uid']].append((cleanup(t["Body"]),1))
            T[t['Author uid']].append(t['Post date'])
            D.append(datetime.strptime(t['Post date'], '%d/%m/%Y - %H:%M'))

    with tqdm(total=len(posts)) as pbar:
        for p in reversed(posts):
            pbar.update(1)
            P[p['Auteur-uid']].append((cleanup(p["Body"]),0))
            T[p['Auteur-uid']].append(p['Datum van inzending'])
            D.append(datetime.strptime(p['Datum van inzending'], '%d/%m/%Y - %H:%M'))

    return (P,T,D)

In [6]:
def determine_active_users(include_only = []):
    global over_treshold
    
    if include_only == []:
        userlist = T
    else:
        userlist = include_only
    for user in userlist:
        if len(T[user])<30:
            pass
        else:
            over_treshold.append(user)

In [7]:
def make_binlist(D,timetick=1):
    """ this function takes a list of dates (D), and generates a new list of dates,
    starting at 4:00 AM just before the earliest date in D, and ending at 4:00 just after the latest date in D,
    with fixed timeticks between all dates in the list.
    Optionally, the length of the timetick may be specified (in hours).
    """
    lower = min(D)
    upper = max(D)

    if lower.time()>=datetime.strptime('4:00','%H:%M').time():
        lower = lower.replace(hour = 4, minute = 0)
    else:
        lower = (lower+timedelta(days = -1)).replace(hour=4,minute=0)

    if upper.time()<datetime.strptime('12:00','%H:%M').time():
        upper = upper.replace(hour = 4, minute = 0)
    else:
        upper = (upper+timedelta(days=1)).replace(hour=4,minute=0)

    return([lower + timedelta(hours=x) for x in range(0, 24*(upper-lower).days, timetick)])

In [8]:
# these bodies contain (EMO:...). Many emoticons are actually analysed by pattern's sentiment miner, so see which can be reverted (within-sentence) 
def determine_questionmarks(body, Q=0):
    """ This function counts and returns the number of sentences in the provided input string
    that ends in at least one question mark """
    for sentence in sent_tokenize(body):
        if re.search('\?+', sentence):
            Q+=1
    if len(sent_tokenize(body))!=0:
        return float(Q)/float(len(sent_tokenize(body)))
    else:
        return 0

def determine_sentiment(body):
    """ this funciton determines and returns the average sentiment of sentences in the provided input string.
    It uses the pattern module to do so. Sentiment values may range from -1 to 1. """
    return np.mean([sentiment(sentence)[0] for sentence in sent_tokenize(body)]) 

def determine_subjectivity(body):
    """ This function determines and returns the average subjectivity of sentences in the provided input string.
    It uses the pattern module to do so. Subjectivity values may range from 0 to 1. """
    return np.mean([sentiment(sentence)[1] for sentence in sent_tokenize(body)]) 

def determine_post_length(body):
    """ This function determines and returns the length of the provided input string in sentences.
    It uses the nltk sent_tokenize function to do so. """
    return(len(sent_tokenize(body)))

def determine_sentence_length(body):
    """ This function determines and returns the average length of the sentences in the provided input string in words.
    It uses the nltk word_tokenize function to do so. """
    #word_tokenize also considers interpunction a word
    return np.mean([len(word_tokenize(sentence)) for sentence in sent_tokenize(body)])

def determine_PRoPortion(body,prp = ['PRP','PRP$'],firstperson = ['ik', 'me', 'mij', 'mijn', 'we', 'wij', 'ons', 'onze']):
    """ This function determines and returns the percentage of personal and possessive pronouns
    that occurs in first person (e.g., 'ik', 'wij', 'onze'). It uses pattern's split and parse functions to do so """
    personalpronouns = []
    firstpersonalpronouns = []

    for sentence in split(parse(body)):
        for word in sentence:
            if word.tag in prp:
                personalpronouns.append(word.string)

    for word in personalpronouns:
        if word.lower() in firstperson:
            firstpersonalpronouns.append(word)
    if len(personalpronouns)==0:
        return 0
    else:
        return float(len(firstpersonalpronouns))/float(len(personalpronouns))
    
def determine_death(user,lower):
    if user in deathdict:
        return (deathdict[user]-lower).days
    else:
        return float('nan')

In [9]:
def determine_week_activity(first_date,last_date,bindict):
    """ This function returns a dictionary that has kept track of the activity in week-bins, instead of day-bins.
    It expects two dates, to indicate in between which dates the dictionary should be built,
    and expects a dictionary in which all the user's active times are already stored"""
    for d in range(0, (last_date-first_date).days,7):
        week_start = first_date+timedelta(days=d)
        week_end = first_date+timedelta(days=d+7)
        for date in list(itertools.chain.from_iterable(bindict.values())):
            if week_start<=datetime.strptime(date, '%d/%m/%Y - %H:%M')<week_end:
                weekcountdict[week_start,week_end].append(1)
        if len(weekcountdict[week_start,week_end])==0:
            weekcountdict[week_start,week_end] = []
    return weekcountdict

In [10]:
def determine_past_activity(bindict,index,hours_back=24):
    """ This function returns the number of times a user has been active in the last hours_back hours (default 24h).
    The first hours_back time bins will for now have a value of 0 by default, to keep things easy."""
    if 0<= index-hours_back<=len(bindict):
        past_activity = np.sum([len(bindict[binlist[index-(x+1)],binlist[index+1-(x+1)]]) for x in range(hours_back)])
    else:
        past_activity=0
    return past_activity

In [21]:
def make_deathdict():
    deathdict = dict()
    due_day = ["2/1/2018", "1/3/2016", "5/3/2015", "29/11/2013", "18/01/2018", "12/10/2016", "27/11/2016", "28/2/2015", "9/3/2012", "11/5/2016", "26/4/2016", "12/3/2004", "16/10/2009", "9/11/2009", "6/4/2015", "28/12/2012", "10/04/2014", "27/07/2014", "19/02/2013", "8/5/2014", "2/10/2013", "2/7/2013"]
    user_IDs = ["1845","917","902","2877","5572","4487","3124","905","1683","1143","2126","966","968","940","933","3808","1552","2389","2413","3211","1870","2287"]
    for i,d in enumerate(due_day):
         deathdict[user_IDs[i]] = datetime.strptime(d, '%d/%m/%Y')
    return deathdict

In [11]:
def print_information(user, T, first_date,last_date,weekcount):
    """ this function prints useful basic information on the user's activity.
    It needs quite some input so make sure you've got them all:
    1) user ID, 2) a dictionary containing users-IDs as key, and any activity log as value,
    3,4) the first and last date of activity, and 5) the dictionary that kept track of the week activity. """
    print "User:", user
    print "posted one or more posts in", len(T[user]), "'bins'." 
    print "The first post: ", first_date
    print "The last post: ", last_date
    print "Activity spread over: ", last_date-first_date
    print "The average nr of posts per week: ", np.mean([len(x) for x in weekcount.values()]), "including long times of inactivity."
    print "The average nr of posts in non-empty weeks: ", np.mean([len(x) for x in weekcount.values() if not x==[]])
    print "The range of activity: ", min([len(x) for x in weekcount.values()]), " to ", max([len(x) for x in weekcount.values()]), " posts per week"
    print

In [12]:
PTD = make_P_T_and_D(topics,posts) 
P = PTD[0]
T = PTD[1]
D = PTD[2]

A Jupyter Widget




A Jupyter Widget




In [22]:
# path for saving csv files
path = r"C:\Users\sternheimam\Desktop\my-notebook\user-csvs" 

#---------------------------
# global variables
#---------------------------
# chronological list of (lower) bin boundaries (default time tick = 1 hour)
binlist = make_binlist(D)
deathdict = make_deathdict()

# determine the negative difference for measuring the 'backtrack' feature
neg_diff = 24


#---------------------------
# determine 'relevant' users
#---------------------------
# initiate empty list of users relevant to measure
over_treshold = []
determine_active_users(deathdict) # for practical reasons, right now only user 1144 is selected. If more users desired, simply enter them in list form.

#---------------------------
# go through all users in over_treshold, and do stuff..
#---------------------------
# show the progress, while going through the active users
with tqdm(total=len(over_treshold)) as processbar:
    for user in over_treshold:
        print user,
        processbar.update(1)
        
        #---------------------------
        # initiate some user-specific variables
        #---------------------------
        first_date = 0
        last_date = 0
        inactivity = 0
        
        # all lists starting with csv_ are lists that will eventually contain all values that end up in the csv file
        csv_date = []            #datetime
        csv_sentiment = []       #sentiment value (-1 to 1)
        csv_questionmarks = []   #question mark-ending sentences (float)
        csv_subjectivity = []    #subjectivity value (0 to 1)
        csv_sentencelength = []  #length of sentence in words (float)
        csv_postlength = []      #length of post in sentences (float)
        csv_startposts = []      #1 for a thread start, 0 for a response (float) 
        csv_inactivity = []      #hours passed since last activity (int)
        csv_backtrack = []       #posts posted in last x hours (x = neg_diff; default 24h)
        csv_firstpersonalpronouns = [] #first-person personal or possessive pronouns (float)
        csv_death_in_x_days = []
        
        # dictionaries to keep track of activity within certain time bins
        bindict = defaultdict(list)        #bins with size 'timetick' (default 1h), values = post-times 
        postdict = defaultdict(list)       #bins with size 'timetick' (default 1h), values = posts
        metadict = defaultdict(list)       #bins with size 'timetick' (default 1h), values = 1 or 0 (start or response)
        weekcountdict = defaultdict(list)  #bins with size = 7 days, values = '1' for every post
        backtrackdict = defaultdict(list)  #bins with size = neg_diff (default 24h), values = nr of posts in last neg_diff hours

        # emoticons should be part of sentiment miner
        # TO DO: linguistic markers, like adjectives / pronouns, and the diversity of topics / vocabulary
        
        
        #---------------------------
        # loop through the (sorted) list of datetimes, and do stuff..
        #---------------------------
        for index,boundary in enumerate(tqdm(binlist)):
            # determine time bin boundaries for dictionaries
            if index+1>=len(binlist):
                break
            else:
                lower = binlist[index]
                upper = binlist[index+1]
                
                #---------------------------
                # Loop through T, collecting all activity for the selected user
                #---------------------------
                # determine in which time bin the user's activity belongs
                for time in T[user]:
                    if lower<=datetime.strptime(time, '%d/%m/%Y - %H:%M')<upper:
                        bindict[lower,upper].append(time)
                        
                        # and determine how active the user has been in past neg_diff hours
                        past_activity = determine_past_activity(bindict,index,neg_diff)
                        backtrackdict[lower,upper].append(past_activity)
                        
                        # determine the first and last active dates
                        if first_date == 0:
                            first_date = datetime.strptime(time, '%d/%m/%Y - %H:%M')
                            last_date = datetime.strptime(time, '%d/%m/%Y - %H:%M')
                        else:
                            last_date = datetime.strptime(time, '%d/%m/%Y - %H:%M')
                        
                        # split the text in P from the start/response-information
                        body = P[user][T[user].index(time)][0]
                        meta = P[user][T[user].index(time)][1]                        
                        postdict[lower,upper].append(body) 
                        metadict[lower,upper].append(meta)

                # fill up the still-empty places in the dictionary
                if len(bindict[lower,upper])==0:
                    bindict[lower,upper]=[]
                    postdict[lower,upper]=[]
                    metadict[lower,upper]=[]
                    backtrackdict[lower,upper]=[]
                
                #---------------------------
                # Fill csv_feature-lists with values 
                #---------------------------                
                # Treat different posts within same bin 'as one' (concatenate them)
                body = '. '.join(postdict[lower,upper]) #can be empty!
                
                # when then bin is empty, only add 1 to the inactivity feature
                if len(body) == 0:
                    inactivity+=1
                # when the bin is not empty, append a value to all csv_feature-lists (and reset 'inactivity')
                else:    
                    csv_date.append(lower)
                    csv_sentiment.append(determine_sentiment(body))
                    csv_questionmarks.append(determine_questionmarks(body))
                    csv_subjectivity.append(determine_subjectivity(body))
                    csv_sentencelength.append(determine_sentence_length(body))
                    csv_postlength.append(np.mean([determine_post_length(x) for x in postdict[lower,upper]]))
                    csv_startposts.append(np.mean(metadict[lower,upper]))
                    csv_inactivity.append(inactivity)
                    csv_backtrack.append(np.sum(backtrackdict[lower,upper][-1]))
                    csv_firstpersonalpronouns.append(determine_PRoPortion(body))
                    csv_death_in_x_days.append(determine_death(user,lower)) 
                    # only works now because all deathlist people are passed
                    inactivity = 0

        #---------------------------
        # Report the results for this user
        #--------------------------- 
        # determine average activity: over active period, and over only-active weeks 
        weekcount = determine_week_activity(first_date,last_date,bindict)   
        print_information(user, T, first_date,last_date,weekcount)            
        
        
        # put all csv_features into a dataframe
        df = pd.DataFrame({"Date & Time": csv_date, "Sentiment": csv_sentiment, "Questions": csv_questionmarks, 
                           "Subjectivity": csv_subjectivity, "Words/Sentence": csv_sentencelength, 
                           "Sentences/Post": csv_postlength, "First posts": csv_startposts,
                           "Inactivity": csv_inactivity, "Posts in last 24H": csv_backtrack,
                          "PRP1": csv_firstpersonalpronouns, "ETD": csv_death_in_x_days}).dropna()
        
        name = "features_user_"+str(user)+".csv"       
        df.to_csv(os.path.join(path,name),index=False)
print " _ "
print "|_|"

A Jupyter Widget

1143

A Jupyter Widget

Exception in thread Thread-9:
Traceback (most recent call last):
  File "C:\Users\sternheimam\AppData\Local\Continuum\anaconda2\lib\threading.py", line 801, in __bootstrap_inner
    self.run()
  File "C:\Users\sternheimam\AppData\Local\Continuum\anaconda2\lib\site-packages\tqdm\_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Users\sternheimam\AppData\Local\Continuum\anaconda2\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



 User: 1143
posted one or more posts in 76 'bins'.
The first post:  2005-08-13 11:40:00
The last post:  2015-03-30 09:41:00
Activity spread over:  3515 days, 22:01:00
The average nr of posts per week:  0.15109343936381708 including long times of inactivity.
The average nr of posts in non-empty weeks:  1.4074074074074074
The range of activity:  0  to  4  posts per week

3808

A Jupyter Widget

 User: 3808
posted one or more posts in 149 'bins'.
The first post:  2012-04-05 17:37:00
The last post:  2012-11-07 12:30:00
Activity spread over:  215 days, 18:53:00
The average nr of posts per week:  4.806451612903226 including long times of inactivity.
The average nr of posts in non-empty weeks:  6.478260869565218
The range of activity:  0  to  17  posts per week

940

A Jupyter Widget

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


 User: 940
posted one or more posts in 400 'bins'.
The first post:  2004-10-02 16:44:00
The last post:  2009-04-15 19:15:00
Activity spread over:  1656 days, 2:31:00
The average nr of posts per week:  1.6877637130801688 including long times of inactivity.
The average nr of posts in non-empty weeks:  2.8776978417266186
The range of activity:  0  to  13  posts per week

1870

A Jupyter Widget

 User: 1870
posted one or more posts in 114 'bins'.
The first post:  2007-05-29 23:34:00
The last post:  2011-12-21 17:35:00
Activity spread over:  1666 days, 18:01:00
The average nr of posts per week:  0.47478991596638653 including long times of inactivity.
The average nr of posts in non-empty weeks:  2.26
The range of activity:  0  to  7  posts per week

3211

A Jupyter Widget

 User: 3211
posted one or more posts in 68 'bins'.
The first post:  2011-04-19 22:09:00
The last post:  2014-04-15 08:45:00
Activity spread over:  1091 days, 10:36:00
The average nr of posts per week:  0.4358974358974359 including long times of inactivity.
The average nr of posts in non-empty weeks:  1.6585365853658536
The range of activity:  0  to  6  posts per week

2413

A Jupyter Widget

 User: 2413
posted one or more posts in 111 'bins'.
The first post:  2009-03-18 15:38:00
The last post:  2012-09-18 16:35:00
Activity spread over:  1280 days, 0:57:00
The average nr of posts per week:  0.6065573770491803 including long times of inactivity.
The average nr of posts in non-empty weeks:  2.1346153846153846
The range of activity:  0  to  14  posts per week

3124

A Jupyter Widget

 User: 3124
posted one or more posts in 88 'bins'.
The first post:  2011-02-20 16:39:00
The last post:  2016-04-30 11:21:00
Activity spread over:  1895 days, 18:42:00
The average nr of posts per week:  0.3247232472324723 including long times of inactivity.
The average nr of posts in non-empty weeks:  1.5714285714285714
The range of activity:  0  to  5  posts per week

2389

A Jupyter Widget

 User: 2389
posted one or more posts in 546 'bins'.
The first post:  2009-02-16 16:41:00
The last post:  2014-06-22 14:12:00
Activity spread over:  1951 days, 21:31:00
The average nr of posts per week:  1.956989247311828 including long times of inactivity.
The average nr of posts in non-empty weeks:  4.368
The range of activity:  0  to  21  posts per week

917

A Jupyter Widget

 User: 917
posted one or more posts in 884 'bins'.
The first post:  2004-09-05 13:49:00
The last post:  2016-02-11 12:56:00
Activity spread over:  4175 days, 23:07:00
The average nr of posts per week:  1.4807370184254607 including long times of inactivity.
The average nr of posts in non-empty weeks:  3.4
The range of activity:  0  to  19  posts per week

2877

A Jupyter Widget

 User: 2877
posted one or more posts in 407 'bins'.
The first post:  2010-10-09 21:11:00
The last post:  2013-10-29 10:50:00
Activity spread over:  1115 days, 13:39:00
The average nr of posts per week:  2.54375 including long times of inactivity.
The average nr of posts in non-empty weeks:  3.803738317757009
The range of activity:  0  to  19  posts per week

2287

A Jupyter Widget

 User: 2287
posted one or more posts in 145 'bins'.
The first post:  2008-10-29 12:23:00
The last post:  2012-03-25 10:16:00
Activity spread over:  1242 days, 21:53:00
The average nr of posts per week:  0.8146067415730337 including long times of inactivity.
The average nr of posts in non-empty weeks:  2.3015873015873014
The range of activity:  0  to  8  posts per week

933

A Jupyter Widget

 User: 933
posted one or more posts in 354 'bins'.
The first post:  2004-09-24 17:09:00
The last post:  2015-03-29 19:32:00
Activity spread over:  3838 days, 2:23:00
The average nr of posts per week:  0.644808743169399 including long times of inactivity.
The average nr of posts in non-empty weeks:  2.269230769230769
The range of activity:  0  to  12  posts per week

902

A Jupyter Widget

 User: 902
posted one or more posts in 3765 'bins'.
The first post:  2004-09-02 20:16:00
The last post:  2015-01-25 21:07:00
Activity spread over:  3797 days, 0:51:00
The average nr of posts per week:  6.933701657458563 including long times of inactivity.
The average nr of posts in non-empty weeks:  7.5
The range of activity:  0  to  33  posts per week

966

A Jupyter Widget

 User: 966
posted one or more posts in 118 'bins'.
The first post:  2004-10-27 22:51:00
The last post:  2005-02-14 10:28:00
Activity spread over:  109 days, 11:37:00
The average nr of posts per week:  7.375 including long times of inactivity.
The average nr of posts in non-empty weeks:  7.375
The range of activity:  1  to  27  posts per week

1552

A Jupyter Widget

 User: 1552
posted one or more posts in 877 'bins'.
The first post:  2006-07-11 12:39:00
The last post:  2014-02-12 12:49:00
Activity spread over:  2773 days, 0:10:00
The average nr of posts per week:  2.209068010075567 including long times of inactivity.
The average nr of posts in non-empty weeks:  3.334600760456274
The range of activity:  0  to  17  posts per week

905

A Jupyter Widget




KeyboardInterrupt: 

In [68]:
[(sentiment(x),x) for x in sent_tokenize("""hallo!
                                         Deze tekst bevat veel mooie zinnen.
                                         En ook smileys ;D . Ik heb geen idee.
                                         :D We zien wel denk ik!""")]

[((0.0, 0.0), 'hallo!'),
 ((0.7, 1.0), 'Deze tekst bevat veel mooie zinnen.'),
 ((0.25, 1.0), 'En ook smileys ;D .'),
 ((0.0, 0.0), 'Ik heb geen idee.'),
 ((1.0, 1.0), ':D We zien wel denk ik!')]

In [63]:
print len(word_tokenize("Deze zin bevat elf woorden, en een smiley aan het einde ;)"))

14
