In [1]:
import math
import os
import time
from collections import defaultdict

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from scipy.stats import norm

from srm.db_manager import connect_to_db, execute_sql



In [2]:
def param_estimate(t_train, T):
    """Estimate the model parameters by running the external ``./rpp`` program.

    The program is invoked through IPython's ``!`` shell escape (so this
    function only works inside IPython/Jupyter) with ``T`` as the first
    command-line argument, followed by every element of ``t_train`` except
    the last one. For accurate calculation, pass the last timestamp of
    ``t_train`` as ``T``.

    Parameters
    ----------
    t_train : sequence
        Training timestamps; all but the last are forwarded to ``./rpp``.
    T : number
        Value forwarded as the first argument of ``./rpp``.

    Returns
    -------
    tuple of float
        ``(l, mu, sigma)`` parsed from the program output, which is expected
        to consist of exactly three tab-separated numbers.

    Raises
    ------
    ValueError
        If the output does not parse into exactly three floats.
    """
    # Tab-separated argument string; it is interpolated into the shell line below.
    t_args = "\t".join(str(x) for x in t_train[:-1])
    # `$T` and `$t_args` are expanded by IPython from the local variables above;
    # `res` receives the captured output lines of the command.
    res = !./rpp  $T $t_args
    # Concatenate the output lines and split on tabs to obtain the three values.
    l, mu, sigma = (float(x) for x in "".join(res).split('\t'))
    return l, mu, sigma

def predict(l, mu, sigma, T, nd, tp) :
    """Predict the cumulative event count at time ``tp``.

    Computes ``cn = (m + nd) * exp(l * (F(tp) - F(T))) - m`` where
    ``F(t) = norm.cdf((log(t) - mu) / sigma)`` and ``m`` is a fixed prior
    constant, then rounds ``cn`` up to the next integer.

    Parameters
    ----------
    l, mu, sigma : float
        Model parameters (as returned by ``param_estimate``).
    T : number
        Positive reference time (end of the training window).
    nd : int
        Event count used as the base of the prediction.
    tp : number
        Positive time at which to predict.

    Returns
    -------
    int
        ``ceil(cn)``, or ``-1`` when the prediction cannot be represented as
        a finite integer (exponential overflow, infinity or NaN).

    Raises
    ------
    ValueError
        If ``tp`` or ``T`` is not positive (raised by ``math.log``).
    """
    m = 10 # Prior belief. Make sure m_m in mic_model.cpp has the same value
    f_tp = norm.cdf((math.log(tp) - mu)/sigma)
    f_T = norm.cdf((math.log(T) - mu)/sigma)
    try:
        # math.exp raises OverflowError for huge exponents; ceil/int raise
        # OverflowError for infinity and ValueError for NaN. All of these mean
        # "no usable prediction", which callers detect through the -1 sentinel.
        cn = (m + nd)*math.exp((f_tp - f_T)*l) - m
        return int(math.ceil(cn))
    except (OverflowError, ValueError):
        return -1
    
def mape(ta, tp):
    """Mean absolute percentage error between actual and predicted values.

    ``ta`` holds the actual values and ``tp`` the predicted ones; only the
    first ``len(ta)`` entries of ``tp`` are read. The result is the mean of
    ``|actual - predicted| / actual`` as a fraction (not multiplied by 100).
    """
    count = len(ta)
    total_relative_error = 0
    for idx in range(count):
        actual, predicted = ta[idx], tp[idx]
        total_relative_error += abs(actual - predicted)/float(actual)
    return total_relative_error/count


In [3]:
db_name, table = "srm", "BigBillionDay"
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The defaults
# reproduce the original hard-coded values.
# NOTE(review): the arguments are presumably (host, user, password) - confirm
# against srm.db_manager.connect_to_db.
cursor_mysql, conn = connect_to_db(
    os.environ.get("SRM_DB_HOST", "localhost"),
    os.environ.get("SRM_DB_USER", "root"),
    os.environ.get("SRM_DB_PASSWORD", "root"),
)

Compute the data for the graph of MAPE versus time elapsed after the last training timestamp.

In [4]:
# Ids of tweets that are present in the table themselves and have more than
# 50 rows referencing them as parent, most-referenced first.
parent_tweets = [row[0] for row in execute_sql("select parent_id_str from srm.BigBillionDay where parent_id_str in \
(SELECT  distinct id_str from srm.BigBillionDay) \
 group by parent_id_str having count(*) >50 order by count(*) desc;")]
# {tweet id: [(seconds after the training cut-off, real count, predicted count), ...]}
prediction_dict = defaultdict(list)

for parent_id in parent_tweets :
    retweet_times = [row[0] for row in execute_sql("select created_at from srm.BigBillionDay where parent_id_str \
                                                            =%s order by created_at", parent_id)]
    parent_created_time = [row[0] for row in execute_sql("select created_at from srm.BigBillionDay where id_str \
                                                            =%s", parent_id)]
    created_times = parent_created_time #Stores time at which the tweet and retweets are created
    created_times.extend(retweet_times)

    # Seconds elapsed since the first entry (the parent tweet). To avoid errors
    # in calculation, as suggested in the paper, a constant 1 is added to the
    # time moments.
    origin = time.mktime(created_times[0].timetuple())
    t_original = [int(time.mktime(ts.timetuple()) - origin) + 1 for ts in created_times]

    #Training
    split_limit = int(len(t_original)*.7) #70% for training 30 % for testing
    t_train, t_test = t_original[0:split_limit], t_original[split_limit:]
    T = t_train[-1]
    l, mu, sigma = param_estimate(t_train, T)

    #Calculation of MAPE
    nd = len(t_train[:-1])

    for offset, tp in enumerate(t_test):
        te = predict(l, mu, sigma, T, nd, tp)
        if te == -1 :
            # predict() signals an unusable prediction with -1; skip it.
            continue
        # 1-based position of this event in t_original (the parent tweet is
        # position 1). Derived from the loop offset rather than
        # t_original.index(tp), which returns the FIRST occurrence and so
        # under-counts whenever several events share the same timestamp.
        real_count = split_limit + offset + 1
        # defaultdict(list) creates the list on first access.
        prediction_dict[parent_id].append((tp-T, real_count, te))


In [76]:
# Inspect the predictions stored for one tweet (the third key of the dict).
# list(...) keeps the indexing valid where dict.keys() is a view, and the
# single-argument print(...) form prints the same text under Python 2 and 3.
sample_key = list(prediction_dict.keys())[2]
print(prediction_dict[sample_key])
print(sample_key)
print(len(prediction_dict[sample_key]))

[(14, 187, 186), (213, 188, 187), (694, 189, 191), (1482, 190, 196), (1502, 191, 197), (1521, 192, 197), (1551, 193, 197), (1888, 194, 200), (2246, 195, 202), (2393, 196, 203), (3670, 197, 213), (4024, 198, 216), (4080, 199, 217), (4361, 200, 219), (4582, 201, 221), (4708, 202, 222), (4902, 203, 223), (5733, 204, 230), (6653, 205, 238), (6927, 206, 241), (7031, 207, 242), (7294, 208, 244), (7620, 209, 247), (7631, 210, 247), (7675, 211, 247), (7824, 212, 249), (7834, 213, 249), (7914, 214, 249), (8407, 215, 254), (9307, 216, 262), (9798, 217, 267), (10181, 218, 270), (10979, 219, 278), (11093, 220, 279), (11740, 221, 285), (11929, 222, 287), (12579, 223, 293), (12771, 224, 295), (13481, 225, 303), (13558, 226, 303), (13561, 227, 303), (14152, 228, 309), (14343, 229, 311), (14520, 230, 313), (14570, 231, 314), (15095, 232, 319), (15174, 233, 320), (15266, 234, 321), (15734, 235, 326), (16285, 236, 332), (16528, 237, 335), (17669, 238, 347), (18273, 239, 354), (18474, 240, 357), (18932, 

In [68]:
# Pool the results of all tweets: xp collects the real counts (tuple field 1)
# and xe the predicted counts (tuple field 2) stored in prediction_dict.
xp, xe = list(), list()
for k in prediction_dict:
    xp.extend(record[1] for record in prediction_dict[k])
    xe.extend(record[2] for record in prediction_dict[k])
# Sanity check: both lists must have one entry per stored prediction.
print(len(xp) == len(xe))
# mape(actual, predicted) over every stored prediction.
print(mape(xp, xe))

True
8.15550603756e+51
