In [1]:
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as md
%matplotlib notebook
import numpy as np
from srm.db_manager import connect_to_db, execute_sql

In [2]:
db_name, table = "srm", "BigBillionDay"

### Changes in retweet count over time
- We consider only those tweets that have more than 50 retweets

In [29]:
cursor_mysql, conn = connect_to_db("localhost", "root", "root")
parent_tweets = [row[0] for row in execute_sql("select parent_id_str from srm.BigBillionDay where parent_id_str in \
(SELECT  distinct id_str from srm.BigBillionDay) \
 group by parent_id_str having count(*) >50 order by count(*) desc;")]
retweet_counter = dict()
for parent_tweet in  parent_tweets [0:8] :
    timestamps, retweet_count = list(), list()
    cursor_mysql.execute("select min(created_at), max(created_at) from %s.%s where parent_id_str =%s"\
                         %(db_name, table, parent_tweet))
    min_date, max_date = cursor_mysql.fetchall()[0]
    print parent_tweet, min_date
    min_time_stamp = min_date
    max_time_stamp = min_date+datetime.timedelta(hours = 1) #both for looping
    while(max_time_stamp <= max_date):
        sql = "select count(*) from %s.%s where parent_id_str ='%s'\
                    and created_at >= '%s' and created_at < '%s';"
        cursor_mysql.execute(sql %(db_name, table, parent_tweet,str(min_time_stamp),str(max_time_stamp)))
        tweet_count = int(cursor_mysql.fetchall()[0][0])
        timestamps.append(min_time_stamp)
        retweet_count.append(tweet_count)
        min_time_stamp, max_time_stamp = max_time_stamp, max_time_stamp+datetime.timedelta(hours = 1)
    retweet_counter[parent_tweet] = (timestamps, retweet_count)   
conn.close()

519047798604328961 2014-10-06 08:54:39
519569635553718272 2014-10-07 19:27:22
519015438387122176 2014-10-06 06:45:15
519091490547445760 2014-10-06 11:48:08
519045394068545536 2014-10-06 08:44:13
519142153008586752 2014-10-06 15:09:44
518992246079639552 2014-10-06 05:13:09
518986819040329728 2014-10-06 04:51:28


In [69]:
print retweet_counter.keys()
tweet_id = '519047798604328961'

plt.xticks( rotation=50 )
ax=plt.gca()
#ax.set_xticks(dates)
xfmt = md.DateFormatter('%d %H:%M')
ax.xaxis.set_major_formatter(xfmt)
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9

plt.xlabel('Tweeted timestamp', fontsize=15)
plt.ylabel('Number of retweets', fontsize=15)
plt.title("id="+tweet_id, fontsize=15)

plt.plot(retweet_counter[tweet_id][0], retweet_counter[tweet_id][1])
plt.show()

['519091490547445760', '519045394068545536', '519015438387122176', '519569635553718272', '519142153008586752', '518986819040329728', '518992246079639552', '519047798604328961']


<IPython.core.display.Javascript object>

### Reinforcement Mechanism
Method taken from "Modeling and Predicting Retweeting Dynamics on
Microblogging Platforms"

select distinct parent_id_str from srm.BigBillionDay as TableA where parent_id_str in 
(SELECT  distinct id_str from srm.BigBillionDay) 
and (select count(*) from srm.BigBillionDay
	where parent_id_str = TableA.parent_id_str
	and created_at between '2014-10-06 06:00:00' and '2014-10-06 07:00:00' 
	group by parent_id_str)>0;

In [49]:
cursor_mysql, conn = connect_to_db("localhost", "root", "root")
min_time, max_time = '2014-10-06 06:00:00', '2014-10-06 07:00:00'

parent_tweets = [row[0] for row in execute_sql("select distinct parent_id_str from srm.BigBillionDay \
where parent_id_str in (SELECT  distinct id_str from srm.BigBillionDay) \
and created_at <'%s'", (min_time))]
#print parent_tweets

print "Number of tweets in this window:", len(parent_tweets)
windowed_retweets = dict()
for tweet in parent_tweets:
    resultset = execute_sql("select count(*) from srm.BigBillionDay \
    where parent_id_str = '%s' and created_at < '%s' group by parent_id_str", (tweet, min_time))
    if len(resultset)>0 :
        retweets_before_window = int(resultset[0][0])
    else:
        retweets_before_window = 0
    resultset = execute_sql("select count(*) from srm.BigBillionDay \
    where parent_id_str = '%s' and created_at between \
    '%s' and '%s'", (tweet, min_time, max_time)) 
    retweets_in_window = int(resultset[0][0])
    if retweets_in_window == 0: #Discard the ones that have no retweets
        continue
    #print tweet, retweets_before_window, retweets_in_window
    if windowed_retweets.has_key(retweets_before_window):
        windowed_retweets[retweets_before_window] = (windowed_retweets[retweets_before_window][0]+\
                                            retweets_in_window, windowed_retweets[retweets_before_window][1]+1)
    else:
        windowed_retweets[retweets_before_window] = (retweets_in_window, 1)
    
for retweet_count in windowed_retweets.keys():
    windowed_retweets[retweet_count] = \
        windowed_retweets[retweet_count][0]/float(windowed_retweets[retweet_count][1])
    #print retweet_count, windowed_retweets[retweet_count]
print "Number of tweets with retweets in the window", len(windowed_retweets.keys())

Number of tweets in this window: 1806
Number of tweets with retweets in the window 56


#### Plot it

In [50]:
fig,ax = plt.subplots()
x = windowed_retweets.keys()
y = windowed_retweets.values()
#Plot data and bestfit line seperately
data_plot = ax.scatter(x, y, label = "Data")
bestfit_line = ax.plot(x, np.poly1d(np.polyfit(x, y, 1))(x), label='Best fit line', linestyle=':', c = "green")

legend = ax.legend(loc='upper right', fontsize = 11)

plt.xlabel('$\Phi$', fontsize=15)
plt.ylabel('$\phi$', fontsize=15)

# plt.ylim([0, max(y)])
# plt.xlim([0, max(x)])

plt.title("Time window: %s to %s"%(min_time, max_time) , fontsize=12)

plt.show()

<IPython.core.display.Javascript object>