This notebook is the executionable version of Seeding bandits. It performs the following:
1. Data import and preprocessing;
2. Estimate creator level expected reward per receiver type;
3. Displays creator level behavior (per creator distribution of actions per receiver type)


In [9]:
#Run parameters

path_dir = r"/Users/../Volumes/Raw/"
music_before = 1500 # all creators kept, regardless of when they made content available
low_success = 0.5 #below the median: unsuccessful
high_success = 0.9 #top 10% creators with more followers are deemed successful

In [55]:
import sys  
import pickle
sys.path.insert(0, '/Users/caiorego/Desktop/BDS/RA/Seeding-Bandits/')
import numpy as np
import src.utils
from collections import Counter
from src.utils import import_dta, import_tracks_dta, successful_creators_followers,\
gen_active_relations, get_fan_interactions_per_week, calculate_avg_monthly_valence,\
gen_active_relations_prob, get_fan_interactions_per_week_prob, stripplot_prob,\
reaction_probability, follower_list, filter_quantile, sample_creators_music,\
gen_outbound_creators
import numpy as np
import datetime
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy

# Helper Functions

In [29]:
def process_date(date):
    '''convert date format like '2013-w09' to '2013-03-04', i.e. the first day of that week'''
    year = date[0:4]
    week = date[6:]
    day = "1"
    date = "{}-{}-1".format(year, week)
    dt = datetime.datetime.strptime(date, "%Y-%W-%w")
    return dt

# Data Imports

We start by importing the raw data.  `follows_sent`, `comments_sent`, `shares_sent`, `likes_sent` and `messages_sent` contains data pn the promotional activities that the 35k users tracked in the dataset directed to other users. It includes the `user_id`, the `fan_id` and the `date_sent` which identifies the date when the prom. activity was sent. `users_info_1st` shows the type of user (creator or non-creator, which is identified by a blank) and the date the user entered the platform, for every user that sent or received prom. activities from any of the 35k users tracked in this dataset, while `users_info` contains the same information, but pertaining to the 35k users themselves.

`follows_received` contains information on the follows received by the 35k users and will be used to generate the successful/unsuccessful groups of content creators.

In [6]:
#affiliations :follows
#favoritings :likes

#used in filtering:
path_dir = r"/Users/../Volumes/Raw/"
tracks = import_tracks_dta(path_dir, "12sample_tracks.dta");

#these are the actions sent to 
follows_sent = import_dta(path_dir, "12sample_affiliations_sent.dta");
comments_sent = import_dta(path_dir, "12sample_comments_made.dta");
shares_sent = import_dta(path_dir, "12sample_reposts_made.dta");
likes_sent = import_dta(path_dir, "12sample_favoritings_made.dta");
messages_sent = import_dta(path_dir, "12sample_messages_sent.dta");

#Used to track information on the 1st degree connections
user_info_1st = import_dta(path_dir, "12sample_1st_deg_user_infos.dta");
user_info_1st.columns = ['user_id', 'type', 'entered_platform'];
user_info = import_dta(path_dir, "12sample_user_infos.dta");

#Used to compute creator's success measure
follows_received = import_dta(path_dir, "12sample_affiliations_received.dta");

%%%%%%%%%% 12sample_tracks.dta %%%%%%%%%%
(56262, 7)
%%%%%%%%%% 12sample_affiliations_sent.dta %%%%%%%%%%
(800913, 3)
%%%%%%%%%% 12sample_comments_made.dta %%%%%%%%%%
(29258, 4)
%%%%%%%%%% 12sample_reposts_made.dta %%%%%%%%%%
(179329, 4)
%%%%%%%%%% 12sample_favoritings_made.dta %%%%%%%%%%
(527701, 4)
%%%%%%%%%% 12sample_messages_sent.dta %%%%%%%%%%
(11091, 3)
%%%%%%%%%% 12sample_1st_deg_user_infos.dta %%%%%%%%%%
(670746, 3)
%%%%%%%%%% 12sample_user_infos.dta %%%%%%%%%%
(35000, 3)
%%%%%%%%%% 12sample_affiliations_received.dta %%%%%%%%%%
(432503, 3)


Indegree and outdegree information.

The functions below import the indegree and outdegree dataset. Because the raw version of those dataset are too large to be processed in memory, we preprocessed them in a separate script.

In [20]:
# imports the preprocessed indegree data.
# the data was previously split in 9 pickeld pd.dataframes for memory reasons
def import_indegree_dask(path='/Users/caiorego/Desktop/BDS/RA/Seeding-Bandits/indegree/'):
    df = pd.read_pickle('{}df0.pkl'.format(path))
    df1 = pd.read_pickle('{}df1.pkl'.format(path))
    df2 = pd.read_pickle('{}df2.pkl'.format(path))
    df3 = pd.read_pickle('{}df3.pkl'.format(path))
    df4 = pd.read_pickle('{}df4.pkl'.format(path))
    df5 = pd.read_pickle('{}df5.pkl'.format(path))
    df6 = pd.read_pickle('{}df6.pkl'.format(path))
    df7 = pd.read_pickle('{}df7.pkl'.format(path))
    df8 = pd.read_pickle('{}df8.pkl'.format(path))
    df9 = pd.read_pickle('{}df9.pkl'.format(path))
     
    #convert pd.dataframe to dask.dataframe, which better suits big data.
    ddf = dd.from_pandas(df, npartitions = 3)
    ddf1 = dd.from_pandas(df1, npartitions = 3)
    ddf2 = dd.from_pandas(df2, npartitions = 3)
    ddf3 = dd.from_pandas(df3, npartitions = 3)
    ddf4 = dd.from_pandas(df4, npartitions = 3)
    ddf5 = dd.from_pandas(df5, npartitions = 3)
    ddf6 = dd.from_pandas(df6, npartitions = 3)
    ddf7 = dd.from_pandas(df7, npartitions = 3)
    ddf8 = dd.from_pandas(df8, npartitions = 3)
    ddf9 = dd.from_pandas(df9, npartitions = 3)
    
    concatdf = dd.multi.concat([ddf,ddf1,ddf2,ddf3,ddf4,ddf5,ddf6,ddf7,ddf8,ddf9])
    
    return concatdf

In [21]:
# Aggregates preprocessed outdegree of 1st degree users
def import_outdegree(path='/Users/caiorego/Desktop/BDS/RA/Seeding-Bandits/'):
    d = {}
    for i in range(6):
       d[str(i)] = pd.read_pickle(os.path.join(file_path,'{}.pkl'.format(i))) 
       d[str(i)]['created_at'] =  pd.to_datetime(d[str(i)]['created_at'])
    
    data_outdegree = pd.concat([d['0'], d['1'], d['2'], d['3'], d['4'], d['5']])
    data_outdegree.set_index('created_at', inplace = True)
    return data_outdegree

In [22]:
data_indegree = import_indegree_dask()

In [19]:
data_outdegree = import_outdegree()

# Preprocessing

## Creator ids, successful and unsucessful creators

Next, we define three lists of ids: one with the ids from the content creators, according to the `users_info` table, one with the ids of successful creators and the last one with the ids of the unsuccessful ones.

Let's start with a list of the id of creators. We also create a dataset with containing information on creators only.

In [15]:
mask = user_info.type == 'creator'
creator_ids = user_info[mask].user_id.unique()

creators = user_info[user_info.type == 'creator']

Now we create a function that 

In [18]:
def successful_creators_followers(follows_received, base_date = datetime.datetime(2016, 5, 30, 0, 0), perc1 = None, perc2 = None, subset_creators = None):
    '''Classifies content creators in successful or unsuccessfull
        Arguments:
                    follows_received: dataframe containing the follows received by content creators
                    base date:        date, in datetime.datetime(YYYY, M, DD, H, M) format, in which the number 
                                      of followers per creator is calculated.
                    perc1:            the threshold used to classify unsuccessful content creators. Creator having 
                                      total followers below the number dictated by this threshold, at the base date,
                                      are classified as unsuccessful 
                    perc2:            the threshold used to classify successful content creators. Creator having 
                                      total followers above the number dictated by this threshold, at the base date,
                                      are classified as successful
                    subset_creators:  a pd.DataFrame containing the creators. If is it available, it will be used to 
                                      filter out non creators and to make sure creators with 0 followers are part of
                                      the resulting dataset.
        
    '''
    print(base_date)

    if 'inbound_activity' not in follows_received.columns:
        follows_received.columns = ['fan_id', 'user_id', 'date_sent']

    mask = (follows_received['date_sent'] < base_date)

    df = follows_received[mask].groupby('user_id', as_index=False).agg({'fan_id': pd.Series.nunique})
    df.columns = ['user_id', 'followers']

    if type(subset_creators) == pd.DataFrame:
        subset_creators = pd.DataFrame(subset_creators.user_id.unique(), columns = ['user_id'])
        df = subset_creators.merge(df, on = 'user_id', how = 'left')
        df.fillna(0, inplace = True)

    low = np.quantile(df.followers, perc1)
    high = np.quantile(df.followers, perc2)

    print("High influencer boundary: {}".format(high))
    print("Low influencer boundary: {}".format(low))

    mask = (df["followers"] <= low) | (df["followers"] >= high)
    
    unsuccessful_creator_ids = df.loc[df["followers"] <= low].user_id.unique()
    successful_creator_ids = df.loc[df["followers"] >= high].user_id.unique()

    return unsuccessful_creator_ids, successful_creator_ids

In [19]:
unsuccessful_ids, successful_ids = successful_creators_followers(follows_received, 
                                                        perc1 = low_success, perc2 = high_success, subset_creators = creators)


2016-05-30 00:00:00
High influencer boundary: 66.0
Low influencer boundary: 9.0


## Putting together a dataset with the promotional activities made by content creators.

The function `gen_actions_sent_df` creates a dataframe with all the promotional activities that content creators sent to users.

In [62]:
def gen_actions_sent_df(follows_sent, shares_sent, likes_sent, comments_sent, messages_sent, creator_ids = creator_ids):
    '''
    Creates dataframe containing the actions that content creators send to users.
        Attributes:
                    follows_sent:  dataframe with the follows sent by the 35k users.
                    shares_sent:   dataframe with the shares sent by the 35k users.
                    likes_sent:    dataframe with the likes sent by the 35k users.
                    comments_sent: dataframe with the comments sent by the 35k users.
                    messages_sent: dataframe with the messages sent by the 35k users.
                    creator_ids:   list with content creator ids. If not none, is used to
                                   filter out activities from non creators.
    '''
    
    follows_sent['outbound_activity'] = 'follow'
    follows_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    if 'song_id' in shares_sent.columns:
        shares_sent.drop(columns=["song_id"])
    shares_sent = shares_sent[['reposter_id', "owner_id", 'created_at']]
    shares_sent['outbound_activity'] = 'share'
    shares_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    if 'track_id' in likes_sent.columns:
        likes_sent.drop(columns=["track_id"], inplace=True)
    likes_sent['outbound_activity'] = 'like'
    likes_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    if 'track_id' in comments_sent.columns:
        comments_sent.drop(columns=["track_id"], inplace=True)
    comments_sent['outbound_activity'] = 'comment'
    comments_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    messages_sent["outbound_activity"] = 'message'
    messages_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']
    df = pd.concat([follows_sent, shares_sent, likes_sent, comments_sent, messages_sent])


    if type(creator_ids) == numpy.ndarray:
        df = df[df['user_id'].isin(creator_ids)]
        
    df['week_yr'] = df.date_sent.dt.strftime('%Y-w%U')
    df = df.loc[df['user_id'] != df['fan_id'],:]

    return df

In [63]:
actions_sent = gen_actions_sent_df(follows_sent, shares_sent, likes_sent, comments_sent,
                                     messages_sent)

filtering creators


## Filter only actions that were sent to non-fans

Since are interested in acquisition of fans, we must filter the `actions_sent` to contain only the promotional activities sent to non-fans. 
We start by selecting only the necessary columns of the `follows_received` table and merging it to the `actions_sent` dataset.

In [64]:
follows_received.columns = ['fan_id', 'user_id', 'date_sent']
followers = follows_received[["fan_id", "user_id", "date_sent"]]
followers.columns = ["fan_id", "user_id", "follower_since"]

actions_sent = actions_sent.merge(followers, right_on = ['user_id', 'fan_id'],
                                      left_on = ['user_id', 'fan_id'], how = 'left')

We then filter only actions that happened before the user follows the content creator.

In [65]:
mask = (actions_sent.date_sent < actions_sent.follower_since) | (actions_sent.follower_since.isnull())
actions_sent_non_followers =  actions_sent[mask]
actions_sent_non_followers['week_yr_date'] = actions_sent_non_followers.week_yr.apply(lambda x: process_date(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actions_sent_non_followers['week_yr_date'] = actions_sent_non_followers.week_yr.apply(lambda x: process_date(x))


In [66]:
actions_sent_non_followers.shape

(403066, 7)

## Create rewards

The last step in the data preprocessing is the creation of the rewards column. We do it with the function below. It adds some flexibility to the process by allowing us to change the definition of a reward with the parameter `interval`.

In [67]:
def create_reward(data_actions_sent, interval):
    # Target Creation
    delta = datetime.timedelta(days = interval)
    mask = (data_actions_sent['follower_since'] <= (data_actions_sent['date_sent'] + delta).dt.floor('d'))

    data_actions_sent.loc[mask, 'reward'] = 1
    mask = data_actions_sent['reward'].isnull()
    data_actions_sent.loc[mask, 'reward'] = 0
    return data_actions_sent

In [68]:
labeled_dateset = create_reward(actions_sent_non_followers,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_actions_sent.loc[mask, 'reward'] = 1


# The multiarmed bandit estimates

Below, we implement `MultiArmedBandits` class. This class is used to track exploration and exploitation of agents facing a multi armed problem. It is initialized with arms' names and the content creator id as arguments. 
It's methods enable us to track each arm's trials and rewards, aswell as estimate the mean reward and reward variance.
The `update_batch` method allows for batched updates on the multiarmed bandits.

In [None]:
class MultiArmedBandit:
    
    def __init__(self, arm_names, user_id):
        self.user_id = user_id
        self.arm_names = arm_names
        self.arms = {k:{'Sent':0,'Reward':0} for k in arm_names}
        self.batch = {k:{'Sent':0,'Reward':0} for k in arm_names}
        self.means = np.array(len(arm_names))
        self.variances = np.array(len(arm_names))
        self.last_arm_pulled = None
        self.arm_switches = 0
        self.total_trials = 0 
        self.trials_at_day = 0
        self.rewards = 0
    
    def initialize_priors(self):
        pass
    
    def update_batch(self, target, outcome):
        self.batch[target]['Sent'] += 1
        self.batch[target]['Reward'] += outcome
        self.arm_switches += self.arm_switches + (self.last_arm_pulled != str(target))
        self.last_arm_pulled = str(target)
        self.total_trials += 1
        self.trials_at_day += 1
        self.rewards += outcome
    
    def update_arms(self):
        for k in self.batch.keys():
            for j in self.batch[k]:
                self.arms[k][j] += self.batch[k][j]
        self.batch = {k:{'Sent':0,'Reward':0} for k in self.arm_names}
        self.trials_at_day = 0
                
    def estimate_moments(self):
        
        Sent = np.array([a[1]['Sent'] for a in self.arms.items()])
        Reward = np.array([a[1]['Reward'] for a in self.arms.items()])
        
        alpha = Reward + 1 
        beta = (Sent-Reward)+1

        mean = alpha/(alpha+beta)
        variance = (alpha*beta)/((alpha+beta+1)*((alpha+beta)**2))
        
        self.means = np.transpose(np.around(mean,3))
        self.variances = np.transpose(np.around(np.sqrt(variance),3))

## Updating the membership table

The arms of the multiarmed bandit used in the notebook are based on user types. The user types are based on the indegree of the receivers, and are, therefore, dynamic. The function below receives a date and generates the user types at that date.

In [None]:
def update_membership_table_dask(data, date, user_info = user_info_1st, d_percentiles = 10):
    
    '''
    This function returns the membership table at date equals `date`. Every user that interacted with the 35k tracked 
    users and entered the platform before `date` is present in the table, even if it has indegree 0.
    arguments:
              data:           the indegree dataset.
              user_info:      the dataset containing all the users that interacted with the 35k users tracked.
              d_percentiles:  the percentiles breaks e.g. d_percentiles = 25 means that user types are the quartiles
                              of the indegree distribution.
    '''
    
    data = data[data.created_at.dt.floor('d')<=upper_limit]
    data = data.groupby('contact_id').agg({'size':'sum'}).compute()
    steps = 100//d_percentiles
    
    #merge with user info to obtain users that are not followed by anyone at the current date
    data = user_info_1st.merge(data, left_on = 'user_id', right_on = 'contact_id', how= 'outer')
    data.loc[data['size'].isnull(), 'size'] = 0
    data = data[['user_id', 'size', 'entered_platform']].set_index('user_id')
    
    #filter out users that didnt exist in the current date
    mask = data['entered_platform'].dt.floor('d') <= date
    data = data.loc[mask]
    
    mask = (data['size']>0)
    data.loc[~mask, 'type'] = 0 
    
    #cutpoints ignore users with 0 followers as they will appear in their own category
    cutpoints = np.percentile(data['size'], np.arange(0,100, steps)) 

    for i in range(len(cutpoints)):
        low = cutpoints[i]
        try:
            high = cutpoints[i+1]
        except:
            high = 100000
        
        mask2 = (data[mask]['size']>=low) & (data[mask]['size'] <high) & (data['size']>0)
        data.loc[mask2,'type'] = i + 1 
    return data

Now we are ready to run the mabs that simulate the content creators exploration/exploitation.

In [None]:
# create arm names that will be used in the MultiArmedBandit instances.
arm_names = [i for i in range(0,11)]

# filter out content creators that are neither successful or unsuccessful.
mask = (actions_sent_non_followers.user_id.isin(successful_ids) | actions_sent_non_followers.user_id.isin(unsuccessful_ids))
df = actions_sent_non_followers[mask]

# initializes dictionaries that will store the values.
#d will store the MultiArmedBandit instances
#e will store the estimated mean and variances.
d = {}
e = {}

#initialize parameters
first_day = min(actions_sent_non_followers.date_sent.dt.floor('d').unique())
last_day =  max(actions_sent_non_followers.date_sent.dt.floor('d').unique())
membership = update_membership_table_dask(data_in, first_day, d_percentiles = 10)
fan_not_found = [] #list that stores fans that were not found in the membership table.
user_ids = df.user_id.unique()

#initialize MABs and auxiliary datasets.
for user_id in df.user_id.unique():
    d[user_id] = MultiArmedBandit(arm_names, user_id)

    dataset_mean = pd.DataFrame({**{'date':pd.date_range(start = first_day, end=last_day)},
                                 **{str(k):0 for k in range(0,11)}}) 
    dataset_std = pd.DataFrame({**{'date':pd.date_range(start = first_day, end=last_day)},
                                 **{str(k):0 for k in range(0,11)}}) 
    trials = pd.DataFrame({**{'date':pd.date_range(start = first_day, end=last_day)},
                                 **{'trials_at_date':np.nan}}) 
    
    e[user_id] = {'dataset_std': dataset_std, 'dataset_mean': dataset_mean, 'trials' : trials}

#update MABs
for day in tqdm(actions_sent_non_followers.date_sent.dt.floor('d').unique()):
    for user_id in df.loc[df['date_sent'].dt.floor('d') == day].user_id.unique():
        df_subset = df.loc[(df['date_sent'].dt.floor('d') == day)&(df['user_id'] == user_id)]
        for action in range(len(df_subset)):
            fan_id = df_subset.iloc[action].fan_id
            try:
                user_type = membership.loc[membership.index == fan_id].type.values[0]
                d[user_id].update_batch(user_type, df_subset.iloc[action].reward)
            except KeyError:
                fan_not_found.append([fan_id, day, df_subset.iloc[action].outbound_activity])
            except IndexError:
                fan_not_found.append([fan_id, day, df_subset.iloc[action].outbound_activity])
  #REWRITE THIS AS A LOOP OVER ALL USER_IDS          
    for user_id in user_ids:
        e[user_id]['trials'].loc[e[user_id]['trials'].date == day, 'trials_at_date'] = d[user_id].trials_at_day
        d[user_id].update_arms()
        d[user_id].estimate_moments()
        e[user_id]['dataset_mean'].loc[e[user_id]['dataset_mean'].date == day, '0':] = np.transpose(d[user_id].means)
        e[user_id]['dataset_std'].loc[e[user_id]['dataset_mean'].date == day, '0':] = np.transpose(d[user_id].variances)
      
   #update membership table at the end of each day 
    membership = update_membership_table_dask(data_in, day, d_percentiles = 10)
    
for user_id in df.user_id.unique():
    e[user_id]['total_trials'] = d[user_id].total_trials

Now that we have the MAB estimates, it is time to check content creators behavior.
The following code is outdated, as tracks the behavior of all individuals in the `df` dataframe. It was initially intended to compare the behavior of successful and unsuccessful creators, but it might be more informative to track the behavior of individual creators.

In [None]:
def weekly_membership(df, data_in):
    '''
    Function receives the actions performed data frame and the indegree data. Returns the df with an extra columns,
    identifying the user type that received an action.
        Arguments:
                    df:      dataframe containing the actions sent by content creators.
                    data_in: indegree dataset
    '''

    df_copy = df.copy()
    df_copy['type'] = 0
    
    first_date = min(df_copy.date_sent)
    membership = update_membership_table_dask(data_in, first_day, d_percentiles = 10)
    
    for day in tqdm(df_copy.date_sent.dt.floor('d').unique()):
        mask = (df_copy['date_sent'].dt.floor('d') == day)
        df_subset = df_copy[mask]
        for i in range(df_copy[mask].shape[0]):
            fan_id = df_subset.iloc[i].fan_id
            try:
                df_subset.iloc[i,-1] = (int(membership.loc[membership.index == fan_id]['type']))
                print('.', end='')
        
            except:
                
                df_subset.iloc[i,-1] = 'Fan not found'
                print('x', end ='')
                
        df_copy[mask] = df_subset
        membership = update_membership_table_dask(data_in, day, d_percentiles = 10)

    return df_copy

The function below plots the behavior information.

In [None]:
def plot_learning(data, number, title):
    '''
    Plots the targeting behavior of content creators
    '''
    
    
    data_grouped = data.groupby('week_yr_date', as_index = False).size()
    data.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity',
       'interaction_week', 'interaction_year', 'week_yr',
       'follower_since', 'week_yr_date', 'reward',
       'Receiver Type']
    
    lim_inf = min(data.date_sent)
    lim_sup = max(data.date_sent)
    
    fig, ax = plt.subplots(figsize=(15, 9))

    hue_order_l = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

    sns.set(rc={'figure.figsize':(12,8)})
    sns.set_style("white")
    sns.kdeplot(data=data,
            x="week_yr_date",
            hue='Receiver Type',
            hue_order= hue_order_l,
            multiple="fill",
            #weights="Val",
            bw_adjust = 1)\
    .set(xlim=(lim_inf, lim_sup), title='Figure {}: {} - Share of trials per receiver type.'.format(number, title), xlabel='Week',
     ylabel='Share')
    
    ax2 = plt.twinx()
    sns.lineplot(data=data_grouped,
                 x = "week_yr_date", 
                 y = "size", 
                 color="black", 
                 ax=ax2, 
                 legend = 'auto',
                 linewidth = 2)\
    .set(ylabel='Total Promotional Actions', yscale="log")
    plt.annotate('''Note: each color corresponds to a receiver type. The black line is the total number of actions per week.'''
             , (0,0), (0, -35), xycoords='axes fraction', textcoords='offset points', va='top', fontsize=14);

    #ax.legend(title='Receiver type', loc='best', labels = ['10', '9', '8', '7', '6', '5', '4', '3', '2', '1', '0']);

In [None]:
mask = (actions_sent_non_followers.user_id.isin(successful_ids) | actions_sent_non_followers.user_id.isin(unsuccessful_ids))
df = actions_sent_non_followers[mask]
dynamics = weekly_membership(df, data_in)

In [None]:
mask = (dynamics.user_id.isin(successful_ids)) & (dynamics['Receiver Type'] != 'Fan not found')
success_dynamics = dynamics[mask]
plot_learning(success_dynamics, 21, 'Successful')

In [None]:
mask = (dynamics.user_id.isin(unsuccessful_ids)) & (dynamics['Receiver Type'] != 'Fan not found')
unsuccess_dynamics = dynamics[mask]
plot_learning(unsuccess_dynamics, 22, 'Unsuccessful')

In [69]:
def gen_estimates(df, arm_names):

    '''
    Summarizes all the infomration present in dictionarys d and e into tables with the MABs` information
    '''
    
    dataset_last_mean = pd.DataFrame(index = df.user_id.unique(), columns = arm_names)
    dataset_last_std = pd.DataFrame(index = df.user_id.unique(), columns = arm_names)
    df_trial_info = pd.DataFrame(index = df.user_id.unique(), columns = ['total_trials'])
    df_summary_rewards = pd.DataFrame(index = df.user_id.unique(), columns = ['Rewards'])
    arm_names = [i for i in range(0,11)]

    for user_id in df.user_id.unique():
        try:
            dataset_last_mean.loc[user_id, :] = list(e[user_id]['dataset_mean'].iloc[-1,1:])
            dataset_last_std.loc[user_id, :] = list(e[user_id]['dataset_std'].iloc[-1,1:])
            df_trial_info.loc[user_id, :] = d[user_id].total_trials
            df_summary_rewards.loc[user_id, :] = d[user_id].rewards
        except:
            pass

    init = 0
      
    df_trial_info = pd.DataFrame(index = arm_names , columns = ['Rewards', 'Unique_creators', 'trials'])
    for j in range(len(arm_names)):
        df_trial_info.iloc[j-init]['trials'] = list()
        df_trial_info.iloc[j-init]['Unique_creators'] = 0
        df_trial_info.iloc[j-init]['Rewards'] = 0
    for key in df.user_id.unique():
        for j in range(len(arm_names)):
            df_trial_info.iloc[j-init]['Rewards'] += d[key].arms[j]['Reward']
            if d[key].arms[j]['Sent'] != 0:
                df_trial_info.iloc[j-init]['Unique_creators'] = df_trial_info.iloc[j-init]['Unique_creators']+1
            df_trial_info.iloc[j-init]['trials'].extend([d[key].arms[j]['Sent']])
    df_trial_info['Total_trials'] = df_trial_info.trials.apply(lambda x: np.sum(x))
    df_trial_info['Median_trials'] = df_trial_info.trials.apply(lambda x: np.median(x))
    df_trial_info['Mean_trials'] = df_trial_info.trials.apply(lambda x: np.mean(x))
    df_trial_info['Std_trials'] = df_trial_info.trials.apply(lambda x: np.std(x))

    summary = pd.DataFrame(dataset_last_mean[arm_names].mean(), columns = ['Mean Estimate']).merge(
            pd.DataFrame(dataset_last_std[arm_names].mean(), columns = ['Std Estimate']), left_index=True, right_index=True).merge(
        df_trial_info[['Rewards','Unique_creators','Total_trials','Median_trials', 'Mean_trials', 'Std_trials']
            ],left_index=True, right_index=True).reindex(arm_names)
    
    return summary

In [None]:
mask = (actions_sent_non_followers.user_id.isin(successful_ids))
df = actions_sent_non_followers[mask]
print('Unique successful creators:', df.user_id.nunique())
successful_summary = gen_estimates(df, arm_names)

In [None]:
successful_summary

In [None]:
successful_summary.to_excel('tables/sucessful.xlsx')

In [None]:
mask = (actions_sent_non_followers.user_id.isin(unsuccessful_ids))
df = actions_sent_non_followers[mask]
print('Unique unsuccessful creators:', df.user_id.nunique())
unsuccessful_summary = gen_estimates(df, arm_names)

In [None]:
unsuccessful_summary.to_excel('tables/unsucessful.xlsx')