**This notebook is the executable version of lab note 1 (note: the overleaf file is momentarely outdated).
It answers the following questions:**

    1. Is there statistical difference on the activity level of hardcore and lazy users?
    2. Is there statistical difference on the activity level of addicted and shallow users?
    3. Descriptive statistics of the four groups.
    4.a. Are hardcore and addicted users more connected to successful creators than lazy and shallow?
    4.b. Are hardcore and addicted users more connected to successful creators than to unsuccessful creators?

In [1]:
#Run parameters
path_dir = r"/Users/../Volumes/Raw/"

low_success = 0.5 #below the median: unsuccessful
high_success = 0.9 #top 10% creators with more followers are deemed successful

In [2]:
import sys  
import pickle
sys.path.insert(0, '/Users/caiorego/Desktop/BDS/RA/Seeding-Bandits/')
import numpy as np
import src.utils
from collections import Counter
from src.utils import import_dta, import_tracks_dta, successful_creators_followers,\
gen_active_relations, get_fan_interactions_per_week, calculate_avg_monthly_valence,\
gen_active_relations_prob, get_fan_interactions_per_week_prob, stripplot_prob,\
reaction_probability, follower_list, filter_quantile, sample_creators_music,\
gen_outbound_creators
import numpy as np
import datetime
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy
import os
from statsmodels.stats.proportion import proportions_ztest

In [3]:
def process_date(date):
    '''convert date format like '2013-w09' to '2013-03-04', i.e. the first day of that week'''
    year = date[0:4]
    week = date[6:]
    day = "1"
    date = "{}-{}-1".format(year, week)
    dt = datetime.datetime.strptime(date, "%Y-%W-%w")
    return dt

# Data Imports

We start by importing the raw data.  `follows_sent`, `comments_sent`, `shares_sent`, `likes_sent` and `messages_sent` contains data pn the promotional activities that the 35k users tracked in the dataset directed to other users. It includes the `user_id`, the `fan_id` and the `date_sent` which identifies the date when the prom. activity was sent. `users_info_1st` shows the type of user (creator or non-creator, which is identified by a blank) and the date the user entered the platform, for every user that sent or received prom. activities from any of the 35k users tracked in this dataset, while `users_info` contains the same information, but pertaining to the 35k users themselves.

`follows_received` contains information on the follows received by the 35k users and will be used to generate the successful/unsuccessful groups of content creators.

In [4]:
#affiliations :follows
#favoritings :likes

#used in filtering:
path_dir = r"/Users/../Volumes/Raw/"
tracks = import_tracks_dta(path_dir, "12sample_tracks.dta");

#these are the actions sent to 
follows_sent = import_dta(path_dir, "12sample_affiliations_sent.dta");
comments_sent = import_dta(path_dir, "12sample_comments_made.dta");
shares_sent = import_dta(path_dir, "12sample_reposts_made.dta");
likes_sent = import_dta(path_dir, "12sample_favoritings_made.dta");
messages_sent = import_dta(path_dir, "12sample_messages_sent.dta");

#Used to track information on the 1st degree connections
user_info_1st = import_dta(path_dir, "12sample_1st_deg_user_infos.dta");
user_info_1st.columns = ['user_id', 'type', 'entered_platform'];
user_info = import_dta(path_dir, "12sample_user_infos.dta");

#Used to compute creator's success measure
follows_received = import_dta(path_dir, "12sample_affiliations_received.dta");

%%%%%%%%%% 12sample_tracks.dta %%%%%%%%%%
(56262, 7)
%%%%%%%%%% 12sample_affiliations_sent.dta %%%%%%%%%%
(800913, 3)
%%%%%%%%%% 12sample_comments_made.dta %%%%%%%%%%
(29258, 4)
%%%%%%%%%% 12sample_reposts_made.dta %%%%%%%%%%
(179329, 4)
%%%%%%%%%% 12sample_favoritings_made.dta %%%%%%%%%%
(527701, 4)
%%%%%%%%%% 12sample_messages_sent.dta %%%%%%%%%%
(11091, 3)
%%%%%%%%%% 12sample_1st_deg_user_infos.dta %%%%%%%%%%
(670746, 3)
%%%%%%%%%% 12sample_user_infos.dta %%%%%%%%%%
(35000, 3)
%%%%%%%%%% 12sample_affiliations_received.dta %%%%%%%%%%
(432503, 3)


In [5]:
mask = user_info.type == 'creator'
creator_ids = user_info[mask].user_id.unique()

creators = user_info[user_info.type == 'creator']

Indegree and outdegree information.

The function below import the outdegree dataset. Because the raw version of those dataset are too large to be processed in memory, we preprocessed them in a separate script.

In [6]:
# Aggregates preprocessed outdegree of 1st degree users
def import_outdegree(path='/Users/caiorego/Desktop/BDS/RA/Seeding-Bandits/'):
    d = {}
    for i in range(6):
       d[str(i)] = pd.read_pickle(os.path.join(path,'{}.pkl'.format(i))) 
       d[str(i)]['created_at'] =  pd.to_datetime(d[str(i)]['created_at'])
       d[str(i)]['created_at'] = pd.to_datetime(d[str(i)]['created_at']).dt.floor('d')
       d[str(i)] = d[str(i)].groupby(['sender_id', 'created_at'], as_index = False).size() 
    
    data_outdegree = pd.concat([d['0'], d['1'], d['2'], d['3'], d['4'], d['5']])
    #data_outdegree.set_index('created_at', inplace = True)
    return data_outdegree

In [7]:
data_outdegree = import_outdegree()

In [8]:
#data_outdegree = data_outdegree.groupby(['sender_id','created_at'], as_index = False).size()

# Preprocessing

## Creator ids, successful and unsucessful creators

Next, we define three lists of ids: one with the ids from the content creators, according to the `users_info` table, one with the ids of successful creators and the last one with the ids of the unsuccessful ones.

Let's start with a list of the id of creators. We also create a dataset with containing information on creators only.

In [9]:
mask = user_info.type == 'creator'
creator_ids = user_info[mask].user_id.unique()

creators = user_info[user_info.type == 'creator']

In [10]:
def successful_creators_followers(follows_received, base_date = datetime.datetime(2016, 5, 30, 0, 0), perc1 = None, perc2 = None, subset_creators = None):
    '''Classifies content creators in successful or unsuccessfull
        Arguments:
                    follows_received: dataframe containing the follows received by content creators
                    base date:        date, in datetime.datetime(YYYY, M, DD, H, M) format, in which the number 
                                      of followers per creator is calculated.
                    perc1:            the threshold used to classify unsuccessful content creators. Creator having 
                                      total followers below the number dictated by this threshold, at the base date,
                                      are classified as unsuccessful 
                    perc2:            the threshold used to classify successful content creators. Creator having 
                                      total followers above the number dictated by this threshold, at the base date,
                                      are classified as successful
                    subset_creators:  a pd.DataFrame containing the creators. If is it available, it will be used to 
                                      filter out non creators and to make sure creators with 0 followers are part of
                                      the resulting dataset.
        
    '''
    print(base_date)

    if 'inbound_activity' not in follows_received.columns:
        follows_received.columns = ['fan_id', 'user_id', 'date_sent']

    mask = (follows_received['date_sent'] < base_date)

    df = follows_received[mask].groupby('user_id', as_index=False).agg({'fan_id': pd.Series.nunique})
    df.columns = ['user_id', 'followers']

    if type(subset_creators) == pd.DataFrame:
        subset_creators = pd.DataFrame(subset_creators.user_id.unique(), columns = ['user_id'])
        df = subset_creators.merge(df, on = 'user_id', how = 'left')
        df.fillna(0, inplace = True)

    low = np.quantile(df.followers, perc1)
    high = np.quantile(df.followers, perc2)

    print("High influencer boundary: {}".format(high))
    print("Low influencer boundary: {}".format(low))

    mask = (df["followers"] <= low) | (df["followers"] >= high)
    
    unsuccessful_creator_ids = df.loc[df["followers"] <= low].user_id.unique()
    successful_creator_ids = df.loc[df["followers"] >= high].user_id.unique()

    return unsuccessful_creator_ids, successful_creator_ids

In [11]:
unsuccessful_ids, successful_ids = successful_creators_followers(follows_received, 
                                                        perc1 = low_success, perc2 = high_success, subset_creators = creators)


2016-05-30 00:00:00
High influencer boundary: 66.0
Low influencer boundary: 9.0


## Putting together a dataset with the promotional activities made by content creators.

The function `gen_actions_sent_df` creates a dataframe with all the promotional activities that content creators sent to users.

In [12]:
def gen_actions_sent_df(follows_sent, shares_sent, likes_sent, comments_sent, messages_sent, creator_ids = creator_ids):
    '''
    Creates dataframe containing the actions that content creators send to users.
        Attributes:
                    follows_sent:  dataframe with the follows sent by the 35k users.
                    shares_sent:   dataframe with the shares sent by the 35k users.
                    likes_sent:    dataframe with the likes sent by the 35k users.
                    comments_sent: dataframe with the comments sent by the 35k users.
                    messages_sent: dataframe with the messages sent by the 35k users.
                    creator_ids:   list with content creator ids. If not none, is used to
                                   filter out activities from non creators.
    '''
    
    follows_sent['outbound_activity'] = 'follow'
    follows_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    if 'song_id' in shares_sent.columns:
        shares_sent.drop(columns=["song_id"])
    shares_sent = shares_sent[['reposter_id', "owner_id", 'created_at']]
    shares_sent['outbound_activity'] = 'share'
    shares_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    if 'track_id' in likes_sent.columns:
        likes_sent.drop(columns=["track_id"], inplace=True)
    likes_sent['outbound_activity'] = 'like'
    likes_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    if 'track_id' in comments_sent.columns:
        comments_sent.drop(columns=["track_id"], inplace=True)
    comments_sent['outbound_activity'] = 'comment'
    comments_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']

    messages_sent["outbound_activity"] = 'message'
    messages_sent.columns = ['user_id', 'fan_id', 'date_sent', 'outbound_activity']
    df = pd.concat([follows_sent, shares_sent, likes_sent, comments_sent, messages_sent])


    if type(creator_ids) == numpy.ndarray:
        df = df[df['user_id'].isin(creator_ids)]
        
    df['week_yr'] = df.date_sent.dt.strftime('%Y-w%U')
    df = df.loc[df['user_id'] != df['fan_id'],:]

    return df

In [13]:
actions_sent = gen_actions_sent_df(follows_sent, shares_sent, likes_sent, comments_sent,
                                     messages_sent, creator_ids = None)

## Filter only actions that were sent to non-fans

In [14]:
follows_received.columns = ['fan_id', 'user_id', 'date_sent']
followers = follows_received[["fan_id", "user_id", "date_sent"]]
followers.columns = ["fan_id", "user_id", "follower_since"]

actions_sent = actions_sent.merge(followers, right_on = ['user_id', 'fan_id'],
                                      left_on = ['user_id', 'fan_id'], how = 'left')

To obtain the followback responsiveness of users, we need to produce a dataset that exclude actions targetting fans.
We do that using filters based on the date of the action and the date that the user became a fan of the content creator. The resulting dataframe is named `actions_sent_non_fans`.

We then filter only actions that happened before the user follows the content creator.

In [15]:
mask = (actions_sent.date_sent < actions_sent.follower_since) | (actions_sent.follower_since.isnull())
actions_sent_non_fans =  actions_sent[mask]
actions_sent_non_fans['week_yr_date'] = actions_sent_non_fans.week_yr.apply(lambda x: process_date(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actions_sent_non_fans['week_yr_date'] = actions_sent_non_fans.week_yr.apply(lambda x: process_date(x))


## Compute rewards 

The last piece of information that is needed to compute the follow-back responsiveness is a column that indicates whether the a follow back occurrd or not. We must decide on a maximum interval between content creator action and follow-back to correctly attribute an follow to an action. Based on previous analysis, we define that interval to be 1 day. We name a follow that happened within that windowa `reward`. 

The function `create_reward` creates an indicator variable that takes the value 1 if a user followed a content creator in `interval` days from the date the content creator sent him a promotional action.

In [16]:
def create_reward(data_actions_sent, interval):
    # Target Creation
    delta = datetime.timedelta(days = interval)
    mask = (data_actions_sent['follower_since'] <= (data_actions_sent['date_sent'] + delta).dt.floor('d'))

    data_actions_sent.loc[mask, 'reward'] = 1
    mask = data_actions_sent['reward'].isnull()
    data_actions_sent.loc[mask, 'reward'] = 0
    return data_actions_sent

In [17]:
labeled_dataset = create_reward(actions_sent_non_fans,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_actions_sent.loc[mask, 'reward'] = 1


## Outdegree level

The function below classifies users in low/high outdegree

In [18]:
def outdegree_level(data, date, user_info = user_info_1st, d_percentiles = 10):
    
    '''
    This function returns the membership table at date equals `date`. Every user that interacted with the 35k tracked 
    users and entered the platform before `date` is present in the table, even if it has indegree 0.
    arguments:
              data:           the indegree dataset.
              user_info:      the dataset containing all the users that interacted with the 35k users tracked.
              d_percentiles:  the percentiles breaks e.g. d_percentiles = 25 means that user types are the quartiles
                              of the indegree distribution.
    '''
    
    data = data[data.created_at.dt.floor('d')<=date]
    data = data.groupby('sender_id').agg({'size':'sum'}).compute()
    steps = 100//d_percentiles
    
    #merge with user info to obtain users that are not followed by anyone at the current date
    data = user_info_1st.merge(data, left_on = 'user_id', right_on = 'sender_id', how= 'outer')
    data.loc[data['size'].isnull(), 'size'] = 0
    data = data[['user_id', 'size', 'entered_platform']].set_index('user_id')
    
    #filter out users that didnt exist in the current date
    mask = data['entered_platform'].dt.floor('d') <= date
    data = data.loc[mask]
    
    mask = (data['size']>0)
    data.loc[~mask, 'type'] = 0 
    
    #cutpoints ignore users with 0 followers as they will appear in their own category
    cutpoints = np.percentile(data['size'], np.arange(0,100, steps)) 

    for i in range(len(cutpoints)):
        low = cutpoints[i]
        try:
            high = cutpoints[i+1]
        except:
            high = 100000
        
        mask2 = (data[mask]['size']>=low) & (data[mask]['size'] <high) & (data['size']>0)
        data.loc[mask2,'type'] = i + 1 
    return data

In [19]:
dask_outdegree = dd.from_pandas(data_outdegree, npartitions = 3)

In [20]:
last_day =  max(actions_sent.date_sent.dt.floor('d').unique())
outdegree_level = outdegree_level(dask_outdegree, last_day,user_info = user_info_1st, d_percentiles = 2)

In [21]:
outdegree_level['high_outdegree'] = outdegree_level.type.apply(lambda x: 1 if x > 1 else 0)

In the cell below, we create a list with unique ids from users that appear in the oudegree level table. This will later be uses to construct a flow-chart indicating how we lose data based on filters and operations.

In [22]:
receiver_ids = outdegree_level.index.unique()

## Responsiveness (follow-back)

The follow-back responsiveness is calculated as the number rewards per non fan, devided by the number of actions received by them, from the content creators in out dataset.

In [23]:
responsiveness_level = labeled_dataset.groupby('fan_id', as_index = False).agg({'reward':'sum', 'user_id':'count'})
responsiveness_level.columns = ['fan_id','total_rewards', 'total_actions_received']

In [24]:
responsiveness_level['responsiveness_level'] = responsiveness_level['total_rewards']/responsiveness_level['total_actions_received']

Now we need to create a indicator variable that receives the value of 1 if the user is classified as high-reponsiveness and 0 if not. high-responsiveness individuals have `responsiveness_level > 0`. Only 6% of the users fall on that category.

In [25]:
sum(responsiveness_level.responsiveness_level > 0)/responsiveness_level.shape[0]
#only 6% have responsiveness > 0

0.029490774559935904

In [26]:
responsiveness_level['high_responsiveness'] = responsiveness_level.responsiveness_level.apply(lambda x: 1 if x > 0 else 0)

Once more we create an object containing the unique ids of users in the resulting dataset. This will be used in a flow-chart, as explained.

In [27]:
received_actions_ids = responsiveness_level.fan_id.unique()
len(received_actions_ids)

424370

In [28]:
len(set(received_actions_ids).intersection(set(receiver_ids)))
#only 339937 from 424370 that follow at least one of the 35k, received at least one action

339947

## Activity level

The activity level is defined as the number of actions performed by users. It is important to notice that we only observe actions targeting the 35k users that joined in march 2012. We consider this measure a proxy for the real activity level.

Let's begin by creating a dataset with all action received by those 35k users.

In [29]:
comments_received = import_dta(path_dir, "12sample_comments_received.dta");
shares_received = import_dta(path_dir, "12sample_reposts_received.dta");
likes_received = import_dta(path_dir, "12sample_favoritings_received.dta");
messages_received = import_dta(path_dir, "12sample_messages_received.dta");

%%%%%%%%%% 12sample_comments_received.dta %%%%%%%%%%
(21386, 4)
%%%%%%%%%% 12sample_reposts_received.dta %%%%%%%%%%
(83013, 4)
%%%%%%%%%% 12sample_favoritings_received.dta %%%%%%%%%%
(286903, 4)
%%%%%%%%%% 12sample_messages_received.dta %%%%%%%%%%
(17364, 3)


In [30]:
if 'song_id' in shares_received:
        shares_received.drop(columns=["song_id"])
shares_received = shares_received[['reposter_id', "owner_id", 'created_at']]
shares_received['inbound_activity'] = 'share'
shares_received.columns = ['fan_id', 'user_id', 'date_sent', 'inbound_activity']

if 'track_id' in likes_received:
        likes_received = likes_received.drop(columns=["track_id"])
likes_received['inbound_activity'] = 'like'
likes_received.columns = ['fan_id', 'user_id', 'date_sent', 'inbound_activity']

if 'track_id' in comments_received:
        comments_received = comments_received.drop(columns=["track_id"])
comments_received['inbound_activity'] = 'comment'
comments_received.columns = ['fan_id', 'user_id', 'date_sent', 'inbound_activity']

messages_received["outbound_activity"] = 'message'
messages_received.columns = ['user_id', 'fan_id', 'date_sent', 'inbound_activity']

In [31]:
df_total_actions_by_fans_and_non_fans = pd.concat([shares_received, likes_received, comments_received, messages_received])

In [32]:
activity_level = df_total_actions_by_fans_and_non_fans.groupby('fan_id', as_index = True).size()

In [33]:
activity_level = activity_level.to_frame()
activity_level.columns = ['activity']
#number of users that performed at least one non-follow activity.
sum(activity_level.activity > 0)

240292

Once more we create an object containing the unique ids of users in the resulting dataset. This will be used in a flow-chart, as explained.

In [34]:
activity_ids = activity_level.index.unique()

In [35]:
len(set(activity_ids).intersection(set(receiver_ids)))
#only 35493 from 240292 that follow at least one of the 35k, performed at least one non-follow action

35493

The number of users that received at least on action from our 35k users surpasses by much the number of users that performed at least on action.

In the cell bellow, we input information on users that performed 0 activities.

In [36]:
activity_level = activity_level.reindex(receiver_ids)
activity_level.loc[activity_level.activity.isna(), 'activity'] = 0

In [37]:
sum(activity_level.activity >= 4)/activity_level.shape[0]
#only 1% have activity >= 4

0.010386942300065897

In [38]:
activity_level['high_activity'] = activity_level.activity.apply(lambda x: 1 if x >= 4 else 0)

# Analysis

Only users that followed at least one of the 35k, received at least one action fom the 35k and did at least one non-follow actions towards the 35k are eligible to be in the table.

The reason behind it is that we dont observe outdegree of users that did not follow at least one of the 35k and we dont observe the follow-back responsiveness of users that never received actions from the 35k.

Below, we get a list with the eligible user's ids

In [39]:
eligible_ids = set(received_actions_ids).intersection(set(receiver_ids))

In [40]:
len(eligible_ids)

339947

Now we merge the datasets with the outdegree, follow-back responsiveness and activity information.

In [41]:
table_data = outdegree_level.merge(responsiveness_level, left_index = True, right_on = 'fan_id', how = 'inner').merge(activity_level, left_on = 'fan_id', right_index = True, how = 'inner')

In [42]:
table_data.groupby(['high_outdegree', 'high_responsiveness', 'high_activity']).size()

high_outdegree  high_responsiveness  high_activity
0               0                    0                192658
                                     1                   419
                1                    0                  2940
                                     1                    63
1               0                    0                133450
                                     1                  1093
                1                    0                  9035
                                     1                   289
dtype: int64

And, finally, we create the 4 user groups that we are interested in: *Hardcore*, *Lazy*, *Addicted* and *Shallow*. Everyone else is classified as *Other*.

In [43]:
table_data['Type'] = table_data.apply(lambda x: 
    'hardcore' if (x.high_outdegree == 0) & (x.high_responsiveness == 1) & (x.high_activity == 1)
     else ('lazy' if (x.high_outdegree == 0) & (x.high_responsiveness == 1) & (x.high_activity == 0)
     else ('shallow' if (x.high_outdegree == 1) & (x.high_responsiveness == 0) & (x.high_activity == 0) 
     else ('addicted' if (x.high_outdegree == 1) & (x.high_responsiveness == 0) & (x.high_activity == 1)
     else 'other'))), axis=1)

Finally, we answer 4 items:

    1. Is there statistical difference on the activity level of hardcore and lazy users?
    2. Is there statistical difference on the activity level of addicted and shallow users?
    3. Descriptive statistics of the four groups.
    4.a. Are hardcore and addicted users more connected to successful creators than lazy and shallow?
    4.b. Are hardcore and addicted users more connected to successful creators than to unsuccessful creators?

## Desc. Statistics

Let's start with 3, since if provide the figures we are going to use to answer to other items:

### 3. Descriptive statistics of the four groups.

In [44]:
table_data.groupby('Type', as_index = False).activity.describe().unstack(1)

Unnamed: 0,0,1,2,3,4
Type,addicted,hardcore,lazy,other,shallow
count,1093.0,63.0,2940.0,202401.0,133450.0
mean,15.246112,7.634921,0.160204,0.060751,0.052716
std,125.191943,5.261858,0.524183,1.698298,0.303226
min,4.0,4.0,0.0,0.0,0.0
25%,4.0,4.0,0.0,0.0,0.0
50%,6.0,6.0,0.0,0.0,0.0
75%,10.0,9.0,0.0,0.0,0.0
max,4025.0,30.0,3.0,623.0,3.0


### 1. Is there statistical difference on the activity level of hardcore and lazy users?

In [45]:
import scipy

In [46]:
group1 = table_data[table_data['Type'] == 'hardcore']
group2 = table_data[table_data['Type'] == 'lazy']

#t, p =
scipy.stats.ttest_ind(group1.activity, group2.activity, equal_var = False)

Ttest_indResult(statistic=11.274043837398452, pvalue=1.1771457291033545e-16)

Yes, there is the activity level of hardcore and lazy users is statistically different.

### 2. Is there statistical difference on the activity level of addicted and shallow users?

In [47]:
group1 = table_data[table_data['Type'] == 'addicted']
group2 = table_data[table_data['Type'] == 'shallow']

#t, p =
scipy.stats.ttest_ind(group1.activity, group2.activity, equal_var = False)

Ttest_indResult(statistic=4.012254982280351, pvalue=6.423952914982856e-05)

Yes, there is the activity level of addicted and shallow users is statistically different.

### 4.a. Are hardcore and addicted fans more connected to successful creators than lazy and shallow?

Let's add two columns to the `actions_sent_non_fans` dataset, the fan type, containing the user classification (*lazy, hardore, shallow, addicted*) and the content creator classification (*successful/unsuccesful*).

In [48]:
## classify users

hardcore_ids = table_data.loc[table_data.Type =='hardcore'].fan_id.unique()
lazy_ids = table_data.loc[table_data.Type =='lazy'].fan_id.unique()
shallow_ids = table_data.loc[table_data.Type =='shallow'].fan_id.unique()
addicted_ids = table_data.loc[table_data.Type =='addicted'].fan_id.unique()


actions_sent_non_fans['user_type'] = actions_sent_non_fans.fan_id.apply(lambda x: 'hardcore' if x in hardcore_ids else 
                          ('lazy' if x in lazy_ids else
                          ('shallow' if x in shallow_ids else
                          ('addicted' if x in addicted_ids else 'other'))))

## classify content creators
actions_sent_non_fans['creator_type'] = actions_sent_non_fans.user_id.apply(
                                            lambda x: 'successful' if x in successful_ids else 
                                            ('unsuccessful' if x in unsuccessful_ids else 'other'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actions_sent_non_fans['user_type'] = actions_sent_non_fans.fan_id.apply(lambda x: 'hardcore' if x in hardcore_ids else
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actions_sent_non_fans['creator_type'] = actions_sent_non_fans.user_id.apply(


We now calculate the number of unique users targeted by creators that fall within each user category:

In [49]:
mask = (actions_sent_non_fans.creator_type != 'other') #& (actions_sent_non_fans.user_type != 'other')
distribution_target_user_type = actions_sent_non_fans.loc[mask].groupby(['creator_type', 'user_type']).fan_id.nunique()
dist_target_user = distribution_target_user_type.to_frame().reset_index()

## Odds Ratio Analysis

### 4.a. Are hardcore and addicted users more connected to successful creators than lazy and shallow?

We want to measure if successful creators are skilled in identifying *addicted* and *hardcore* users. Unfortunatelly,  we cannot use a probability measure to do the comparison, since it is sharply affected by group size variations. One measure that sidestep this issue is the odds ratio, a statistic that is better suited to compare groups of different sizes because it takes the number of events and non events in each group into account. 

To calculate the odds ratio of for a given user type, say *hardcore*, we need the probability that a *hardcore* user is targeted by a successful creator. That is obtained by the following ratio: `p_hardcore_users` = `total unique users of type hardcore target by successful creators`/`total unique users of type hardcore`. 

We then use it to calculate: `odds_hardcore_user` = `p_hardcore_users`/(1-`p_hardcore_users`)

Finally, let's say we want to compare `hardcore` and `lazy` users. We would then use the meetric `odds_ratio` = `odds_hardcore_user`/`odds_lazy_user`

### Hardcore vs Lazy

In [50]:
from scipy.stats import chi2_contingency

In [51]:
table = np.array([[20, 546], [len(hardcore_ids)-20, len(lazy_ids)-546]])

# calculate the odds ratio by taking the ratio of the odds of the event occurring in the two groups
odds_ratio = (20/(len(hardcore_ids)-20)/(546/(len(lazy_ids)-546)))
print("Odds Ratio:", round(odds_ratio,4))

# perform a chi-square test to determine whether the observed odds ratio is statistically significant
chi2, p_value, _, _ = chi2_contingency(table)
print("Chi-Square Statistic:", round(chi2,4))
print("P-Value:", round(p_value,4))

Odds Ratio: 2.0394
Chi-Square Statistic: 6.1643
P-Value: 0.013


Successful creators show better odds of connecting to *hardcore* than to *lazy* users.

### Addicted vs Shallow

In [52]:
table = np.array([[798, 55614], [len(addicted_ids)-798, len(shallow_ids)-55614]])

# calculate the odds ratio by taking the ratio of the odds of the event occurring in the two groups
odds_ratio = (798/(len(addicted_ids)-798)/(55614/(len(shallow_ids)-55614)))
print("Odds Ratio:", round(odds_ratio,4))

# perform a chi-square test to determine whether the observed odds ratio is statistically significant
chi2, p_value, _, _ = chi2_contingency(table)
print("Chi-Square Statistic:", round(chi2,4))
print("P-Value:", round(p_value,4))

Odds Ratio: 3.786
Chi-Square Statistic: 435.9268
P-Value: 0.0


Successful creators show better odds of connecting to *addicted* than to *shallow* users.

### 4.b. Are hardcore and addicted users more connected to successful creators than to unsuccessful creators?

### Addicted Users

In [53]:
table = np.array([[798, 48], [len(addicted_ids)-798, len(addicted_ids)-48]])

# calculate the odds ratio by taking the ratio of the odds of the event occurring in the two groups
odds_ratio = (798/(len(addicted_ids)-798)/(48/(len(addicted_ids)-48)))
print("Odds Ratio:", round(odds_ratio,4))

# perform a chi-square test to determine whether the observed odds ratio is statistically significant
chi2, p_value, _, _ = chi2_contingency(table)
print("Chi-Square Statistic:", round(chi2,4))
print("P-Value:", round(p_value,4))

Odds Ratio: 58.8919
Chi-Square Statistic: 1081.7792
P-Value: 0.0


*Harcore* users are more connected to successful creators than to unsuccessful creators.

### Hardcore Users

In [54]:
table = np.array([[20,6], [len(hardcore_ids)-20, len(hardcore_ids)-6]])

# calculate the odds ratio by taking the ratio of the odds of the event occurring in the two groups
odds_ratio = (20/(len(hardcore_ids)-20)/(6/(len(hardcore_ids)-6)))
print("Odds Ratio:", round(odds_ratio,4))

# perform a chi-square test to determine whether the observed odds ratio is statistically significant
chi2, p_value, _, _ = chi2_contingency(table)
print("Chi-Square Statistic:", round(chi2,4))
print("P-Value:", round(p_value,4))

Odds Ratio: 4.4186
Chi-Square Statistic: 8.19
P-Value: 0.0042


*Addicted* users are more connected to successful creators than to unsuccessful creators

In [55]:
test = outdegree_level.merge(responsiveness_level, left_index = True, right_on = 'fan_id', how = 'inner').merge(activity_level, left_on = 'fan_id', right_index = True, how = 'inner')

In [56]:
table_data.loc[table_data.Type == 'hardcore'].sort_values('entered_platform')

Unnamed: 0,size,entered_platform,type,high_outdegree,fan_id,total_rewards,total_actions_received,responsiveness_level,high_responsiveness,activity,high_activity,Type
36521,39.0,2010-07-30 05:55:49,1.0,0,1420383,1.0,1,1.000000,1,4.0,1,hardcore
73197,48.0,2011-04-17 08:46:39,1.0,0,4229778,1.0,1,1.000000,1,8.0,1,hardcore
91638,5.0,2011-07-12 09:07:09,1.0,0,5913127,1.0,1,1.000000,1,4.0,1,hardcore
92057,30.0,2011-07-14 00:23:37,1.0,0,5951590,1.0,1,1.000000,1,6.0,1,hardcore
94060,13.0,2011-07-22 15:27:09,1.0,0,6145359,2.0,2,1.000000,1,4.0,1,hardcore
...,...,...,...,...,...,...,...,...,...,...,...,...
417878,34.0,2014-11-08 07:21:35,1.0,0,121912221,1.0,3,0.333333,1,6.0,1,hardcore
419716,5.0,2014-12-19 19:01:15,1.0,0,128561534,1.0,1,1.000000,1,4.0,1,hardcore
420441,25.0,2015-01-07 23:08:08,1.0,0,131861993,1.0,1,1.000000,1,12.0,1,hardcore
422708,24.0,2015-03-19 15:20:30,1.0,0,144029054,2.0,2,1.000000,1,4.0,1,hardcore
