# Account-Level Sampler

__Author__: Cody Buntain

__Date__: 18 August 2020

__Purpose__: This notebook will extract a sample of messages from each account in a given dataset's CSV file.

__Input__: A CSV file containing posts and accounts, the fraction of messages to pull from each account (`sample_ratio`, e.g. 0.3 for 30%), and a date range in which we should look.

__Output__: A new CSV file with the sampled data.

In [1]:
import datetime
import pytz
import pandas as pd

In [2]:
# Fraction of each account's posts to keep (0.3 == 30%).
sample_ratio = 0.3

# UTC bounds of the sampling window. Both values are midnight (00:00) on the
# given day, so date_end is the very start of 20 May 2020.
date_start = datetime.datetime(2020, 3, 11, tzinfo=pytz.UTC)
date_end = datetime.datetime(2020, 5, 20, tzinfo=pytz.UTC)

In [3]:
# Load the full post table. "TimestampPosted" is parsed as a date on read, and
# "PlatformPostID" is forced to str (presumably so long numeric IDs are not
# coerced to numbers -- confirm against the data).
df = pd.read_csv(
    "all_structure_posts.csv",
    parse_dates=["TimestampPosted"],
    dtype={"PlatformPostID": str},
)

In [4]:
# Make sure we have a timezone across all data
#. Necessary because of differences in FB's and Twitter's timestamp formats
def apply_tz(ts):
    """Return ``ts`` with a timezone attached.

    A timestamp that already carries ``tzinfo`` is returned unchanged; a naive
    timestamp is localized to UTC.
    """
    # Already timezone-aware: nothing to do.
    if ts.tzinfo is not None:
        return ts

    # Naive timestamp: label it as UTC.
    return pytz.UTC.localize(ts)

# Take the parsed timestamp column, run every value through apply_tz, and write
# the result back over the same column so each row carries a timezone.
dt = df["TimestampPosted"]
df["TimestampPosted"] = dt.apply(apply_tz)


In [5]:
# Keep only posts whose timestamp lies in [date_start, date_end]; both ends are inclusive.
in_window = df["TimestampPosted"].between(date_start, date_end)
relevant_df = df[in_window]

In [6]:
def sample_platform(this_df, sample_rate):
    """Draw a per-account random sample of posts.

    Rows are grouped by "AccountPlatformId" and each account is sampled
    independently, without replacement, at ``sample_rate``.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Posts for one platform; must contain an "AccountPlatformId" column.
    sample_rate : float
        Fraction of each account's rows to keep (e.g. 0.3 for 30%).

    Returns
    -------
    pandas.DataFrame
        The sampled rows of every account, concatenated in account order.
        An empty frame with the same columns is returned when there is
        nothing to sample.

    Notes
    -----
    pandas rounds ``sample_rate * group size`` to a whole number of rows, so
    accounts with very few posts can contribute zero rows.
    """
    # Use the sample_rate argument, not a notebook-level global, so the
    # function samples at whatever rate the caller asks for.
    sampled_frames = [
        group_df.sample(frac=sample_rate, replace=False)
        for _, group_df in this_df.groupby("AccountPlatformId")
    ]

    # pd.concat raises ValueError on an empty list; with no groups the
    # natural result is simply "no rows".
    if not sampled_frames:
        return this_df.iloc[0:0].copy()

    return pd.concat(sampled_frames)

# Facebook: every post in the window is eligible for sampling.
is_facebook = relevant_df["Platform"] == "Facebook"
relevant_df_fb = relevant_df[is_facebook]

# Twitter: we only care about tweets, not retweets or replies.
is_twitter = relevant_df["Platform"] == "Twitter"
is_tweet = relevant_df["PostType"] == "tweet"
relevant_df_tw = relevant_df[is_twitter & is_tweet]

# Sample each platform separately, Facebook first and then Twitter.
relevant_sample_df_fb = sample_platform(this_df=relevant_df_fb, sample_rate=sample_ratio)
relevant_sample_df_tw = sample_platform(this_df=relevant_df_tw, sample_rate=sample_ratio)


In [7]:
# Stack the two per-platform samples (Facebook rows first, then Twitter) and
# write them out without the DataFrame index column.
platform_samples = [relevant_sample_df_fb, relevant_sample_df_tw]
sampled_df = pd.concat(platform_samples)
sampled_df.to_csv("sampled_structure_posts.csv", index=False)

## Diagnostics

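Check to make sure we hit the sample rate for each platform and for each account. Any account whose sampled share is not strictly within 0.05 of `sample_ratio` is listed below; accounts with only a handful of posts show up here because the number of sampled posts has to be a whole number.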

In [8]:
# An account is reported when its sampled share is at or beyond
# sample_ratio -/+ tolerance (both bounds are inclusive).
tolerance = 0.05
lower_bound = sample_ratio - tolerance
upper_bound = sample_ratio + tolerance

for platform in ["Facebook", "Twitter"]:
    print(platform)

    this_full_df = relevant_df[relevant_df["Platform"] == platform]

    # Special filtering for tweet post types, so the denominator only counts
    # posts with PostType == "tweet"
    if platform == "Twitter":
        this_full_df = this_full_df[this_full_df["PostType"] == "tweet"]

    this_samp_df = sampled_df[sampled_df["Platform"] == platform]

    full_count = this_full_df.shape[0]
    samp_count = this_samp_df.shape[0]

    # A platform with no eligible posts has no meaningful ratio; report it and
    # move on instead of dividing by zero.
    if full_count == 0:
        print(full_count, "vs", samp_count, "n/a")
        print("-" * 20)
        continue

    # Calculate the overall ratio
    ratio = samp_count / full_count
    print(full_count, "vs", samp_count, ratio)

    # Count sampled posts per account once, rather than re-filtering the whole
    # sample for every account.
    sampled_per_account = this_samp_df["AccountPlatformId"].value_counts().to_dict()

    # For each account, check to make sure we get ~sample_ratio posts
    for account, group in this_full_df.groupby("AccountPlatformId"):
        sampled_count = int(sampled_per_account.get(account, 0))
        account_total = group.shape[0]
        account_ratio = sampled_count / account_total

        if account_ratio <= lower_bound or account_ratio >= upper_bound:
            print(account, sampled_count, account_total, account_ratio)

    print("-" * 20)

Facebook
3312 vs 992 0.2995169082125604
EsteeLauder 1 4 0.25
Honeywell 2 5 0.4
MichelinUSA 2 5 0.4
RalphLauren 1 4 0.25
SamsungUS 1 2 0.5
VisaUnitedStates 1 2 0.5
britishairways 2 8 0.25
campbells 2 8 0.25
cathaypacific 0 1 0.0
--------------------
Twitter
6514 vs 1957 0.3004298434141848
InterConHotels 1 2 0.5
Nike 1 4 0.25
Tesla 2 5 0.4
Visa 1 2 0.5
adidasUS 1 4 0.25
--------------------
