A one-line command to copy the scraped data from the Raspberry Pi to the local machine

In [None]:
# scp brk@192.168.8.122:/home/brk/projects/project1/data/tiktoks.jsonlines ../data/tiktoks.jsonlines

## Imports and read in data

In [None]:
import pandas as pd
# Don't truncate long URLs:
pd.set_option('display.max_colwidth', None)
import numpy as np
from datetime import timedelta
import datetime
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter

# The file holds one JSON record per line (JSON Lines), hence `lines=True`
df = pd.read_json("../data/tiktoks.jsonlines", lines=True)

Add some useful columns

In [None]:
# There's some erroneous data before 2022-07-29 -> exclude it.
# `.copy()` makes the filtered frame independent of the raw one, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df.scraped_at > '2022-07-29'].copy()

# Include a column that only has the hour at which the tiktok was scraped.
# Flooring also removes the sub-second part, which subtracting only the minutes
# and seconds would leave behind.
df['scraped_dt'] = df['scraped_at'].dt.floor('h')

# Rename the 'user*' columns to 'creator*'
df = df.rename(columns={
    'username': 'creator',
    'user_followers': 'creator_followers',
    'user_likes': 'creator_likes',
})

# Keep the full URL in its own column and strip the site prefix from the name
# (dots are escaped so they only match a literal '.')
df['creator_url'] = df['creator']
df['creator'] = df['creator'].replace(to_replace=r'^https://www\.tiktok\.com/', value='', regex=True)

# Get rid of TikTok's buffer-overflow bug: negative like counts are shifted
# back up by 2**32
df['creator_likes'] = df['creator_likes'].where(
    df['creator_likes'] >= 0,
    df['creator_likes'] + 2**32,
)

# Create a useful metric: like_comment_ratio.
# A video without comments gets NaN instead of +inf, so it cannot dominate
# max()-based rankings of this column.
df['like_comment_ratio'] = df['likes'] / df['comments'].replace(0, np.nan)

# Create various rate variables iff we see the same video more than once.
# Time since the previous observation of the same video. `groupby().diff()`
# keeps the original row index, so the result aligns with `df` on assignment.
time_diff = df.sort_values('scraped_at').groupby('url')['scraped_at'].diff()
# Calculate the change in hours, as a decimal (NaN for a video's first sighting)
df['hours_diff'] = time_diff.dt.total_seconds() / 3600
# Make a little helper function to calculate the frequency of a given column per hour
def calc_x_per_hour(x, df):
    """Add an `<x>_per_hour` column holding the hourly rate of change of `x`.

    Parameters
    ----------
    x : str
        Name of the numeric column to differentiate.
    df : pandas.DataFrame
        Must contain the columns 'url', 'scraped_at', 'hours_diff' and `x`.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the new column `f'{x}_per_hour'` added.
    """
    # Change in `x` between consecutive observations of the same video.
    # `groupby().diff()` keeps the original row index, so the result lines up
    # with `df` below regardless of the pandas version.
    diff = df.sort_values('scraped_at').groupby('url')[x].diff()
    # If a video is at 12.3M, we'll only next get shown a change when it increases to 12.4M
    # Therefore replace all 0 differences with NaNs so we don't plot loads of zeros as a video
    # increases in popularity
    diff = diff.replace(0, np.nan)
    # Calculate the number of `x` per hour
    df[f'{x}_per_hour'] = diff / df['hours_diff']
    return df

# Derive an hourly rate for every counter we track
for counter in ('likes', 'comments', 'creator_followers', 'creator_likes'):
    df = calc_x_per_hour(counter, df)

# Keep only the columns used further down, in a fixed order
kept_columns = [
    'scraped_at',
    'scraped_dt',
    'hours_diff',
    'url',
    'audio',
    'likes',
    'likes_per_hour',
    'comments',
    'comments_per_hour',
    'like_comment_ratio',
    'creator',
    'creator_url',
    'creator_followers',
    'creator_followers_per_hour',
    'creator_likes',
    'creator_likes_per_hour',
    'audio_url',
]
df = df[kept_columns]

Get some cumulative statistics for the audios

In [None]:
# Get a copy of the df with just the columns we need
audio_cumul_count = df[['scraped_dt', 'scraped_at', 'url', 'audio', 'audio_url','creator', 'creator_url', 'likes', 'comments']].copy()
# Create a groupby on the audio, with rows in chronological order. Sorting on
# the exact timestamp (rather than the hour bucket `scraped_dt`) also fixes the
# order of observations made within the same hour.
# `group_keys=False` keeps the original row index on the result of
# `gb.apply(...)`, so that result can be assigned back to `df` by index.
gb = audio_cumul_count.sort_values('scraped_at').groupby('audio_url', group_keys=False)
# Get the cumulative count of how often a given audio was used by any video

def apply_df(df):
    """Compute running usage statistics for the observations of one audio.

    Rows are processed in the order given, so the caller is expected to pass
    them in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations of a single audio with at least the columns 'audio',
        'url', 'scraped_at', 'likes' and 'comments'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (same index) with three added columns:
        - 'cumul_count': number of distinct videos seen so far,
        - 'cumul_likes': running total of likes over those videos,
        - 'cumul_comments': running total of comments over those videos.
        A video seen again only contributes the change since its most recent
        earlier observation.
    """
    # Progress indicator: first character of the audio's name
    # (slicing instead of indexing so an empty name cannot raise)
    print(str(df.audio.iloc[0])[:1], end='')

    # url -> (scraped_at, likes, comments) of the latest observation so far.
    # A dict lookup replaces re-filtering an ever-growing DataFrame per row.
    latest = {}
    n_videos = 0
    total_likes = 0
    total_comments = 0
    cumul_count = []
    cumul_likes = []
    cumul_comments = []

    rows = df[['url', 'scraped_at', 'likes', 'comments']].itertuples(index=False, name=None)
    for url, scraped_at, likes, comments in rows:
        previous = latest.get(url)
        if previous is None:
            # First sighting of this video: it counts in full
            n_videos += 1
            total_likes += likes
            total_comments += comments
        else:
            # Repeat sighting: only the change since last time counts
            total_likes += likes - previous[1]
            total_comments += comments - previous[2]

        # Remember the most recent observation (by timestamp) of this video
        if previous is None or scraped_at >= previous[0]:
            latest[url] = (scraped_at, likes, comments)

        cumul_count.append(n_videos)
        cumul_likes.append(total_likes)
        cumul_comments.append(total_comments)

    return df.assign(
        cumul_count=cumul_count,
        cumul_likes=cumul_likes,
        cumul_comments=cumul_comments,
    )

# Columns produced by apply_df that get copied back onto the main frame
cumul_cols = ['cumul_count', 'cumul_likes', 'cumul_comments']

print("Calculating cumulative statistics. This may take a while.")
per_audio_stats = gb.apply(apply_df)
df[cumul_cols] = per_audio_stats[cumul_cols]
print('\ndone.')


Print out some statistics

In [None]:
# Calendar days with at least one observation, in chronological order
unique_days = df.sort_values('scraped_at')['scraped_at'].dt.date.unique()
uniq_days_str = ''.join(f'\n- {day}' for day in unique_days)

# Creator of the row with the highest follower count
most_followed = df.sort_values('creator_followers', ascending=False)['creator'].iloc[0]

print(f"""
Number of unique TikToks:  {df['url'].nunique(dropna=False)}
Number of unique audios:   {df['audio_url'].nunique(dropna=False)}
Number of unique creators: {df['creator'].nunique(dropna=False)}
Most popular audio: {df['audio_url'].value_counts().index[0]}
Most followed creator: {most_followed}
Scraped data on the following days:{uniq_days_str}
""")

# Define common functions

In [None]:
def setup_ax(
    ax,
    title=None,
    ylabel=None,
    xlabel='Date',
    engineering=0,
    legend_cols=1,
    legend=True,
    start=None,
    end=None,
):
    """Apply the notebook's common formatting to a time-series axes.

    Parameters
    ----------
    ax : matplotlib axes to format.
    title, ylabel, xlabel : optional text; each is only set when not None.
    engineering : int or bool
        An int is used as the number of decimals for an `EngFormatter` on the
        y axis. `True` means 0 decimals. Any other value (e.g. `False` or
        `None`) leaves the y-axis formatter untouched.
    legend_cols : number of columns in the legend.
    legend : whether to draw the legend below the plot.
    start, end : bounds of the x ticks; a falsy value falls back to the first /
        last scrape date found in the global `df`.

    Returns
    -------
    The same `ax`, for chaining.
    """
    # `True` is shorthand for "engineering notation without decimals"
    if engineering is True:
        engineering = 0
    # Exact type check on purpose: `False` is a bool and must not count as 0
    if type(engineering) is int:
        formatter = EngFormatter(places=engineering)
        ax.yaxis.set_major_formatter(formatter)

    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if xlabel is not None:
        ax.set_xlabel(xlabel)

    # One tick per day between the two bounds (both included)
    first_day = start if start else df.scraped_dt.dt.date.min()
    last_day = end if end else df.scraped_dt.dt.date.max()
    ax.set_xticks(pd.date_range(start=first_day, end=last_day, inclusive='both'))
    ax.tick_params(axis='x', rotation=90)

    if legend:
        # Anchor the legend underneath the axes so it never covers the data
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.4), ncol=legend_cols)
    if title is not None:
        ax.set_title(title)
    return ax

In [None]:
def value_over_time(
    item,
    value,
    num_items=5,
    print_top=False,
    show_change=True
):
    """Line-plot `value` over time for the top `num_items` groups of `item`.

    Reads the global `df` and draws onto the current matplotlib axes.

    Parameters
    ----------
    item : str
        Column to group by (also used as the hue of the plot).
    value : str
        Numeric column to plot.
    num_items : int
        How many groups to show.
    print_top : bool
        Print the ranking used to select the groups.
    show_change : bool
        If True, rank groups by `max - min` of `value` and plot each group's
        value relative to its own minimum. If False, rank by `max` and plot
        the raw values.
    """
    per_item = df.groupby(item)[value]
    item_max = per_item.max()
    if show_change:
        item_mins = per_item.min()
        ranking = item_max - item_mins
    else:
        ranking = item_max
    # Get the top num_items
    topX = ranking.sort_values(ascending=False).head(num_items)
    if print_top:
        print(topX)

    # All observations that belong to one of the selected groups
    increasing = df[df[item].isin(topX.index)].copy()

    if show_change:
        # Subtract each group's minimum; a vectorised lookup instead of a
        # Python-level function call per row
        increasing[value] = increasing[value] - increasing[item].map(item_mins)

    ax = sns.lineplot(
        data=increasing,
        x='scraped_dt',
        y=value,
        hue=item,
        legend=True,
        hue_order=topX.index
    )

    label = "Change in" if show_change else "Total of"
    ax = setup_ax(
        ax,
        f'{label} {value} for the top {num_items} {item}s',
        f'{label} {value}',
    )

#     plt.show()

## TODO:
- correlation between:
    - followers and average video posting frequency

## Most prolific tiktokers
Look at those who posted the most TikToks

In [None]:
# Creator-level metrics to chart, one figure per metric.
# The per-hour variants ('creator_likes_per_hour',
# 'creator_followers_per_hour') are deliberately left out here.
features = ['creator_likes', 'creator_followers']


def feature_to_title(f):
    """Turn a column name into a display title, e.g. 'creator_likes' -> 'Likes'."""
    words = ' '.join(f.split('_'))
    without_prefix = words.replace('creator ', '')
    return without_prefix.title()

# Everything in this first part is identical for every feature, so it is
# computed once instead of once per figure.
ncols, nrows = 2, 3
num_creators = ncols * nrows
# First observation of every TikTok
no_dup_urls = df.sort_values('scraped_at', ascending=True).drop_duplicates('url', keep='first')
# Get the topX creators by number of (distinct) videos posted
top_creators = no_dup_urls.value_counts('creator').head(num_creators)
start = df.scraped_dt.dt.date.min()
end = df.scraped_dt.dt.date.max()

for feature in features:
    title = feature_to_title(feature)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5*ncols, 5*nrows))

    # zip() simply stops when there are fewer creators than subplots, where
    # positional indexing would raise an IndexError
    for ax, (creator, n_tiktoks) in zip(axs.flatten(), top_creators.items()):
        creator_rows = df[df['creator'] == creator]
        # Lineplot to show the `feature`
        sns.lineplot(
            data=creator_rows,
            x="scraped_at",
            y=feature,
            ax=ax
        )
        # Rugplot to show all times we saw a TikTok
        sns.rugplot(
            data=creator_rows,
            x="scraped_at",
            ax=ax,
        )
        # Rugplot to show the *first* time we saw a TikTok
        sns.rugplot(
            data=no_dup_urls[no_dup_urls['creator'] == creator],
            x="scraped_at",
            ax=ax,
            height=.05,
            lw=2
        )

        setup_ax(
            ax,
            xlabel=f'Date ({start} to {end})',
            ylabel=f'Total Creator {title}',
            title=f'{creator} {title} over time\n({n_tiktoks} TikToks total)',
            legend=False,
            engineering=1,
        )
        # The date range is in the x label, so individual ticks are hidden
        ax.set_xticks([])
    plt.tight_layout()
    plt.savefig(f'../report/img/most_prolific_tiktokers_{feature}.pdf')
    plt.show()
    # Release the figure; one is created per feature
    plt.close(fig)
    print('TikTokers creating the most content')

## Most Followed TikTokers
Look at those who have the largest absolute following

In [None]:
# Creator-level metrics to chart, one figure per metric.
# The per-hour variants ('creator_likes_per_hour',
# 'creator_followers_per_hour') are deliberately left out here.
features = ['creator_likes', 'creator_followers']


def feature_to_title(f):
    """Turn a column name into a display title, e.g. 'creator_likes' -> 'Likes'."""
    words = ' '.join(f.split('_'))
    without_prefix = words.replace('creator ', '')
    return without_prefix.title()

# Everything in this first part is identical for every feature, so it is
# computed once instead of once per figure.
ncols, nrows = 2, 3
num_creators = ncols * nrows
# First observation of every TikTok
no_dup_urls = df.sort_values('scraped_at', ascending=True).drop_duplicates('url', keep='first')
# Get the topX creators by their highest observed follower count
top_creators = df.groupby('creator')['creator_followers'] \
                .max() \
                .sort_values(ascending=False) \
                .head(num_creators)
start = df.scraped_dt.dt.date.min()
end = df.scraped_dt.dt.date.max()

for feature in features:
    title = feature_to_title(feature)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5*ncols, 5*nrows))

    # zip() simply stops when there are fewer creators than subplots, where
    # positional indexing would raise an IndexError
    for ax, creator in zip(axs.flatten(), top_creators.index):
        data = df[df['creator'] == creator]
        first_sightings = no_dup_urls[no_dup_urls['creator'] == creator]
        # A single observation cannot form a line, so draw it as a point
        sns_func = sns.scatterplot if len(data) <= 1 else sns.lineplot
        sns_func(
            data=data,
            x="scraped_at",
            y=feature,
            ax=ax
        )
        # Rugplot to show all times we saw a TikTok
        sns.rugplot(
            data=data,
            x="scraped_at",
            ax=ax,
        )
        # Rugplot to show the *first* time we saw a TikTok
        sns.rugplot(
            data=first_sightings,
            x="scraped_at",
            ax=ax,
            height=.05,
            lw=2
        )

        # Number of distinct TikToks seen from this creator. `top_creators`
        # holds follower counts here, so it must not be used for this figure.
        n_tiktoks = len(first_sightings)

        setup_ax(
            ax,
            xlabel=f'Date ({start} to {end})',
            ylabel=f'Total Creator {title}',
            title=f'{creator} {title} over time\n({n_tiktoks} TikToks total)',
            legend=False,
            engineering=1,
        )
        # The date range is in the x label, so individual ticks are hidden
        ax.set_xticks([])
    plt.tight_layout()
    plt.savefig(f'../report/img/most_followed_tiktokers_{feature}.pdf')
    plt.show()
    # Release the figure; one is created per feature
    plt.close(fig)
    print('TikTokers with the most followers')

## TikTokers with the greatest increase in followers

In [None]:
# Creator-level metrics to chart, one figure per metric
# (totals first, then their hourly rates)
features = [
    'creator_likes', 'creator_followers',
    'creator_likes_per_hour', 'creator_followers_per_hour',
]


def feature_to_title(f):
    """Turn a column name into a display title, e.g. 'creator_likes' -> 'Likes'."""
    words = ' '.join(f.split('_'))
    without_prefix = words.replace('creator ', '')
    return without_prefix.title()

# Everything in this first part is identical for every feature, so it is
# computed once instead of once per figure.
ncols, nrows = 2, 3
num_creators = ncols * nrows
# First observation of every TikTok
no_dup_urls = df.sort_values('scraped_at', ascending=True).drop_duplicates('url', keep='first')
# Increase in followers per creator over the whole observation period
followers_by_creator = df.groupby('creator')['creator_followers']
delta = followers_by_creator.max() - followers_by_creator.min()
# Get the top num_items
top_creators = delta.sort_values(ascending=False).head(num_creators)
start = df.scraped_dt.dt.date.min()
end = df.scraped_dt.dt.date.max()

for feature in features:
    title = feature_to_title(feature)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5*ncols, 5*nrows))

    # zip() simply stops when there are fewer creators than subplots, where
    # positional indexing would raise an IndexError
    for ax, (creator, follower_gain) in zip(axs.flatten(), top_creators.items()):
        data = df[df['creator'] == creator]
        # A single observation cannot form a line, so draw it as a point
        sns_func = sns.scatterplot if len(data) <= 1 else sns.lineplot
        sns_func(
            data=data,
            x="scraped_at",
            y=feature,
            ax=ax,
        )

        # Pad the y range by 1% of each bound. Using abs() keeps the padding
        # pointing outwards for negative values, and the finite check skips
        # creators whose `feature` is all-NaN (matplotlib rejects NaN limits).
        lowest = data[feature].min()
        highest = data[feature].max()
        if np.isfinite(lowest) and np.isfinite(highest):
            ax.set(ylim=(
                lowest - abs(lowest) * 0.01,
                highest + abs(highest) * 0.01,
            ))

        # Rugplot to show *every* time we saw a TikTok
        sns.rugplot(
            data=data,
            x="scraped_at",
            ax=ax,
        )
        # Rugplot to show the *first* time we saw a TikTok
        sns.rugplot(
            data=no_dup_urls[no_dup_urls['creator'] == creator],
            x="scraped_at",
            ax=ax,
            height=.05,
            lw=2
        )

        setup_ax(
            ax,
            xlabel=f'Date ({start} to {end})',
            ylabel=f'Total Creator {title}',
            title=f'{creator} {title} over time\n(increase of {follower_gain} followers)',
            legend=False,
            engineering=1,
        )
        # The date range is in the x label, so individual ticks are hidden
        ax.set_xticks([])
    plt.tight_layout()
    plt.savefig(f'../report/img/highest_delta_followers_tiktokers_{feature}.pdf')
    plt.show()
    # Release the figure; one is created per feature
    plt.close(fig)
    print('TikTokers with the greatest increase in followers')

# TikToks with the greatest increase in likes

In [None]:
# Rank TikToks by how much their like count grew while we were watching
likes_by_url = df.groupby('url')['likes']
likes_gained = likes_by_url.max() - likes_by_url.min()
top_tiktoks = likes_gained.sort_values(ascending=False).head(10)
top_tiktoks

In [None]:
# Video-level metrics to chart, one figure per metric.
# The per-hour variants ('likes_per_hour', 'followers_per_hour') are
# deliberately left out here.
features = ['likes', 'comments']


def feature_to_title(f):
    """Turn a column name into a display title, e.g. 'creator_likes' -> 'Likes'."""
    words = ' '.join(f.split('_'))
    without_prefix = words.replace('creator ', '')
    return without_prefix.title()

# Everything in this first part is identical for every feature, so it is
# computed once instead of once per figure.
ncols, nrows = 2, 3
num_tiktoks = ncols * nrows
# TikToks ranked by how much their like count grew while we were watching
likes_by_url = df.groupby('url')['likes']
top_tiktoks = (likes_by_url.max() - likes_by_url.min()) \
                .sort_values(ascending=False) \
                .head(num_tiktoks)
start = df.scraped_dt.dt.date.min()
end = df.scraped_dt.dt.date.max()

for feature in features:
    title = feature_to_title(feature)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))

    # zip() simply stops when there are fewer TikToks than subplots, where
    # positional indexing would raise an IndexError
    for ax, (tiktok, n) in zip(axs.flatten(), top_tiktoks.items()):
        data = df[df['url'] == tiktok]
        # A single observation cannot form a line, so draw it as a point
        sns_func = sns.scatterplot if len(data) <= 1 else sns.lineplot
        sns_func(
            data=data,
            x="scraped_at",
            y=feature,
            ax=ax,
        )

        # Rugplot to show all times we saw this TikTok
        sns.rugplot(
            data=data,
            x="scraped_at",
            ax=ax,
        )

        tt_title = tiktok.replace('https://www.tiktok.com/', '')
        setup_ax(
            ax,
            xlabel=f'Date ({start} to {end})',
            ylabel=f'Total TikTok {title}',
            # `n` is the gain in likes used for the ranking, whatever the
            # plotted feature is, so the title says so explicitly
            title=f'{tt_title}\n {title} over time (increase of {n} likes)',
            start=start,
            end=end,
            legend=False,
            engineering=1,
        )
        # The date range is in the x label, so tick labels are hidden
        ax.set_xticklabels([])
    plt.tight_layout()
    plt.savefig(f'../report/img/most_likes_tiktoks_{feature}.pdf')
    plt.show()
    # Release the figure; one is created per feature
    plt.close(fig)
    print('TikToks with the greatest increase in likes')

# TikTok audios with the greatest cumulative likes

In [None]:
import warnings
# NOTE: this silences every warning from here on, not only those of this cell
warnings.filterwarnings("ignore")


ncols, nrows = 2, 3
num_audios = ncols * nrows

fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))

# Final (maximum) cumulative value per audio, largest first
cumul_count = df.groupby('audio_url')['cumul_count'].max().sort_values(ascending=False)
cumul_likes = df.groupby('audio_url')['cumul_likes'].max().sort_values(ascending=False)
cumul_comments = df.groupby('audio_url')['cumul_comments'].max().sort_values(ascending=False)

# top_avg_likes = ((cumul_likes / cumul_count) * (cumul_count > 1)).sort_values(ascending=False)
order_by = cumul_likes

# Identical for every subplot, so computed once outside the loop
no_dup_urls = df.sort_values('scraped_at', ascending=True).drop_duplicates('url', keep='first')
start = df.scraped_dt.dt.date.min()
end = df.scraped_dt.dt.date.max()

# zip() simply stops when there are fewer audios than subplots, where
# positional indexing would raise an IndexError
for ax, audio in zip(axs.flatten(), order_by.index):
    print(audio)
    data = df[df['audio_url'] == audio]
    # A single observation cannot form a line, so draw it as a point
    sns_func = sns.scatterplot if len(data) <= 1 else sns.lineplot
    sns_func(
        data=data,
        x="scraped_at",
        y='cumul_likes',
        ax=ax,
    )

    # Rugplot to show all times we saw a TikTok using this audio
    sns.rugplot(
        data=data,
        x="scraped_at",
        ax=ax,
        expand_margins=False,
    )

    # Rugplot to show the *first* time we saw a TikTok
    sns.rugplot(
        data=no_dup_urls[no_dup_urls['audio_url'] == audio],
        x="scraped_at",
        ax=ax,
#         lw=2,
        expand_margins=False,
    )

    audio_name = data['audio'].iloc[0]
    audio_url = audio.replace('https://www.tiktok.com/music/', '')

    setup_ax(
        ax,
        xlabel=f'Date ({start} to {end})',
        ylabel='Cumulative Likes',
        title=f'{audio_name}\n({audio_url})',
        start=start,
        end=end,
        legend=False,
        engineering=1,
    )
    # The date range is in the x label, so tick labels are hidden
    ax.set_xticklabels([])
plt.tight_layout()
plt.savefig('../report/img/cumul_likes_for_most_liked_audio.pdf')
plt.show()
# The ranking above is by cumulative likes, not by likes per video
print('Audios with the greatest cumulative likes')

In [None]:
# Growth in the number of distinct videos for the ten fastest-growing audios
value_over_time(item='audio_url', value='cumul_count', num_items=10, print_top=True)
plt.savefig(
    '../report/img/change_in_cumul_count_for_most_used_audio.pdf',
    bbox_inches='tight',
)
plt.show()


# Answer questions

## General:
- Total number of observations
- Number of unique creators
- Number of unique audios
- Number of unique tiktoks

In [None]:
# Row count, rounded down to whole thousands
f"~{len(df) // 1000}k total observations"

In [None]:
# Distinct creator names (a missing name counts as one value)
f"{df['creator'].nunique(dropna=False)} different creators"

In [None]:
# Distinct audio *names* (a missing name counts as one value).
# NOTE(review): the statistics printed earlier count `audio_url` instead;
# confirm which of the two is meant to identify an audio.
f"{df['audio'].nunique(dropna=False)} different audios"

In [None]:
# Distinct TikTok URLs (a missing URL counts as one value)
f"{df['url'].nunique(dropna=False)} different TikToks"

### Change in likes vs change in followers
- TODO: Show the change in followers only for the viral videos
- TODO: show percentage change as well as absolute change

In [None]:
# One figure per metric; the ranking is only printed for likes
for metric, show_ranking in (('likes', True), ('comments', False)):
    value_over_time('url', metric, num_items=10, print_top=show_ranking)
    plt.savefig(f'../report/img/change_in_{metric}_top_tiktoks.pdf', bbox_inches='tight')
    plt.show()


In [None]:
# Follower growth of the ten creators who gained the most followers
value_over_time(
    item='creator_url',
    value='creator_followers',
    num_items=10,
    print_top=False,
)
plt.savefig(
    '../report/img/change_in_followers_top_creators.pdf',
    bbox_inches='tight',
)
plt.show()

#### Greatest increase in follower count

In [None]:
# Each call draws on the current axes, so the figure has to be shown (and
# thereby finished) before the next call; otherwise follower and like curves
# end up on the same axes and the second title/labels overwrite the first.
value_over_time('creator_url', 'creator_followers', num_items=10)
plt.show()
value_over_time('creator_url', 'creator_likes', num_items=10)
plt.show()

#### Like:comment ratio over time

In [None]:
# Raw ratio (not the change) for the ten creators with the highest ratio
value_over_time(
    item='creator_url',
    value='like_comment_ratio',
    num_items=10,
    show_change=False,
)

#### Relation between comments and likes per url over time

In [None]:
def x_and_y_over_time(
    item,
    y,
    x,
    num_items=10,
    print_top=False,
    logx=False,
    logy=False
):
    """Scatter `y` against `x` for the `item`s whose `x` grew the most.

    Reads the global `df` and draws onto the current matplotlib axes.

    Parameters
    ----------
    item : str
        Column to group by (also used as the hue of the plot).
    y, x : str
        Numeric columns for the vertical and horizontal axis.
    num_items : int
        Number of groups to show, ranked by `max - min` of `x`.
    print_top : bool
        Print the ranking used to select the groups.
    logx, logy : bool
        Switch the respective axis to a logarithmic scale.
    """
    # Rank the groups by the spread of `x` and keep the top num_items
    x_by_item = df.groupby(item)[x]
    spread = x_by_item.max() - x_by_item.min()
    topX = spread.sort_values(ascending=False).head(num_items)
    if print_top:
        print(topX)

    # All observations that belong to one of the selected groups
    selected_rows = df[item].isin(topX.index)
    increasing = df[selected_rows].copy()

    ax = sns.scatterplot(
        data=increasing,
        x=x,
        y=y,
        hue=item,
        legend=True,
        hue_order=topX.index
    )

    # Engineering notation (k, M, ...) without decimals on both axes
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(EngFormatter(places=0))

    x_title = x.title()
    y_title = y.title()
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2))
    ax.set_title(f'{x_title} and {y_title} for {item.title()}s with greatest increase in `{x}`')
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    
# Likes against comments for the ten TikToks whose likes grew the most
x_and_y_over_time('url', y='comments', x='likes', num_items=10, print_top=False)
plt.savefig(
    '../report/img/likes_vs_comments.pdf',
    bbox_inches='tight',
)
plt.show()

In [None]:
# Same comparison per creator; both axes stay linear
# (pass logx=True / logy=True for logarithmic axes)
x_and_y_over_time('creator_url', y='comments', x='likes', num_items=10, print_top=False)

## Question: Does the like:comment ratio depend on the creator?

- Does the like-comment ratio depend on the creator?
- Do some creators _really_ like using particular audios?
- Does the number of videos posted per hour impact how often they are shown to the bot?
- Does the number of videos posted per hour impact how well they do?
- What's the correlation between post frequency and virality?

## Per audio
- How to spot trending audio?

- Number of unique tiktok-audio combinations over time

- Get cumulative {likes,comments,uses} of all audio over time

## Get cumulative metrics
- Cumulative number of likes/comments/unique videos per audio over time

### Visualise the cumulative values over time

In [None]:
def cumul_over_time(
    item,
    value,
    num_items=10,
    print_top=False,
):
    """Line-plot `value` over time for the most frequently observed `item`s.

    Reads the global `df` and draws onto the current matplotlib axes.

    Parameters
    ----------
    item : str
        Column whose most frequent values are selected (also the plot's hue).
    value : str
        Numeric column to plot on the y axis.
    num_items : int
        Number of `item` values to show.
    print_top : bool
        Print the observation counts used for the selection.
    """
    # value_counts() already returns the counts in descending order
    topX = df[item].value_counts().head(num_items)
    if print_top:
        print(topX)

    ax = sns.lineplot(
        data=df[df[item].isin(topX.index)],
        y=value,
        x='scraped_dt',
        hue=item,
        legend=True,
        hue_order=topX.index,
    )

    # Shared formatting helper: engineering notation with one decimal, daily
    # x ticks, rotated tick labels and the legend placed below the plot, the
    # same as every other time-series figure in this notebook
    setup_ax(
        ax,
        title=f'{value} for the top {num_items} most frequent {item}s',
        ylabel=f'{value}',
        engineering=1,
    )
    

### Cumulative count, likes, and comments over time

In [None]:
# One figure per cumulative metric, each saved under its own name
for metric in ('cumul_count', 'cumul_likes', 'cumul_comments'):
    cumul_over_time('audio', metric)
    plt.savefig(f'../report/img/{metric}_audio.pdf', bbox_inches='tight')
    plt.show()

#### Line plot of the maximum likes per audio over time

In [None]:
# Mean and maximum likes of all videos sharing an audio within the same hour.
# Aggregations are named by string, which uses pandas' own implementation and
# avoids passing NumPy callables to transform().
likes_per_hour_and_audio = df.groupby(['scraped_dt', 'audio'])['likes']
df['likes_avg'] = likes_per_hour_and_audio.transform('mean')
df['likes_max'] = likes_per_hour_and_audio.transform('max')

# The ten audios with the most observations
topX = df['audio'].value_counts().head(10)

ax = sns.lineplot(
    data=df[df['audio'].isin(topX.index)],
    x='scraped_dt',
    y='likes_max',
    hue='audio',
    hue_order=topX.index,
    legend=True,
    # NOTE(review): `size` is a seaborn semantic variable, not a line width;
    # if a thinner line was intended this should probably be `linewidth=1` - confirm
    size=1
)

# Shared formatting helper: engineering notation without decimals, daily
# x ticks, rotated tick labels and the legend placed below the plot, the
# same as every other time-series figure in this notebook
setup_ax(
    ax,
    title='Number of likes per audio over time',
    ylabel='Likes',
    xlabel='Date',
    engineering=0,
)

plt.show()
