In [1]:
# Mount Google Drive
# Makes the Drive contents available under /content/drive; the data paths
# used when loading the SoundCloud files are rooted in that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Call Libraries
import scipy
import numpy as np
# `plt` must be the pyplot interface (plt.figure, plt.subplot, ...); the
# top-level `matplotlib` package does not expose those plotting functions.
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [4]:
import os

# Root folder (on the mounted Drive) that holds every input file
base_path = '/content/drive/MyDrive/SoundCloud'


def _load_stata(file_name):
    """Read one Stata file stored directly under ``base_path``."""
    return pd.read_stata(os.path.join(base_path, file_name))


# One DataFrame per sampled table
affiliations = _load_stata('12affiliations_sample.dta')
comments = _load_stata('12comments_sample.dta')
favoritings = _load_stata('12favoritings_sample.dta')
messages = _load_stata('12messages_sample.dta')
reposts = _load_stata('12reposts_sample.dta')

In [9]:
# Load the user table (one row per user with `user_id`, `type`, `created_at`).
# NOTE(review): the rendered "Unnamed: 0.1" / "Unnamed: 0" columns look like
# row indices written out by earlier CSV exports - confirm before relying on them.
users = pd.read_csv(os.path.join(base_path, 'user_ids.csv'))
users.head(20)

Unnamed: 0.1,Unnamed: 0,user_id,type,created_at
0,1,54849,creator,2009-01-01 00:47:08
1,2,54869,creator,2009-01-01 05:24:26
2,3,54882,creator,2009-01-01 09:28:54
3,4,54901,creator,2009-01-01 13:10:30
4,5,54912,creator,2009-01-01 14:20:57
5,6,54928,,2009-01-01 15:27:56
6,7,54955,,2009-01-01 17:25:34
7,8,54958,creator,2009-01-01 17:34:22
8,9,54969,,2009-01-01 18:15:14
9,10,54994,creator,2009-01-01 19:22:19


In [11]:
# Keep only the rows whose `type` is exactly 'creator' (missing types are excluded)
is_creator = users['type'].eq('creator')
creators_df = users.loc[is_creator].copy()
display(creators_df.head())

# Distinct creator IDs, passed to the `isin` filters in the functions below
creators = creators_df.loc[:, 'user_id'].unique()

Unnamed: 0.1,Unnamed: 0,user_id,type,created_at
0,1,54849,creator,2009-01-01 00:47:08
1,2,54869,creator,2009-01-01 05:24:26
2,3,54882,creator,2009-01-01 09:28:54
3,4,54901,creator,2009-01-01 13:10:30
4,5,54912,creator,2009-01-01 14:20:57


In [12]:
# Peek at Data
display(affiliations.head())
display(comments.head())
display(favoritings.head())
display(messages.head())
display(reposts.head())
display(creators_df.head())

Unnamed: 0,fan_id,contact_id,created_at
0,55250,45724,2009-01-02 15:14:42
1,55250,2629,2009-01-02 15:15:47
2,55250,4230,2009-01-02 15:15:56
3,55250,3082,2009-01-02 15:16:05
4,55250,3107,2009-01-02 15:16:13


Unnamed: 0,user_id,track_id,owner_id,created_at
0,56125,4912,274,2009-01-06 11:57:50
1,54181,116480,61390,2009-01-27 02:45:05
2,3983,125334,64864,2009-02-08 18:04:24
3,65739,140931,65739,2009-02-11 21:36:27
4,65739,140931,65739,2009-02-11 21:40:18


Unnamed: 0,user_id,track_id,owner_id,created_at
0,55806,98214,54084,2009-01-03 23:47:10
1,56588,100073,2153,2009-01-07 18:14:02
2,57524,103677,57276,2009-01-08 00:18:19
3,58048,58062,29763,2009-01-11 18:06:08
4,60584,55187,31543,2009-01-16 18:16:40


Unnamed: 0,sender_id,receiver_id,created_at
0,51272,55102,2009-01-02 22:43:33
1,54878,7915,2009-01-02 20:16:08
2,54878,7915,2009-01-04 21:58:29
3,54878,7915,2009-01-04 22:08:09
4,55013,52969,2009-01-03 03:35:42


Unnamed: 0,reposter_id,created_at,song_id,owner_id
0,194980,2012-06-07 11:52:27,48909076,80522
1,503949,2012-06-16 07:05:33,48718310,70917
2,194980,2012-06-17 16:01:50,49252926,80522
3,7564356,2012-07-02 21:44:03,7510652,84543
4,1762257,2012-07-04 16:22:40,43089879,61522


Unnamed: 0.1,Unnamed: 0,user_id,type,created_at
0,1,54849,creator,2009-01-01 00:47:08
1,2,54869,creator,2009-01-01 05:24:26
2,3,54882,creator,2009-01-01 09:28:54
3,4,54901,creator,2009-01-01 13:10:30
4,5,54912,creator,2009-01-01 14:20:57


#### **Define Parameters**
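- `THRESHOLD` = Follower count a creator must reach to count as having "crossed" (the crossing time is the timestamp of the `THRESHOLD`-th follower)
- `WINDOW_DAYS` = Length, in days, of the observation window that starts when the creator crosses the threshold
- `TOP_K` = Number 'k' of top partner creators compared per creator before vs. after crossing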

In [13]:
# Follower rank that defines "crossing": compute_tcross picks the timestamp of
# the THRESHOLD-th follow event per creator.
THRESHOLD = 100
# Length (days) of the post-crossing window used by intensity_pre_post.
WINDOW_DAYS = 180
# Number of top partners per creator compared pre vs post in tie_metrics.
TOP_K = 10

#### **Sorting Affiliations by timestamps**
- Clean the affiliations (follow) dataset.
- Keep only fan_id, contact_id, and created_at.
- Convert timestamps to datetime.
- Coerce IDs to numeric, remove missing values, sort by time, and drop duplicate follow edges.

**Output:** Time-ordered follower-following relationships.

In [14]:
def prep_affiliations(affiliations):
    """Return a cleaned, time-ordered copy of the follow (affiliation) edges.

    Only ``fan_id``, ``contact_id`` and ``created_at`` are kept. Timestamps
    are parsed (unparseable values become NaT), IDs are coerced to numeric,
    rows with any missing value are dropped, the rows are sorted by
    ``created_at`` and only the earliest row per (fan_id, contact_id) pair
    is retained. The input frame is not modified.
    """
    kept_cols = ["fan_id", "contact_id", "created_at"]
    edge_cols = ["fan_id", "contact_id"]

    follows = affiliations.loc[:, kept_cols].copy()
    follows["created_at"] = pd.to_datetime(follows["created_at"], errors="coerce")
    follows = follows.dropna(subset=kept_cols)

    # Coerce IDs only after the first dropna so the inferred dtypes match
    # what a column without missing values would get.
    for id_col in edge_cols:
        follows[id_col] = pd.to_numeric(follows[id_col], errors="coerce")
    follows = follows.dropna(subset=edge_cols)

    # Sorting first means keep="first" retains the earliest follow per pair.
    return (
        follows
        .sort_values("created_at")
        .drop_duplicates(subset=edge_cols, keep="first")
    )

# Show the cleaned follow edges (the result is only displayed, not stored).
display(prep_affiliations(affiliations))

Unnamed: 0,fan_id,contact_id,created_at
38480,54848,14730,2009-01-01 01:01:04
39220,54846,4610,2009-01-01 02:12:39
39221,54846,6281,2009-01-01 02:13:21
39222,54846,15386,2009-01-01 02:17:50
39223,54846,1214,2009-01-01 02:25:23
...,...,...,...
30493729,81303,83118585,2014-03-19 02:24:29
30673499,85286217,87783,2014-03-19 02:24:30
30673500,85286217,60663,2014-03-19 02:24:31
29822351,9348702,75620,2014-03-19 02:24:35


#### **Time-ordered list of creators**
- Computes when each creator reaches a follower threshold.
- Filters follow events to creators.
- Sorts follower arrivals chronologically.
- Assigns cumulative follower ranks per creator.
- Selects the timestamp when the rank equals the predefined threshold.

**Output:** Series mapping each creator to their threshold-crossing time.

In [15]:
def compute_tcross(a, creators, threshold=None):
    """Find the time at which each creator gains their ``threshold``-th follower.

    Parameters
    ----------
    a : pandas.DataFrame
        Follow events with columns ``fan_id``, ``contact_id`` and
        ``created_at``. Every row counts as one follower, so duplicate follow
        edges should be removed first (e.g. with ``prep_affiliations``),
        otherwise they inflate the rank.
    creators : array-like
        IDs of the creators to evaluate; rows whose ``contact_id`` is not in
        this collection are ignored.
    threshold : int, optional
        Follower rank that defines the crossing. Defaults to the
        notebook-level ``THRESHOLD`` constant when omitted.

    Returns
    -------
    pandas.Series
        Named ``"tcross"``, indexed by ``contact_id``, holding the
        ``created_at`` of the ``threshold``-th follow event. Creators with
        fewer than ``threshold`` follow rows are absent.
    """
    if threshold is None:
        # Resolved at call time so a changed notebook constant is picked up.
        threshold = THRESHOLD

    fe = a[a["contact_id"].isin(creators)]
    fe = fe.sort_values(["contact_id", "created_at"])

    # cumcount() is 0-based within each creator; +1 turns it into a
    # 1-based follower rank in chronological order.
    rank = fe.groupby("contact_id").cumcount() + 1

    tcross = fe[rank == threshold].set_index("contact_id")["created_at"]
    return tcross.rename("tcross")

# compute_tcross counts every follow row towards the threshold, so pass the
# cleaned, de-duplicated edges rather than the raw table (the partner-switching
# cell further down does the same).
display(compute_tcross(prep_affiliations(affiliations), creators))

Unnamed: 0_level_0,tcross
contact_id,Unnamed: 1_level_1
54846,2013-10-03 19:47:05
54847,2011-02-21 21:29:14
54861,2010-11-11 22:03:31
54866,2012-06-08 02:03:49
54869,2013-10-16 21:07:56
...,...
93702,2011-09-28 20:08:51
93703,2009-12-13 05:19:28
93705,2010-07-01 08:53:52
93712,2010-09-14 15:14:38


#### **Compute Reciprocal Ties**
- Identifies reciprocal (mutual) follow ties among creators.
- Extract directed follow edges
- Filter them to creator–creator relationships
- Perform a *self-merge* with reversed edges to retain only pairs where both users follow each other.

**Output:** Produce a clean set of reciprocal strong-tie candidates.

In [None]:
def reciprocal_ties(a, creators):
    # Extracts directed follow edges
    edges = a[["fan_id", "contact_id"]].copy()
    # Renames as a directed edge u → v
    edges = edges.rename(columns={"fan_id": "u", "contact_id": "v"})
    # Filter analysis to keep creator–creator relationships
    edges = edges[
        edges["u"].isin(creators) &
        edges["v"].isin(creators)
    ]

    # Strong ties: Match each edge with its reversed counterpart (v → u)
    recip = edges.merge(
        edges.rename(columns={"u": "v", "v": "u"}),
        on=["u", "v"],
        how="inner"
    )

    return recip.drop_duplicates()

# Show the reciprocal creator-creator edges (only fan_id/contact_id are read).
display(reciprocal_ties(affiliations, creators))

#### **Sort, clean, and merge comments and favoritings datasets**
- Convert timestamps, remove missing or invalid IDs.
- Excludes self-interactions.
- Rename columns to a consistent source–target format.
- Concatenate both datasets.

**Output:** Single time-ordered interaction table for tie-intensity analysis.

In [None]:
def build_interactions(comments, favoritings):
    """Stack comments and favoritings into one source -> target event table.

    Each input needs the columns ``user_id`` (actor), ``owner_id`` (owner of
    the track acted upon) and ``created_at``. Rows with unparseable
    timestamps, missing or non-numeric IDs, or where the actor is the owner
    are removed. The result has the columns ``source``, ``target`` and
    ``time`` and is sorted by ``source`` and then ``time``.
    """
    renamed = {"user_id": "source", "owner_id": "target", "created_at": "time"}
    id_cols = ["user_id", "owner_id"]

    def _tidy(events):
        tidy = events.loc[:, ["user_id", "owner_id", "created_at"]].copy()
        tidy["created_at"] = pd.to_datetime(tidy["created_at"], errors="coerce")
        tidy = tidy.dropna(subset=["user_id", "owner_id", "created_at"])

        # IDs are coerced only after the first dropna, then re-checked for
        # values that could not be converted.
        for id_col in id_cols:
            tidy[id_col] = pd.to_numeric(tidy[id_col], errors="coerce")
        tidy = tidy.dropna(subset=id_cols)

        # Remove self-interactions
        acts_on_other = tidy["user_id"] != tidy["owner_id"]
        return tidy[acts_on_other].rename(columns=renamed)

    stacked = pd.concat([_tidy(comments), _tidy(favoritings)], ignore_index=True)
    return stacked.sort_values(["source", "time"])

# Show the combined comment + favoriting events (displayed only, not stored).
display(build_interactions(comments, favoritings))

In [None]:
def intensity_pre_post(interactions, recip, tcross, window_days=None):
    """Count interactions along reciprocal ties before and after crossing.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Events with columns ``source``, ``target`` and ``time``.
    recip : pandas.DataFrame
        Reciprocal edges with columns ``u`` and ``v``.
    tcross : pandas.Series
        Threshold-crossing time per creator, indexed by creator ID.
    window_days : int or float, optional
        Length of the post-crossing window in days. Defaults to the
        notebook-level ``WINDOW_DAYS`` constant when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``u``, ``v``, ``intensity_pre`` and ``intensity_post``
        (integer counts). ``intensity_pre`` counts events from ``u`` to ``v``
        strictly before ``u``'s crossing time (no lower bound);
        ``intensity_post`` counts events from the crossing time up to and
        including ``window_days`` later. Only ties whose ``u`` crossed the
        threshold and that have at least one counted event are returned.
    """
    if window_days is None:
        # Resolved at call time so a changed notebook constant is picked up.
        window_days = WINDOW_DAYS

    # One row per creator that crossed. The index/values are renamed
    # explicitly so the merge does not depend on how the caller named them.
    crossing = tcross.rename("tcross").rename_axis("u").reset_index()
    # No `dayfirst`: the timestamps in this notebook are ISO-ordered
    # (YYYY-MM-DD) or already datetime64, so a day-first hint is at best a
    # no-op and at worst invites day/month swaps.
    crossing["tcross"] = pd.to_datetime(crossing["tcross"])
    crossing["post_end"] = crossing["tcross"] + pd.Timedelta(days=window_days)

    # The inner merge both attaches the crossing time and drops ties whose
    # `u` never crossed the threshold.
    ties = recip[["u", "v"]].merge(crossing, on="u", how="inner")

    # Keep only interactions that run along a reciprocal edge u -> v
    ev = interactions[["source", "target", "time"]].merge(
        ties,
        left_on=["source", "target"],
        right_on=["u", "v"],
        how="inner",
    )
    ev["time"] = pd.to_datetime(ev["time"])

    is_pre = ev["time"] < ev["tcross"]
    is_post = (ev["time"] >= ev["tcross"]) & (ev["time"] <= ev["post_end"])

    pre_counts = ev[is_pre].groupby(["u", "v"]).size().rename("intensity_pre")
    post_counts = ev[is_post].groupby(["u", "v"]).size().rename("intensity_post")

    # Ties seen in only one window get 0 for the other; cast back to int
    # because the NaN introduced by the outer alignment turns counts to float.
    out = (
        pd.concat([pre_counts, post_counts], axis=1)
        .fillna(0)
        .astype(int)
        .reset_index()
    )

    return out

# compute_tcross counts every follow row towards the threshold, so it gets the
# cleaned, de-duplicated edges; reciprocal_ties only reads the ID columns.
display(
    intensity_pre_post(
        build_interactions(comments, favoritings),
        reciprocal_ties(affiliations, creators),
        compute_tcross(prep_affiliations(affiliations), creators),
    )
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the intensity data.
# compute_tcross counts every follow row towards the threshold, so it is fed
# the cleaned, de-duplicated edges (as the partner-switching cell does).
interactions_df = build_interactions(comments, favoritings)
recip_df = reciprocal_ties(affiliations, creators)
tcross_df = compute_tcross(prep_affiliations(affiliations), creators)
intensity_data = intensity_pre_post(interactions_df, recip_df, tcross_df)

# Create the visualization on explicit axes (one figure, two panels)
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 6))

# Panel 1: Scatter plot of Pre vs Post intensity
sns.scatterplot(data=intensity_data, x='intensity_pre', y='intensity_post', alpha=0.6, ax=ax_scatter)
# The reference diagonal must span the larger of the two axes, otherwise it
# stops short whenever post intensity exceeds the largest pre intensity.
diag_max = intensity_data[['intensity_pre', 'intensity_post']].max().max()
ax_scatter.plot([0, diag_max], [0, diag_max], 'r--', label='Equal Intensity')
ax_scatter.set_title('Comparison: Pre vs Post Intensity per Tie')
ax_scatter.set_xlabel('Intensity (Pre-Crossing)')
ax_scatter.set_ylabel('Intensity (Post-Crossing)')
ax_scatter.legend()

# Panel 2: Distribution of changes
intensity_data['change'] = intensity_data['intensity_post'] - intensity_data['intensity_pre']
sns.histplot(intensity_data['change'], bins=30, kde=True, color='purple', ax=ax_hist)
ax_hist.axvline(0, color='red', linestyle='--')
ax_hist.set_title('Distribution of Intensity Change (Post - Pre)')
ax_hist.set_xlabel('Change in Number of Interactions')

fig.tight_layout()
plt.show()

In [None]:
def tie_metrics(intensity_uv, top_k=None):
    """Summarise, per creator, how their strongest ties change after crossing.

    Parameters
    ----------
    intensity_uv : pandas.DataFrame
        One row per tie with columns ``u`` (creator), ``v`` (partner),
        ``intensity_pre`` and ``intensity_post``.
    top_k : int, optional
        Number of strongest partners compared between the two windows.
        Defaults to the notebook-level ``TOP_K`` constant when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per creator with the columns ``creator_id``, ``n_partners``,
        ``topk_overlap``, ``spearman_rank_corr``, ``total_pre`` and
        ``total_post``. The columns are present even when the input is empty.

        ``topk_overlap`` is the share of the creator's top-k pre-crossing
        partners that are also among the top-k post-crossing partners; it is
        NaN when the creator has no pre-crossing partner. Only partners with
        a positive intensity in a window can enter that window's top-k.
    """
    if top_k is None:
        # Resolved at call time so a changed notebook constant is picked up.
        top_k = TOP_K

    def top_partners(group, column):
        # Partners with zero intensity are not "top" partners of that window.
        # Without this filter every creator with <= top_k ties would get
        # identical pre/post sets and an overlap of exactly 1.0.
        active = group[group[column] > 0]
        # Stable sort keeps the input order among ties, so the cut-off at
        # top_k is deterministic.
        ranked = active.sort_values(column, ascending=False, kind="stable")
        return set(ranked["v"].head(top_k))

    columns = [
        "creator_id",
        "n_partners",
        "topk_overlap",
        "spearman_rank_corr",
        "total_pre",
        "total_post",
    ]

    results = []
    for u, g in intensity_uv.groupby("u"):
        pre_top = top_partners(g, "intensity_pre")
        post_top = top_partners(g, "intensity_post")
        overlap = len(pre_top & post_top) / len(pre_top) if pre_top else np.nan

        # A rank correlation needs at least two ties to be defined.
        rho = np.nan
        if len(g) >= 2:
            rho = g["intensity_pre"].corr(g["intensity_post"], method="spearman")

        results.append({
            "creator_id": u,
            "n_partners": len(g),
            "topk_overlap": overlap,
            "spearman_rank_corr": rho,
            "total_pre": g["intensity_pre"].sum(),
            "total_post": g["intensity_post"].sum()
        })

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=columns)

# Aggregate by creator to see overall impact
metrics = tie_metrics(intensity_data)
top_creators = metrics.sort_values('total_post', ascending=False).head(10)

plt.figure(figsize=(12, 6))
top_melted = top_creators.melt(id_vars='creator_id', value_vars=['total_pre', 'total_post'],
                              var_name='Period', value_name='Total Interactions')

sns.barplot(data=top_melted, x='creator_id', y='Total Interactions', hue='Period')
plt.title('Total Interactions Pre vs Post for Top 10 Creators')
plt.xticks(rotation=45)
plt.show()

#### **Partner Switching**

In [None]:
def partner_switching_metrics(interactions, tcross, min_pre_partners=0, window_days=None):
    """Measure how much each creator's set of partners changes after crossing.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Events with columns ``source``, ``target`` and ``time``.
    tcross : pandas.Series
        Threshold-crossing time per creator, indexed by creator ID.
    min_pre_partners : int, default 0
        Creators with fewer distinct pre-crossing partners are skipped.
    window_days : int or float, optional
        When given, post-crossing events later than ``window_days`` days
        after the crossing are ignored. The default (``None``) keeps every
        post-crossing event. Pass ``WINDOW_DAYS`` to use the same post window
        as ``intensity_pre_post``.

    Returns
    -------
    pandas.DataFrame
        One row per retained creator with the columns ``creator_id``,
        ``n_pre_partners``, ``n_post_partners``, ``jaccard_similarity``
        (|pre & post| / |pre | post|) and ``new_partner_share``
        (|post - pre| / |post|, NaN when there is no post partner). The
        columns are present even when no creator qualifies. Creators that
        crossed but have no interaction rows at all do not appear.
    """
    columns = [
        "creator_id",
        "n_pre_partners",
        "n_post_partners",
        "jaccard_similarity",
        "new_partner_share",
    ]

    # One row per creator that crossed. Renaming explicitly means the merge
    # does not depend on how the caller named the index or the Series.
    crossing = tcross.rename("tcross").rename_axis("source").reset_index()
    # No `dayfirst`: the timestamps in this notebook are ISO-ordered
    # (YYYY-MM-DD) or already datetime64, so a day-first hint is at best a
    # no-op and at worst invites day/month swaps.
    crossing["tcross"] = pd.to_datetime(crossing["tcross"])

    # The inner merge attaches the crossing time and drops events whose
    # source never crossed the threshold.
    ev = interactions[["source", "target", "time"]].merge(
        crossing, on="source", how="inner"
    )
    ev["time"] = pd.to_datetime(ev["time"])

    # A missing timestamp compares False against tcross and would be
    # silently labelled "post"; such rows carry no usable timing.
    ev = ev.dropna(subset=["time", "tcross"])

    if window_days is not None:
        post_end = ev["tcross"] + pd.Timedelta(days=window_days)
        ev = ev[ev["time"] <= post_end]

    # Label each interaction as pre/post crossing.
    ev = ev.assign(window=np.where(ev["time"] < ev["tcross"], "pre", "post"))

    results = []

    for u, g in ev.groupby("source"):

        pre_partners = set(g.loc[g["window"] == "pre", "target"])
        post_partners = set(g.loc[g["window"] == "post", "target"])

        if len(pre_partners) < min_pre_partners:
            continue

        # All partners ever seen / partners seen in both windows
        union = pre_partners | post_partners
        intersection = pre_partners & post_partners

        # Stability: overlap divided by union
        jaccard = len(intersection) / len(union) if len(union) > 0 else np.nan

        # Switching: new post-only partners divided by post partners
        new_share = (
            len(post_partners - pre_partners) / len(post_partners)
            if len(post_partners) > 0 else np.nan
        )

        results.append({
            "creator_id": u,
            "n_pre_partners": len(pre_partners),
            "n_post_partners": len(post_partners),
            "jaccard_similarity": jaccard,
            "new_partner_share": new_share
        })

    # Passing `columns` keeps the schema stable for an empty result, so
    # downstream column access does not raise KeyError.
    return pd.DataFrame(results, columns=columns)

# compute_tcross counts every follow row towards the threshold, so it gets the
# cleaned, de-duplicated edges - the same input the next cell uses.
display(
    partner_switching_metrics(
        build_interactions(comments, favoritings),
        compute_tcross(prep_affiliations(affiliations), creators),
    )
)

In [None]:
# Partner-switching metrics on the cleaned follow edges, restricted to
# creators with at least three distinct pre-crossing partners.
switch_metrics = partner_switching_metrics(
    interactions=build_interactions(comments, favoritings),
    tcross=compute_tcross(prep_affiliations(affiliations), creators),
    min_pre_partners=3   # optional robustness filter
)

# Preview the first rows of the per-creator metrics
switch_metrics.head()

Plot interpretation:
1. **Jaccard Similarity** = $|\text{Pre} \cap \text{Post}| \,/\, |\text{Pre} \cup \text{Post}|$
    - 1 → same partners
    - 0 → completely different partners

After crossing the follower (n = 100) threshold:
  - Creators are interacting with largely different creators.
  - Partner switching is strong.
  - Status transition → reallocation of social attention

2. **New Partner Share** = $|\text{Post} \setminus \text{Pre}| \,/\, |\text{Post}|$
    - 1 → all post partners are new
    - 0 → no new partners

After crossing the follower (n = 100) threshold:
- Creators overwhelmingly interact with new creators.
- Post-threshold interactions are largely not with prior partners.
- This strongly supports a status-driven social expansion or redirection effect.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram (with KDE overlay) per partner-switching metric
for _column, _title in (
    ("jaccard_similarity", "Distribution of Partner Stability (Jaccard)"),
    ("new_partner_share", "Distribution of New Partner Share"),
):
    plt.figure(figsize=(6,4))
    sns.histplot(switch_metrics[_column], bins=30, kde=True)
    plt.title(_title)
    plt.show()

In [None]:
# Summary statistics of the number of distinct pre-crossing partners per creator
switch_metrics["n_pre_partners"].describe()