### Hypothesis: Accounts with a higher proportion of power users have better overall conversion rates.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Read the merged data from CSV (path is relative to the notebook's working directory).
merged_csv_path = "../data/merged_df.csv"
merged_df = pd.read_csv(merged_csv_path)

In [5]:
# Roughly a third of the data has no power-user count -> remove those rows.
# .copy() gives merged_df its own data, so the column assignments made to it in
# later cells operate on a real frame rather than on a filtered slice (which
# would raise SettingWithCopyWarning).
merged_df = merged_df[merged_df["# Power Users"].notnull()].copy()


In [None]:
# Calculate Power Proportion: power users divided by purchased seats.
# Zero seat counts are masked to NaN before dividing so the ratio becomes NaN
# instead of +/-inf, which would otherwise distort the regression fits and plots.
purchased_seats = merged_df["# Purchased Seats"]
merged_df["Power Proportion"] = merged_df["# Power Users"] / purchased_seats.where(purchased_seats != 0)

In [None]:
# Cleaning Conversion Rate: strip the trailing "%" and store the column as float.
# The "%" stripping only runs while the column is still text, so the cell can be
# re-run safely (the .str accessor raises on an already-numeric column).
conversion_col = "Conversion Rate Last 30 Days"
cleaned_series = merged_df[conversion_col]
if not pd.api.types.is_numeric_dtype(cleaned_series):
    cleaned_series = cleaned_series.str.replace('%', '', regex=False)
merged_df[conversion_col] = cleaned_series.astype(float)

In [None]:

# Row-level scatter of power-user count against conversion rate, with a fitted
# regression line, saved as a PNG under ../figs.
fig, ax = plt.subplots(figsize=(10, 6))

point_style = dict(alpha=0.7, s=100)  # semi-transparent, enlarged markers
trend_style = dict(color='red')       # regression line colour

# regplot draws the scatter and overlays the linear trend on the same axes.
sns.regplot(
    x="# Power Users",
    y="Conversion Rate Last 30 Days",
    data=merged_df,
    ax=ax,
    scatter_kws=point_style,
    line_kws=trend_style,
)

ax.set(
    title="Relationship between Power Users and Conversion Rate",
    xlabel="# Power Users",
    ylabel="Conversion Rate (%)",
)
fig.tight_layout()

# Write the figure to ../figs, creating the directory if it does not exist yet.
filename = "relationship_between_power_users_and_conversion_rate_scatter.png"
save_directory = os.path.join("..", "figs")
save_path = os.path.join(save_directory, filename)
os.makedirs(save_directory, exist_ok=True)

fig.savefig(save_path)

### Oops — this is because we are looking at the user level rather than the account level!

In [13]:
## Try again, this time looking at the average across accounts:

In [None]:
# Mean conversion rate per account, returned as a regular column layout
# ("Account ID" plus the averaged rate) via reset_index().
# The aggregation is given as the string "mean": passing the np.mean callable
# to groupby.agg is deprecated in recent pandas and emits a FutureWarning.
account_avg_conversion_rate_df = (
    merged_df
    .groupby("Account ID")
    .agg({"Conversion Rate Last 30 Days": "mean"})
    .reset_index()
)

In [50]:
# Attach each account's power-user columns to its average conversion rate.
# The right-hand side is reduced to the first row per account *before* merging,
# so the join stays one row per account instead of fanning out to every row and
# being de-duplicated afterwards. The merge result is also a fresh frame, so
# adding columns to it later does not raise SettingWithCopyWarning.
# NOTE(review): only the first row's "# Power Users" / "Power Proportion" is kept
# per account (same as before) — presumably these are constant within an account; confirm.
account_power_cols_df = (
    merged_df.loc[:, ["Account ID", "# Power Users", "Power Proportion"]]
    .drop_duplicates("Account ID")
)
account_power_df = account_avg_conversion_rate_df.merge(
    account_power_cols_df,
    on="Account ID",
    how="inner",
    validate="one_to_one",  # both sides hold at most one row per Account ID
)

In [None]:

# Account-level scatter of power-user count against the account's average
# conversion rate, with a fitted regression line, saved as a PNG under ../figs.
fig, ax = plt.subplots(figsize=(10, 6))

point_style = dict(alpha=0.7, s=100)  # semi-transparent, enlarged markers
trend_style = dict(color='red')       # regression line colour

# regplot draws the scatter and overlays the linear trend on the same axes.
sns.regplot(
    x="# Power Users",
    y="Conversion Rate Last 30 Days",
    data=account_power_df,
    ax=ax,
    scatter_kws=point_style,
    line_kws=trend_style,
)

ax.set(
    title="Relationship between Power Users and Account Avg Conversion Rate",
    xlabel="# Power Users",
    ylabel="Account Avg Conversion Rate (%)",
)
fig.tight_layout()

# Write the figure to ../figs, creating the directory if it does not exist yet.
filename = "relationship_between_power_users_and_account_avg_conversion_rate_scatter.png"
save_directory = os.path.join("..", "figs")
save_path = os.path.join(save_directory, filename)
os.makedirs(save_directory, exist_ok=True)

fig.savefig(save_path)

In [None]:

# Account-level scatter of power proportion against the account's average
# conversion rate, with a fitted regression line, saved as a PNG under ../figs.
fig, ax = plt.subplots(figsize=(10, 6))

point_style = dict(alpha=0.7, s=100)  # semi-transparent, enlarged markers
trend_style = dict(color='red')       # regression line colour

# regplot draws the scatter and overlays the linear trend on the same axes.
sns.regplot(
    x="Power Proportion",
    y="Conversion Rate Last 30 Days",
    data=account_power_df,
    ax=ax,
    scatter_kws=point_style,
    line_kws=trend_style,
)

ax.set(
    title="Relationship between Power Proportion and Account Avg Conversion Rate",
    xlabel="Power Proportion",
    ylabel="Account Avg Conversion Rate (%)",
)
fig.tight_layout()

# Write the figure to ../figs, creating the directory if it does not exist yet.
filename = "relationship_between_power_proportion_and_account_avg_conversion_rate_scatter.png"
save_directory = os.path.join("..", "figs")
save_path = os.path.join(save_directory, filename)
os.makedirs(save_directory, exist_ok=True)

fig.savefig(save_path)

In [60]:
### Let's try binning to see if we can find an optimal number of power users:

In [70]:
# Bucket labels and their edges. Edges sit on half-integers and intervals are
# right-closed, e.g. (0.5, 3.5] is labelled "1-3".
# NOTE(review): presumably "# Power Users" holds whole-number counts — confirm.
labels = ["0", "1-3", "4-8", "9-15", "16+"]
bins = [-0.5, 0.5, 3.5, 8.5, 15.5, np.inf]

# Categorise every account's power-user count, then store it as a new column.
power_user_bins = pd.cut(
    account_power_df["# Power Users"],
    bins=bins,
    labels=labels,
    include_lowest=True,
    right=True,
)
account_power_df["Power Users Bin"] = power_user_bins

In [None]:
# Bar plot of conversion rate per power-user bin, with the mean over all
# accounts drawn as a dashed reference line; saved as a PNG under ../figs.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — check against the installed seaborn version.
fig, ax = plt.subplots(figsize=(8, 6))

bar_columns = dict(x="Power Users Bin", y="Conversion Rate Last 30 Days")
sns.barplot(data=account_power_df, palette="viridis", ax=ax, **bar_columns)

# Reference line: mean of the plotted column across every row of account_power_df.
overall_mean = account_power_df["Conversion Rate Last 30 Days"].mean()
ax.axhline(
    overall_mean,
    label=f"Overall Mean ({overall_mean:.2f}%)",
    color='red',
    linestyle='--',
    linewidth=1,
)
ax.legend()

ax.set(
    title="Mean Conversion Rate by Power User Bin",
    xlabel="# Power Users Bin",
    ylabel="Mean Conversion Rate (%)",
)
fig.tight_layout()

# Write the figure to ../figs, creating the directory if it does not exist yet.
filename = "power_user_bins_bar_plot.png"
save_directory = os.path.join("..", "figs")
save_path = os.path.join(save_directory, filename)
os.makedirs(save_directory, exist_ok=True)

fig.savefig(save_path)