In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import numpy as np

In [2]:
# Shared figure geometry: the plotting helpers read these keys for width,
# height, margin and title font size.
PLOT_PARAMS = dict(
    width=400,
    height=200,
    margin=dict(l=50, r=50, t=50, b=50),
    title_font_size=14,
)

# Prefix for exported figures/tables; paths are built by string concatenation,
# so the trailing slash matters.
figfolder = "docs/"
# Read by get_text_color(): truthy selects black table text, falsy selects white.
export_to_gdoc = False

def get_text_color():
    """Return the table text colour: 'black' if export_to_gdoc is truthy, else 'white'."""
    if export_to_gdoc:
        return 'black'
    return 'white'

def create_rating_distribution(df, medicine):
    """Build a bar chart of how one medicine's overall ratings are distributed.

    Args:
        df: DataFrame with 'Medicine' and 'Overall Rating' columns.
        medicine: Value of 'Medicine' whose rows are counted.

    Returns:
        A plotly Figure with one bar per rating bucket, positioned at x = 1..5.
    """
    star_positions = [1, 2, 3, 4, 5]
    # The last edge sits just above 5 so that a rating of exactly 5 lands in
    # its own bucket instead of being merged with the 4.x ratings.
    bin_edges = [1.0, 2.0, 3.0, 4.0, 5.0, 5.1]

    selected_ratings = df[df['Medicine'] == medicine]['Overall Rating']
    bucket_counts = np.histogram(selected_ratings, bins=bin_edges)[0]

    rating_bars = go.Bar(
        x=star_positions,
        y=bucket_counts,
        width=0.8,
        # One label per bar, surfaced through %{customdata} in the hover text.
        customdata=["1-1.9", "2-2.9", "3-3.9", "4-4.9", "5"],
        hovertemplate=(
            "Overall Rating: %{customdata}<br>"
            "Count: %{y}"
            "<extra></extra>"
        ),
    )
    fig = go.Figure(data=[rating_bars])

    rating_axis = dict(
        tickmode='array',
        tickvals=star_positions,
        ticktext=['1', '2', '3', '4', '5'],
        range=[0.5, 5.5],
    )
    fig.update_layout(
        title=f'Distribution of Ratings: {medicine}',
        xaxis_title='Rating',
        yaxis_title='Count',
        template='plotly_white',
        width=PLOT_PARAMS['width'],
        height=PLOT_PARAMS['height'],
        bargap=0.1,
        margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=rating_axis,
    )

    return fig

In [4]:
def create_grouped_rating_distribution(df):
    """Build a grouped bar chart comparing rating distributions across medicines.

    For every distinct 'Medicine' (in sorted order) the 'Overall Rating' values
    are bucketed into [1,2), [2,3), [3,4), [4,5) and 5, and each bucket is
    expressed as a percentage of that medicine's row count.

    Args:
        df: DataFrame with 'Medicine' and 'Overall Rating' columns.

    Returns:
        A plotly express Figure with one trace per medicine.
    """
    star_positions = [1, 2, 3, 4, 5]
    range_labels = ['[1,2)', '[2,3)', '[3,4)', '[4,5)', '5']
    # The last edge sits just above 5 so that exact 5s form their own bucket.
    bin_edges = [1.0, 2.0, 3.0, 4.0, 5.0, 5.1]

    per_medicine = []
    for name in sorted(df['Medicine'].unique()):
        reviews = df[df['Medicine'] == name]
        bucket_counts = np.histogram(reviews['Overall Rating'], bins=bin_edges)[0]
        per_medicine.append(pd.DataFrame({
            'Rating': star_positions,
            'Percentage': (bucket_counts / len(reviews)) * 100,
            'Medicine': name,
            'RatingRange': range_labels,
        }))

    plot_df = pd.concat(per_medicine)

    fig = px.bar(
        plot_df,
        x='Rating',
        y='Percentage',
        color='Medicine',
        barmode='group',
        width=PLOT_PARAMS['width'] * 2.0,
        height=PLOT_PARAMS['height'] * 2.0,
    )

    rating_axis = dict(
        tickmode='array',
        tickvals=star_positions,
        ticktext=range_labels,
        range=[0.5, 5.5],
    )
    # Horizontal legend centred underneath the plot area.
    legend_below = dict(
        orientation='h',
        yanchor='top',
        y=-0.25,
        xanchor='center',
        x=0.5,
        title=None,
    )
    fig.update_layout(
        title='Distribution of ratings by product (percentage)',
        xaxis_title='Rating',
        yaxis_title='Percentage of Reviews',
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=rating_axis,
        legend=legend_below,
    )

    hover_text = (
        "%{customdata[0]}<br>"
        "Rating: %{customdata[1]}<br>"
        "Percentage: %{y:.1f}%"
        "<extra></extra>"
    )
    # Attach each medicine's own (name, range label) pairs to its trace so the
    # hover text can show them.
    for trace_name in plot_df['Medicine'].unique():
        trace_rows = plot_df[plot_df['Medicine'] == trace_name]
        fig.update_traces(
            customdata=trace_rows[['Medicine', 'RatingRange']].values,
            hovertemplate=hover_text,
            selector=dict(name=trace_name),
        )

    return fig

In [5]:
def get_chanca_piedra_weights(df):
    """Compute a per-review weight that rebalances scraped Chanca piedra reviews.

    For each brand listed below, reviews with a given star rating are weighted
    by actual_count / scraped_count for that rating. Every other review keeps
    weight 1.0.

    Args:
        df: DataFrame with 'Medicine' and 'Source' columns, plus 'Stars' when
            any row belongs to one of the corrected brands.

    Returns:
        A float Series aligned to df.index.
    """
    # Per-brand review tallies for 1..5 stars: what was scraped vs. the true totals.
    star_tallies = {
        "NaturalisimoLife Chanca Piedra 1600 mg": {
            "scraped": [82, 18, 37, 103, 146],
            "actual": [82, 18, 38, 103, 906],
        },
        "EU Natural: \"Stone Breaker\" chanca piedra": {
            "scraped": [102, 44, 50, 100, 138],
            "actual": [123, 44, 50, 131, 918],
        },
    }

    review_weights = pd.Series(1.0, index=df.index)

    for source_name, tallies in star_tallies.items():
        brand_rows = (df['Medicine'] == 'Chanca piedra') & (df['Source'] == source_name)
        if len(df[brand_rows]) == 0:
            continue

        star_pairs = zip(tallies['scraped'], tallies['actual'])
        for stars, (n_scraped, n_actual) in enumerate(star_pairs, start=1):
            if not n_scraped > 0:
                # Nothing scraped for this rating: no ratio to apply.
                continue
            matching = df.index[brand_rows & (df['Stars'] == stars)]
            review_weights.loc[matching] = n_actual / n_scraped

    return review_weights

In [11]:
def calculate_weighted_stats(df, columns_to_analyze, medicines_to_exclude=None, medicine_mapping=None, all_medicines=None, is_amazon_data=False):
    """Summarise yes / no / no-information percentages per medicine.

    Args:
        df: Review DataFrame with a 'Medicine' column and every column named in
            columns_to_analyze.
        columns_to_analyze: Flag columns to summarise. A cell counts as
            positive when it equals 1 or is a non-blank string, and as negative
            when it equals 0; anything else is "no information".
        medicines_to_exclude: Medicines dropped from the output (default: none).
        medicine_mapping: Optional {medicine: display name} used for the
            output column headers.
        all_medicines: Medicines that must appear as columns even when df has
            no rows for them. When None or empty, the sorted distinct values of
            df['Medicine'] are used. Any sized iterable is accepted.
        is_amazon_data: When True, reviews are weighted with
            get_chanca_piedra_weights(df); otherwise every weight is 1.0.

    Returns:
        A DataFrame indexed by row label (total reviews, then for each analysed
        column a header row followed by Yes / No / No information), with one
        column per medicine. The total-reviews row is the unweighted count;
        percentages use the weighted total for 'Chanca piedra' and the
        unweighted total for every other medicine. Medicines without reviews
        show 0 everywhere.
    """
    if medicines_to_exclude is None:
        medicines_to_exclude = []
    if medicine_mapping is None:
        medicine_mapping = {}

    weights = get_chanca_piedra_weights(df) if is_amazon_data else pd.Series(1.0, index=df.index)

    def _weighted_count(values, predicate):
        """Sum the weights of the cells in `values` that satisfy `predicate`."""
        # One aligned lookup per group instead of one label lookup per cell.
        group_weights = weights.loc[values.index]
        return sum(w for w, val in zip(group_weights, values) if predicate(val))

    def _positive(values):
        """Weighted number of cells equal to 1 or holding a non-blank string."""
        return _weighted_count(
            values, lambda val: val == 1 or (isinstance(val, str) and val.strip())
        )

    def _negative(values):
        """Weighted number of cells equal to 0."""
        return _weighted_count(values, lambda val: val == 0)

    summary = df.groupby('Medicine').agg(
        total_reviews_weighted=('Medicine', lambda x: weights.loc[x.index].sum()),  # Weighted count
        total_reviews=('Medicine', 'size'),  # Actual count
        **{f'{col}_positive': (col, _positive) for col in columns_to_analyze},
        **{f'{col}_negative': (col, _negative) for col in columns_to_analyze},
    ).reset_index()

    # Start from the complete medicine list so medicines without any rows in df
    # still get a (zero-filled) column. The explicit None/empty test replaces
    # `all_medicines or ...`, which raises for array-like inputs.
    if all_medicines is None or len(all_medicines) == 0:
        all_medicines = sorted(df['Medicine'].unique())
    full_summary = pd.DataFrame({'Medicine': list(all_medicines)})
    full_summary = full_summary[~full_summary['Medicine'].isin(medicines_to_exclude)]

    summary = pd.merge(full_summary, summary, on='Medicine', how='left')

    # Medicines absent from df come out of the left merge as NaN; treat as zero.
    summary['total_reviews'] = summary['total_reviews'].fillna(0).astype(int)
    summary['total_reviews_weighted'] = summary['total_reviews_weighted'].fillna(0).astype(float)
    for col in columns_to_analyze:
        summary[f'{col}_positive'] = summary[f'{col}_positive'].fillna(0)
        summary[f'{col}_negative'] = summary[f'{col}_negative'].fillna(0)

    # The denominator does not depend on the analysed column, so build it once:
    # weighted total for Chanca piedra, plain review count for everything else.
    denominator = summary['total_reviews_weighted'].where(
        summary['Medicine'] == 'Chanca piedra', summary['total_reviews']
    )

    for col in columns_to_analyze:
        positive = summary[f'{col}_positive']
        negative = summary[f'{col}_negative']
        # 0 / 0 yields NaN for medicines without reviews; report those as 0.
        summary[f'{col}_positive_pct'] = (positive / denominator * 100).fillna(0).round(1)
        summary[f'{col}_negative_pct'] = (negative / denominator * 100).fillna(0).round(1)
        summary[f'{col}_no_info_pct'] = (
            (denominator - positive - negative) / denominator * 100
        ).fillna(0).round(1)

    row_labels = ['<b>Total Reviews</b>']
    for col in columns_to_analyze:
        row_labels += [f'<b>{col}</b>', '    Yes', '    No', '    No information']

    transposed_data = {' ': row_labels}
    for _, row in summary.iterrows():
        # The total row always shows the unweighted review count.
        cells = [int(row['total_reviews'])]
        for col in columns_to_analyze:
            cells += [
                '',  # blank cell next to the column's header row
                row[f'{col}_positive_pct'],
                row[f'{col}_negative_pct'],
                row[f'{col}_no_info_pct'],
            ]
        transposed_data[medicine_mapping.get(row['Medicine'], row['Medicine'])] = cells

    display_df = pd.DataFrame(transposed_data).set_index(' ')
    return display_df

In [65]:
def create_table(dfa, columns_to_analyze, medicines_to_exclude, medicine_mapping, filters=None, high_quality_reviews_only=False, is_amazon_data=False):
    """Filter the reviews and build the per-medicine statistics table.

    The medicine list is captured before filtering, so a medicine whose reviews
    are all filtered out is still passed on to calculate_weighted_stats.

    Args:
        dfa: Review DataFrame with a 'Medicine' column.
        columns_to_analyze, medicines_to_exclude, medicine_mapping,
        is_amazon_data: Forwarded to calculate_weighted_stats.
        filters: Optional {column: value}; rows must match every pair.
        high_quality_reviews_only: Keep only rows with 'Super high quality' == 1.
            Only consulted when filters is None.

    Returns:
        The DataFrame produced by calculate_weighted_stats.
    """
    medicine_universe = sorted(dfa['Medicine'].unique())

    if filters is not None:
        subset = dfa
        for column, value in filters.items():
            subset = subset[subset[column] == value]
    elif high_quality_reviews_only:
        subset = dfa[dfa['Super high quality'] == 1]
    else:
        subset = dfa

    return calculate_weighted_stats(
        subset,
        columns_to_analyze,
        medicines_to_exclude=medicines_to_exclude,
        medicine_mapping=medicine_mapping,
        all_medicines=medicine_universe,
        is_amazon_data=is_amazon_data,
    )

In [66]:
def style_dataframe(df):
    """Return a Styler for df with the notebook's text colour and cell formatting.

    Cell formatting:
      * int/float values whose str() consists only of digits -> thousands-separated integer
      * floats with no fractional part -> '<n>%'
      * other floats -> '<n.n>%'
      * everything else is left unchanged
    """
    def _render_cell(x):
        if isinstance(x, (int, float)) and str(x).isdigit():
            return f'{x:,.0f}'
        if isinstance(x, float):
            if x.is_integer():
                return f'{x:.0f}%'
            return f'{x:.1f}%'
        return x

    styler = df.style
    # An empty selector applies the colour rule to the table element itself.
    text_rule = {'selector': '', 'props': [('color', get_text_color())]}
    return styler.set_table_styles([text_rule]).format(_render_cell)

In [90]:
# Flag columns summarised in every statistics table; each one becomes a
# header row followed by Yes / No / No information rows.
columns_to_analyze = [
    "Helps overall with kidney stones",
    "Side effects mentioned",
    "Works as a prophylactic",
    "Asserts significant pain reduction",
    "Mentions breaking of stones",
    "Mentions shrinking of the stones",
    "Mentions softening of stones",
    "Stone passed with no or almost no pain",
    "Helps overall with gallstones",
]

# Table variants: key -> suffix appended to the exported table's file name.
# "All Reviews" means no filtering; every other key is used by the export
# loops as a column that must equal 1.
table_cases = dict([
    ("All Reviews", ""),
    ("Super high quality", "-high-quality"),
    ("Says that has suffered from condition for a long time (>1 year)", "-long-time"),
    ("Someone who makes large amounts of stones (>10 total)", "-stoner"),
])

# 1. WebMD Reviews Analysis

In [91]:
# Load the WebMD reviews; dfw feeds create_grouped_rating_distribution and
# create_table in the cells below. The path is relative to the working directory.
dfw = pd.read_csv('csv-files/Kidney Stone Reviews - Reviews - WebMD.csv')
# Quick sanity check of the row count.
print(f"Successfully loaded {len(dfw)} rows of data")

Successfully loaded 1567 rows of data


In [92]:
# Create grouped rating distribution
# (one trace per medicine, percentages per rating bucket) and display it inline.
fig = create_grouped_rating_distribution(dfw)
fig.show()
# Base file name for the exports below; both exports are currently disabled,
# so this cell only displays the figure.
figname = "webmd-treatment-distribution-rating"
# fig.write_html(figfolder+figname+".html")
# fig.write_image(figfolder+figname+".png", scale=4)

In [100]:
# Display names used as column headers in the WebMD tables.
webmd_medicine_mapping = {
    'Hydrochlorothiazide': 'HCTZ',
    'Potassium Citrate': 'Potassium citrate',
}

# Medicines left out of the WebMD tables.
webmd_medicines_to_exclude = ['Ashwagandha', 'Melatonin']

# Write one HTML table per case: "All Reviews" is unfiltered, every other case
# keeps only the rows whose column of that name equals 1.
for key, val in table_cases.items():
    filters = None if key == "All Reviews" else {key: 1}
    display_dfw = create_table(
        dfw,
        columns_to_analyze,
        webmd_medicines_to_exclude,
        webmd_medicine_mapping,
        filters=filters,
    )
    tablename = "webmd-table-full-analysis" + val
    # display(style_dataframe(display_dfw))
    display_dfw.to_html(figfolder + tablename + ".html")

# 2. Amazon Reviews Analysis

In [94]:
# Load the Amazon reviews; dfa is passed to create_table with
# is_amazon_data=True in the next cell. The path is relative to the working directory.
dfa = pd.read_csv('csv-files/Kidney Stone Reviews - Reviews - Amazon.csv')
# Quick sanity check of the row count.
print(f"Successfully loaded {len(dfa)} rows of data")

Successfully loaded 1456 rows of data


In [99]:
# Display names used as column headers in the Amazon tables.
amazon_medicine_mapping = {
    'Hydrochlorothiazide': 'HCTZ',
}

# Medicines left out of the Amazon tables (none).
amazon_medicines_to_exclude = []

# Write one HTML table per case: "All Reviews" is unfiltered, every other case
# keeps only the rows whose column of that name equals 1.
for key, val in table_cases.items():
    if key == "All Reviews":
        filters = None
    else:
        filters = {key: 1}
    # Use the Amazon-specific settings defined above. This call previously
    # passed the webmd_* variables, which left the amazon_* settings unused and
    # applied the WebMD exclusions and renames to the Amazon tables.
    display_dfa = create_table(
        dfa,
        columns_to_analyze,
        amazon_medicines_to_exclude,
        amazon_medicine_mapping,
        filters=filters,
        is_amazon_data=True,  # apply the Chanca piedra review weights
    )
    tablename = "amazon-table-full-analysis" + val
    # display(style_dataframe(display_dfa))
    display_dfa.to_html(figfolder + tablename + ".html")

# 3. Reddit Posts Analysis

In [96]:
# Load the Reddit posts; dfr is passed to create_table in the next cell.
# The path is relative to the working directory.
dfr = pd.read_csv('csv-files/Kidney Stone Reviews - Reviews - Reddit.csv')
# Quick sanity check of the row count.
print(f"Successfully loaded {len(dfr)} rows of data")

Successfully loaded 2308 rows of data


In [None]:
# Display names used as column headers in the Reddit tables.
reddit_medicine_mapping = {'Hydrochlorothiazide': 'HCTZ'}

# Medicines left out of the Reddit tables (none).
reddit_medicines_to_exclude = []

# Write one HTML table per case: "All Reviews" is unfiltered, every other case
# keeps only the rows whose column of that name equals 1.
for key, val in table_cases.items():
    filters = None if key == "All Reviews" else {key: 1}
    display_dfr = create_table(
        dfr,
        columns_to_analyze,
        reddit_medicines_to_exclude,
        reddit_medicine_mapping,
        filters=filters,
    )
    tablename = "reddit-table-full-analysis" + val
    # display(style_dataframe(display_dfr))
    display_dfr.to_html(figfolder + tablename + ".html")

Unnamed: 0,Allopurinol,Black seed,Chanca piedra,Flomax,Garcinia,HCTZ,Phosfood,Potassium citrate,Rowatinex
,,,,,,,,,
Total Reviews,49,17,492,1134,38,46,2,509,21
Helps overall with kidney stones,,,,,,,,,
Yes,20.4%,23.5%,35.2%,24.6%,42.1%,32.6%,100%,22%,23.8%
No,2%,5.9%,9.8%,11.8%,0%,4.3%,0%,8.6%,19%
No information,77.6%,70.6%,55.1%,63.6%,57.9%,63%,0%,69.4%,57.1%
Helps overall with gallstones,,,,,,,,,
Yes,0%,0%,0%,0%,0%,0%,0%,0%,0%
No,0%,0%,0%,0%,0%,0%,0%,0%,0%
No information,100%,100%,100%,100%,100%,100%,100%,100%,100%


Unnamed: 0,Allopurinol,Black seed,Chanca piedra,Flomax,Garcinia,HCTZ,Phosfood,Potassium citrate,Rowatinex
,,,,,,,,,
Total Reviews,23,5,154,489,7,7,0,235,6
Helps overall with kidney stones,,,,,,,,,
Yes,26.1%,40%,45.5%,26.2%,28.6%,14.3%,0%,28.1%,50%
No,4.3%,0%,11%,12.9%,0%,0%,0%,10.6%,16.7%
No information,69.6%,60%,43.5%,60.9%,71.4%,85.7%,0%,61.3%,33.3%
Helps overall with gallstones,,,,,,,,,
Yes,0%,0%,0%,0%,0%,0%,0%,0%,0%
No,0%,0%,0%,0%,0%,0%,0%,0%,0%
No information,100%,100%,100%,100%,100%,100%,0%,100%,100%


Unnamed: 0,Allopurinol,Black seed,Chanca piedra,Flomax,Garcinia,HCTZ,Phosfood,Potassium citrate,Rowatinex
,,,,,,,,,
Total Reviews,14,0,64,165,6,8,0,116,7
Helps overall with kidney stones,,,,,,,,,
Yes,42.9%,0%,57.8%,33.9%,83.3%,50%,0%,30.2%,28.6%
No,0%,0%,12.5%,16.4%,0%,0%,0%,19.8%,42.9%
No information,57.1%,0%,29.7%,49.7%,16.7%,50%,0%,50%,28.6%
Helps overall with gallstones,,,,,,,,,
Yes,0%,0%,0%,0%,0%,0%,0%,0%,0%
No,0%,0%,0%,0%,0%,0%,0%,0%,0%
No information,100%,0%,100%,100%,100%,100%,0%,100%,100%


Unnamed: 0,Allopurinol,Black seed,Chanca piedra,Flomax,Garcinia,HCTZ,Phosfood,Potassium citrate,Rowatinex
,,,,,,,,,
Total Reviews,11,2,38,143,7,3,0,103,4
Helps overall with kidney stones,,,,,,,,,
Yes,36.4%,50%,73.7%,44.1%,85.7%,66.7%,0%,26.2%,25%
No,0%,0%,5.3%,17.5%,0%,0%,0%,21.4%,25%
No information,63.6%,50%,21.1%,38.5%,14.3%,33.3%,0%,52.4%,50%
Helps overall with gallstones,,,,,,,,,
Yes,0%,0%,0%,0%,0%,0%,0%,0%,0%
No,0%,0%,0%,0%,0%,0%,0%,0%,0%
No information,100%,100%,100%,100%,100%,100%,0%,100%,100%


# 4. Side effects

In [77]:
def create_category_distribution(df, use_counts=False):
    """Build a grouped bar chart of side-effect categories, one trace per product.

    Args:
        df: Side-effects DataFrame with 'Product' and 'Category' columns.
        use_counts: True plots raw counts per category; False plots each
            category as a percentage of the product's rows.

    Returns:
        A plotly express Figure.
    """
    frames = []
    for product in sorted(df['Product'].unique()):
        product_rows = df[df['Product'] == product]
        tally = product_rows['Category'].value_counts()
        if use_counts:
            bar_heights = tally.values
        else:
            bar_heights = (tally / len(product_rows)) * 100
        frames.append(pd.DataFrame({
            'Category': tally.index,
            'Value': bar_heights,
            'Product': product,
        }))

    plot_df = pd.concat(frames)

    fig = px.bar(
        plot_df,
        x='Category',
        y='Value',
        color='Product',
        barmode='group',
        width=PLOT_PARAMS['width'] * 2.0,
        height=PLOT_PARAMS['height'] * 2.0,
    )

    # Wording that depends on whether counts or percentages are plotted.
    if use_counts:
        metric = "total count"
        y_axis_label = "Number of Side Effects"
        hover_value = "Count: %{y}"
    else:
        metric = "percentage"
        y_axis_label = "Percentage of Side Effects"
        hover_value = "Percentage: %{y:.1f}%"

    fig.update_layout(
        title=dict(
            text=f'Distribution of side effect categories by product ({metric})',
            y=0.95,
            yanchor='bottom',
        ),
        xaxis_title=None,
        yaxis_title=y_axis_label,
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        # margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=dict(tickangle=45),
        # Horizontal legend centred well below the slanted tick labels.
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-.7,
            xanchor='center',
            x=0.5,
            title=None,
        ),
    )

    hover_text = (
        "%{customdata[0]}<br>"
        "Category: %{customdata[1]}<br>"
        + hover_value +
        "<extra></extra>"
    )
    # Give each product's trace its own (product, category) pairs for the hover text.
    for trace_name in plot_df['Product'].unique():
        trace_rows = plot_df[plot_df['Product'] == trace_name]
        fig.update_traces(
            customdata=trace_rows[['Product', 'Category']].values,
            hovertemplate=hover_text,
            selector=dict(name=trace_name),
        )

    return fig

def create_subcategory_distribution(df, category, use_counts=False):
    """Build a grouped bar chart of the subcategories inside one category.

    Args:
        df: Side-effects DataFrame with 'Product', 'Category' and
            'Subcategory' columns.
        category: Value of 'Category' to break down.
        use_counts: True plots raw counts; False plots each subcategory as a
            percentage of ALL of the product's rows in df (not only the rows
            in this category).

    Returns:
        A plotly express Figure with one trace per product.
    """
    in_category = df[df['Category'] == category]

    frames = []
    for product in sorted(in_category['Product'].unique()):
        tally = in_category[in_category['Product'] == product]['Subcategory'].value_counts()
        if use_counts:
            bar_heights = tally.values
        else:
            # Denominator: every side effect recorded for the product.
            product_total = len(df[df['Product'] == product])
            bar_heights = (tally / product_total) * 100
        frames.append(pd.DataFrame({
            'Subcategory': tally.index,
            'Value': bar_heights,
            'Product': product,
        }))

    plot_df = pd.concat(frames)

    fig = px.bar(
        plot_df,
        x='Subcategory',
        y='Value',
        color='Product',
        barmode='group',
        width=PLOT_PARAMS['width'] * 2.0,
        height=PLOT_PARAMS['height'] * 2.0,
    )

    # Wording that depends on whether counts or percentages are plotted.
    if use_counts:
        metric = "total count"
        y_axis_label = "Number of Side Effects"
        hover_value = "Count: %{y}"
    else:
        metric = "percentage"
        y_axis_label = "Percentage of Side Effects"
        hover_value = "Percentage: %{y:.1f}%"

    fig.update_layout(
        title=dict(
            text=f'Distribution of {category} subcategories by product ({metric})',
            y=0.95,
            yanchor='bottom',
        ),
        xaxis_title=None,
        yaxis_title=y_axis_label,
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        # margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=dict(tickangle=45),
        # Horizontal legend centred well below the slanted tick labels.
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-0.7,
            xanchor='center',
            x=0.5,
            title=None,
        ),
    )

    hover_text = (
        "%{customdata[0]}<br>"
        "Subcategory: %{customdata[1]}<br>"
        + hover_value +
        "<extra></extra>"
    )
    # Give each product's trace its own (product, subcategory) pairs for the hover text.
    for trace_name in plot_df['Product'].unique():
        trace_rows = plot_df[plot_df['Product'] == trace_name]
        fig.update_traces(
            customdata=trace_rows[['Product', 'Subcategory']].values,
            hovertemplate=hover_text,
            selector=dict(name=trace_name),
        )

    return fig

def sanitize_filename(category):
    """Turn a category name into a lowercase, hyphen-separated file-name fragment.

    Slashes and spaces become hyphens; any other character that is neither
    alphanumeric nor a hyphen is dropped.
    """
    pieces = []
    for ch in category.lower():
        if ch == '/' or ch == ' ':
            pieces.append('-')
        elif ch.isalnum() or ch == '-':
            pieces.append(ch)
    return ''.join(pieces)

# Re-declares the shared plot geometry (same keys as the config near the top of
# the notebook) so this cell can be run on its own.
PLOT_PARAMS = {
    'width': 400,
    'height': 200,
    'margin': dict(l=50, r=50, t=50, b=50),
    'title_font_size': 14
}

# Usage:
# NOTE(review): `df` is a generic name that later cells rebind to other tables.
df = pd.read_csv('csv-files/Kidney Stone Reviews - Summary - Side effects.csv')
# Create main category distribution
# (counts mode) and show it inline; the export of this preview is disabled.
use_counts = True
metric = "total-count" if use_counts else "percentage"
fig_categories = create_category_distribution(df, use_counts=use_counts)
fig_categories.show()
# fig_categories.write_html(figfolder + "side-effects-category-distribution-" + metric + ".html")

# Export pass: once with raw counts, once with percentages.
for use_counts in [True, False]:
    # Suffix used in the exported file names.
    metric = "total-count" if use_counts else "percentage"
    
    # Create and save main category distribution
    fig_categories = create_category_distribution(df, use_counts=use_counts)
    fig_categories.write_html(figfolder + f"side-effects-category-distribution-{metric}.html")
    
    # Get all unique categories and create plots for each
    categories = sorted(df['Category'].unique())
    
    for category in categories:
        # Create filename-friendly version of category name
        category_filename = sanitize_filename(category)
        
        # Create the subcategory distribution.
        # NOTE(review): the write_html call below is commented out, so these
        # figures are built but never saved, and `filename` is unused.
        fig_subcategories = create_subcategory_distribution(df, category, use_counts=use_counts)
        filename = f"side-effects-{category_filename}-subcategories-{metric}.html"
        # fig_subcategories.write_html(
        #     figfolder + f"side-effects-{category_filename}-subcategories-{metric}.html"
        # )

In [78]:
def create_severity_distribution(df):
    """Create a stacked bar plot showing severity distributions across medicines.

    For every product, the share of each severity level among the product's
    rows is scaled so that the stacked bar adds up to the product's hard-coded
    target percentage from `adjustment_factors` (provided every row has a
    non-null severity).

    Args:
        df: DataFrame with the side effects data ('Product' and 'Severity'
            columns; severity codes 1, 2 and 3).

    Returns:
        A plotly express Figure with one trace per severity level.

    Raises:
        KeyError: If a product is missing from the hard-coded tables below or
            a severity code is not 1, 2 or 3.
    """
    # Define adjustment factors for each product (target total percentages)
    adjustment_factors = {
        'Allopurinol': 20.97,
        'Chanca piedra': 3.93,
        'Flomax': 23.18,
        'Hydrochlorothiazide': 25.76,
        'Potassium citrate': 12.91,
        'Ashwagandha': 47.97,
        'Melatonin': 29.89,
        'Black seed': 21.28,
        'Garcinia': 47.10,
        'Phosfood': 7.14,
        'Rowatinex': 4.50
    }

    # Define total reviews for each product (shown in the hover text only)
    total_reviews = {
        'Allopurinol': 62,
        'Chanca piedra': 1772,
        'Flomax': 1156,
        'Hydrochlorothiazide': 66,
        'Potassium citrate': 666,
        'Ashwagandha': 271,
        'Melatonin': 161,
        'Black seed': 94,
        'Garcinia': 930,
        'Phosfood': 42,
        'Rowatinex': 111
    }

    # Create severity mapping (numeric code -> legend label)
    severity_mapping = {
        1: 'Mild',
        2: 'Medium',
        3: 'Severe'
    }

    # Define color scheme (traffic light colors)
    color_scheme = {
        'Mild': '#4caf50',    # Green
        'Medium': '#ffc107',  # Amber/Yellow
        'Severe': '#f44336'   # Red
    }

    all_data = []
    for product in sorted(df['Product'].unique()):
        product_data = df[df['Product'] == product]
        severity_counts = product_data['Severity'].value_counts()

        # Calculate relative proportions within each product
        proportions = severity_counts / len(product_data)

        # Scale the proportions to match the target total percentage
        target_total = adjustment_factors[product]
        values = proportions * target_total

        severity_df = pd.DataFrame({
            'Severity': [severity_mapping[i] for i in severity_counts.index],
            'Value': values,
            'Product': product,
            'TotalReviews': total_reviews[product]  # Add total reviews to the DataFrame
        })
        all_data.append(severity_df)

    plot_df = pd.concat(all_data)

    # Ensure proper ordering of severity levels
    severity_order = ['Mild', 'Medium', 'Severe']

    fig = px.bar(plot_df,
                 x='Product',
                 y='Value',
                 color='Severity',
                 category_orders={'Severity': severity_order},
                 color_discrete_map=color_scheme,
                 barmode='stack',
                 width=PLOT_PARAMS['width'] * 2.0,
                 height=PLOT_PARAMS['height'] * 2.0)

    fig.update_layout(
        title=dict(
            text='Distribution of side effect severity by product',
            y=0.95,
            yanchor='bottom',
            font_size=PLOT_PARAMS['title_font_size'],
        ),
        xaxis_title=None,
        yaxis_title='Percentage of Total Reviews',
        template='plotly_white',
        xaxis=dict(tickangle=45),
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-.5,
            xanchor='center',
            x=0.5,
            title=None
        )
    )

    # Update hover template
    for severity in severity_order:
        severity_rows = plot_df[plot_df['Severity'] == severity]
        fig.update_traces(
            # Pass a plain 2-D array (one row per bar), as the other plotting
            # helpers do, rather than a DataFrame, so %{customdata[0]}
            # resolves to the bar's review total.
            customdata=severity_rows[['TotalReviews']].values,
            hovertemplate=(
                "Product: %{x}<br>" +
                "Severity: " + severity + "<br>" +
                "Percentage: %{y:.1f}%<br>" +
                "Total Reviews: %{customdata[0]:,}" +
                "<extra></extra>"
            ),
            selector=dict(name=severity)
        )

    return fig

# Usage:
# Re-read the side-effects summary (rebinding `df`), then build and show the
# stacked severity chart. The HTML export below is currently disabled.
df = pd.read_csv('csv-files/Kidney Stone Reviews - Summary - Side effects.csv')
fig_severity = create_severity_distribution(df)
fig_severity.show()
# fig_severity.write_html(figfolder + "side-effects-severity-distribution.html")

In [10]:
def create_sunburst_distribution(df, product, use_counts=False):
    """Build a sunburst chart of one product's side effects.

    The inner ring holds the categories and the outer ring their subcategories.

    Args:
        df: Side-effects DataFrame with 'Product', 'Category', 'Subcategory'
            and 'Side effect' columns.
        product: Value of 'Product' to plot.
        use_counts: True sizes sectors by the number of side effects; False
            sizes them as a percentage of the product's rows.

    Returns:
        A plotly Figure containing a single Sunburst trace.
    """
    rows = df[df['Product'] == product]
    n_effects = len(rows)

    # Non-null 'Side effect' entries per category and per (category, subcategory).
    by_category = rows.groupby('Category')['Side effect'].count().reset_index()
    by_subcategory = rows.groupby(['Category', 'Subcategory'])['Side effect'].count().reset_index()

    if not use_counts:
        for tally in (by_category, by_subcategory):
            tally['Side effect'] = tally['Side effect'] / n_effects * 100

    # The centre node is deliberately unlabelled.
    # root_label = f"{product} side effects"
    root_label = ""

    leaf_sizes = by_subcategory['Side effect'].tolist()

    # 'Other' appears under several categories; prefix it with its category so
    # the sunburst labels stay unique.
    is_other = by_subcategory['Subcategory'] == 'Other'
    by_subcategory.loc[is_other, 'Subcategory'] = (
        by_subcategory.loc[is_other, 'Category'] + ' - Other'
    )

    category_names = by_category['Category'].tolist()
    labels = [root_label, *category_names, *by_subcategory['Subcategory'].tolist()]
    parents = ['', *([root_label] * len(category_names)), *by_subcategory['Category'].tolist()]
    values = [
        n_effects if use_counts else 100,
        *by_category['Side effect'].tolist(),
        *leaf_sizes,
    ]

    if use_counts:
        value_line = 'Count: %{value:,.0f}'
    else:
        value_line = 'Percentage: %{value:.1f}%'

    sunburst = go.Sunburst(
        labels=labels,
        parents=parents,
        values=values,
        # Each parent's value already includes its children.
        branchvalues='total',
        hovertemplate=('Category: %{label}<br>' + value_line + '<extra></extra>'),
    )
    fig = go.Figure(sunburst)

    fig.update_layout(
        title=dict(
            # text=f'Distribution of side effects for {product}',
            text=None,
            y=0.95,
            yanchor='bottom',
        ),
        width=PLOT_PARAMS['width'] * 2.0,
        height=PLOT_PARAMS['height'] * 2.0,
        margin=PLOT_PARAMS['margin'],
        template='plotly_white',
        title_font_size=PLOT_PARAMS['title_font_size'],
        sunburstcolorway=px.colors.qualitative.Prism,
    )

    return fig

# Overrides the shared plot geometry with a square canvas for the sunbursts.
# NOTE(review): this rebinds the module-level PLOT_PARAMS, so any plotting cell
# run afterwards also gets height 400.
PLOT_PARAMS = {
    'width': 400,
    'height': 400,
    'margin': dict(l=50, r=50, t=50, b=50),  # all four margins are 50; the top margin is NOT raised to 100 here
    'title_font_size': 14
}
# Usage:
# Write one counts-mode sunburst HTML file per product found in `df` (the
# side-effects table loaded by an earlier cell).
for product in df['Product'].unique():
    fig = create_sunburst_distribution(df, product, use_counts=True)
    # create_sunburst_distribution as defined in this cell always returns a
    # Figure, so this guard is always true.
    if fig is not None:
        # File-name fragment: lowercase, spaces replaced by hyphens.
        product_file = product.lower().replace(' ', '-')
        filename = f"side-effects-sunburst-counts-{product_file}.html"
        # fig.show()
        fig.write_html(figfolder + filename)

In [71]:
# Create the data
# Three hand-entered rating tables; the numeric suffix is the value of
# `min_reviews` that selects each one (presumably the minimum number of
# reviews required to be listed -- confirm against the data source).
data_10 = {
    'Name': [
        'Cetylated Fatty Acids (CFAs)',
        'Banaba',
        'Black Walnut',
        'Cascara Sagrada',
        'Holy Basil',
        'Marshmallow',
        'Palmitoylethanolamide (PEA)',
        'Apricot Kernel',
        'Arrowroot',
        'Carbon 60 (C60)'
    ],
    'Overall Rating': [5.0, 4.9, 4.9, 4.8, 4.8, 4.8, 4.8, 4.7, 4.7, 4.7],
    'Num. Reviews': [13, 11, 11, 31, 50, 13, 18, 37, 10, 21]
}

data_50 = {
    'Name': [
        'Holy Basil',
        'Elderberry',
        'Lysine',
        'Witch Hazel',
        'Chanca Piedra',
        'Colloidal Silver',
        'Kratom',
        'Mullein',
        'Taurine',
        'Butterbur'
    ],
    'Overall Rating': [4.8, 4.7, 4.7, 4.7, 4.6, 4.6, 4.6, 4.6, 4.6, 4.5],
    'Num. Reviews': [50, 50, 419, 60, 113, 185, 1727, 60, 56, 74]
}

data_100 = {
    'Name': [
        'Lysine',
        'Chanca Piedra',
        'Colloidal Silver',
        'Kratom',
        'D-Mannose',
        'Evening Primrose Oil',
        'Methylsulfonylmethane (MSM)',
        'Oscillococcinum',
        'Dimethylsulfoxide (DMSO)',
        'N-Acetyl Cysteine (NAC)'
    ],
    'Overall Rating': [4.7, 4.6, 4.6, 4.6, 4.5, 4.5, 4.5, 4.5, 4.4, 4.4],
    'Num. Reviews': [419, 113, 185, 1727, 238, 132, 197, 203, 103, 120]
}

# Explicit threshold -> table lookup. This replaces building the variable name
# as a string and resolving it through globals(), which hid the dependency and
# carried comments that named the wrong table.
rating_tables = {10: data_10, 50: data_50, 100: data_100}

# Which table to render and export; must be one of the keys of rating_tables.
min_reviews = 100
df = pd.DataFrame(rating_tables[min_reviews]).reset_index(drop=True)

def style_dataframe(df):
    """Apply consistent styling to the dataframe."""
    return df.style\
        .set_table_styles([
            {'selector': '',
             'props': [('color', '#000000')]},  # Using black text like your other tables
        ])\
        .format({
            'Overall Rating': '{:.1f}',  # Format to 1 decimal place
            'Num. Reviews': '{:,}'       # Add thousands separator if needed
        }).hide(axis='index')

# Style and save the table
styled_df = style_dataframe(df)
# The Styler already hides the index via .hide(axis='index'). `index=False` is
# a DataFrame.to_html argument, not a Styler.to_html one (the Styler silently
# swallowed it through **kwargs), so it is dropped here.
styled_df.to_html(figfolder + 'supplements-rating-table-' + str(min_reviews) + '.html')
display(styled_df)

Name,Overall Rating,Num. Reviews
Lysine,4.7,419
Chanca Piedra,4.6,113
Colloidal Silver,4.6,185
Kratom,4.6,1727
D-Mannose,4.5,238
Evening Primrose Oil,4.5,132
Methylsulfonylmethane (MSM),4.5,197
Oscillococcinum,4.5,203
Dimethylsulfoxide (DMSO),4.4,103
N-Acetyl Cysteine (NAC),4.4,120
