In [77]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import numpy as np

In [78]:
PLOT_PARAMS = {
    'width': 400,
    'height': 200,
    'margin': dict(l=50, r=50, t=50, b=50),
    'title_font_size': 14
}

figfolder = "docs/"
export_to_gdoc = False

def get_text_color():
    return 'black' if export_to_gdoc else 'white'

def create_rating_distribution(df, medicine):
    """Create a bar plot for rating distribution of a single medicine."""
    bins = [1.0, 2.0, 3.0, 4.0, 5.0, 5.1]
    counts, _ = np.histogram(df[df['Medicine'] == medicine]['Overall Rating'], bins=bins)
    
    fig = go.Figure(data=[
        go.Bar(
            x=[1, 2, 3, 4, 5],
            y=counts,
            width=0.8,
            hovertemplate=(
                "Overall Rating: %{customdata}<br>" +
                "Count: %{y}" +
                "<extra></extra>"
            ),
            customdata=["1-1.9", "2-2.9", "3-3.9", "4-4.9", "5"]
        )
    ])

    fig.update_layout(
        title=f'Distribution of Ratings: {medicine}',
        xaxis_title='Rating',
        yaxis_title='Count',
        template='plotly_white',
        width=PLOT_PARAMS['width'],
        height=PLOT_PARAMS['height'],
        bargap=0.1,
        margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=dict(
            tickmode='array',
            tickvals=[1, 2, 3, 4, 5],
            ticktext=['1', '2', '3', '4', '5'],
            range=[0.5, 5.5]
        )
    )
    
    return fig

In [79]:
def create_grouped_rating_distribution(df):
    """Create a grouped bar plot comparing rating distributions across medicines."""
    all_data = []
    for medicine in sorted(df['Medicine'].unique()):
        medicine_data = df[df['Medicine'] == medicine]
        bins = [1.0, 2.0, 3.0, 4.0, 5.0, 5.1]
        counts, _ = np.histogram(medicine_data['Overall Rating'], bins=bins)
        
        percentages = (counts / len(medicine_data)) * 100
        ratings_df = pd.DataFrame({
            'Rating': [1, 2, 3, 4, 5],
            'Percentage': percentages,
            'Medicine': medicine,
            'RatingRange': ['[1,2)', '[2,3)', '[3,4)', '[4,5)', '5']
        })
        all_data.append(ratings_df)

    plot_df = pd.concat(all_data)
    
    fig = px.bar(plot_df, 
                 x='Rating', 
                 y='Percentage',
                 color='Medicine',
                 barmode='group',
                 width=PLOT_PARAMS['width'] * 2.0,
                 height=PLOT_PARAMS['height']* 2.0,)

    fig.update_layout(
        title='Distribution of ratings by product (percentage)',
        xaxis_title='Rating',
        yaxis_title='Percentage of Reviews',
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=dict(
            tickmode='array',
            tickvals=[1, 2, 3, 4, 5],
            ticktext=['[1,2)', '[2,3)', '[3,4)', '[4,5)', '5'],
            range=[0.5, 5.5]
        ),
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-0.25,
            xanchor='center',
            x=0.5,
            title=None
        ),
        # autosize=True,
    )

    for medicine_name in plot_df['Medicine'].unique():
        medicine_data = plot_df[plot_df['Medicine'] == medicine_name]
        fig.update_traces(
            customdata=medicine_data[['Medicine', 'RatingRange']].values,
            hovertemplate=(
                "%{customdata[0]}<br>" +
                "Rating: %{customdata[1]}<br>" +
                "Percentage: %{y:.1f}%" +
                "<extra></extra>"
            ),
            selector=dict(name=medicine_name)
        )
    
    return fig

In [80]:
def calculate_stats(df, columns_to_analyze, medicines_to_exclude=None, medicine_mapping=None):
    """Calculate statistics for the given dataframe with configurable exclusions and mappings."""
    if medicines_to_exclude is None:
        medicines_to_exclude = []
    if medicine_mapping is None:
        medicine_mapping = {}

    summary = df.groupby('Medicine').agg(
        total_reviews=('Medicine', 'size'),
        **{
            f'{col}_positive': (col, lambda x: sum(
                1 for val in x if val == 1 or (isinstance(val, str) and val.strip())
            ))
            for col in columns_to_analyze
        },
        **{
            f'{col}_negative': (col, lambda x: sum(
                1 for val in x if val == 0
            ))
            for col in columns_to_analyze
        }
    ).reset_index()

    for col in columns_to_analyze:
        summary[f'{col}_positive_pct'] = (summary[f'{col}_positive'] / summary['total_reviews'] * 100).round(1)
        summary[f'{col}_negative_pct'] = (summary[f'{col}_negative'] / summary['total_reviews'] * 100).round(1)
        summary[f'{col}_no_info_pct'] = (
            (summary['total_reviews'] - summary[f'{col}_positive'] - summary[f'{col}_negative']) 
            / summary['total_reviews'] * 100
        ).round(1)
    
    summary = summary[~summary['Medicine'].isin(medicines_to_exclude)]
    
    transposed_data = {
        ' ': ['<b>Total Reviews</b>'] + 
            sum([[f'<b>{col}</b>', 
                 '    Yes', 
                 '    No', 
                 '    No information'] 
                for col in columns_to_analyze], []),
        **{medicine_mapping.get(row['Medicine'], row['Medicine']): [
            row['total_reviews']] + 
            sum([[
                '',
                row[f'{col}_positive_pct'],
                row[f'{col}_negative_pct'],
                row[f'{col}_no_info_pct']
            ] for col in columns_to_analyze], [])
           for _, row in summary.iterrows()}
    }
    
    display_df = pd.DataFrame(transposed_data).set_index(' ')
    return display_df

In [81]:
def style_dataframe(df):
    """Apply consistent styling to the dataframe."""
    return df.style\
        .set_table_styles([
            {'selector': '',
             'props': [('color', get_text_color())]},
        ])\
        .format(lambda x: f'{x:,.0f}' if isinstance(x, (int, float)) and str(x).isdigit() 
                else (f'{x:.0f}%' if isinstance(x, float) and x.is_integer() 
                else f'{x:.1f}%' if isinstance(x, float) 
                else x))

# 1. WebMD reviews

In [82]:
df = pd.read_csv('csv-files/Kidney Stone Reviews - Reviews - WebMD.csv')
print(f"Successfully loaded {len(df)} rows of data")

Successfully loaded 1567 rows of data


In [83]:
# for medicine in df['Medicine'].unique():
#     fig = create_rating_distribution(df, medicine)
#     fig.show()

# Create grouped rating distribution
fig = create_grouped_rating_distribution(df)
fig.show()
figname = "webmd-treatment-distribution-rating"
fig.write_html(figfolder+figname+".html")
# fig.write_image(figfolder+figname+".png", scale=4)

In [84]:
columns_to_analyze = [
    'Helps overall with kidney stones',
    'Works as a prophylactic',
    'Side effects mentioned',
    'Asserts significant pain reduction',
    'Mentions breaking of stones',
    'Mentions shrinking of the stones',
    'Mentions softening of stones',
    'Stone passed with no or almost no pain',
]

webmd_medicine_mapping = {
    'Hydrochlorothiazide': 'HCTZ',
    'Potassium Citrate': 'Potassium cit.'
}

webmd_medicines_to_exclude = ['Ashwagandha', 'Melatonin']
# webmd_medicines_to_exclude = []

# Generate and display statistics
display_df = calculate_stats(
    df, 
    columns_to_analyze,
    medicines_to_exclude=webmd_medicines_to_exclude,
    medicine_mapping=webmd_medicine_mapping
)
display(style_dataframe(display_df))

tablename = "webmd-table-full-analysis"
display_df.to_html(figfolder + tablename + ".html")

Unnamed: 0,Allopurinol,Black seed,Chanca piedra,Flomax,Garcinia,HCTZ,Potassium citrate
,,,,,,,
Total Reviews,13,77,87,22,892,20,24
Helps overall with kidney stones,,,,,,,
Yes,38.5%,0%,85.1%,9.1%,0%,35%,41.7%
No,7.7%,0%,3.4%,22.7%,0%,30%,8.3%
No information,53.8%,100%,11.5%,68.2%,100%,35%,50%
Works as a prophylactic,,,,,,,
Yes,38.5%,0%,11.5%,0%,0%,30%,37.5%
No,0%,0%,0%,0%,0%,0%,4.2%
No information,61.5%,100%,88.5%,100%,100%,70%,58.3%


# 2. Amazon Reviews Analysis

In [85]:
def create_chanca_piedra_df():
    """Create a DataFrame with Chanca piedra statistics from the separate CSV file."""
    chanca_df = pd.read_csv('csv-files/Kidney Stone Reviews - WebMD Amazon Chanca Piedra.csv')
    
    num_reviews = 1193
    # Create a DataFrame in the format expected by calculate_stats
    medicine_data = {
        'Medicine': ['Chanca piedra'] * num_reviews,
    }
    
    # Add columns based on the percentages in the CSV
    for col in columns_to_analyze:
        # Initialize arrays for each metric
        col_data = [0] * num_reviews
        
        # Get percentages from the CSV
        yes_pct = float(chanca_df[chanca_df['Metric'] == 'Yes']['Chanca piedra'].iloc[0].strip('%')) / 100
        no_pct = float(chanca_df[chanca_df['Metric'] == 'No']['Chanca piedra'].iloc[0].strip('%')) / 100
        
        # Calculate how many should be 1s and 0s based on percentages
        yes_count = int(num_reviews * yes_pct)
        no_count = int(num_reviews * no_pct)
        
        # Fill in the arrays
        col_data[:yes_count] = [1] * yes_count
        col_data[yes_count:yes_count + no_count] = [0] * no_count
        
        medicine_data[col] = col_data
    
    return pd.DataFrame(medicine_data)

# Modify your existing code:
dfa = pd.read_csv('csv-files/Kidney Stone Reviews - Reviews - Amazon.csv')

# First, remove the original Chanca piedra data
dfa = dfa[dfa['Medicine'] != 'Chanca piedra']  # Add this line

chanca_df = create_chanca_piedra_df()

# Combine the DataFrames
combined_df = pd.concat([dfa, chanca_df], ignore_index=True)

In [86]:
amazon_medicine_mapping = {
}

amazon_medicines_to_exclude = ['Chanca piedra']

# Remove Chanca piedra from exclusion list if it's there
if 'Chanca piedra' in amazon_medicines_to_exclude:
    amazon_medicines_to_exclude.remove('Chanca piedra')

# Generate and display statistics
display_df = calculate_stats(
    combined_df, 
    columns_to_analyze,
    medicines_to_exclude=amazon_medicines_to_exclude,
    medicine_mapping=amazon_medicine_mapping
)
display(style_dataframe(display_df))
tablename = "amazon-table-full-analysis"
display_df.to_html(figfolder + tablename + ".html")

Unnamed: 0,Chanca piedra,Phosfood,Potassium citrate,Rowatinex
,,,,
Total Reviews,1193,40,133,90
Helps overall with kidney stones,,,,
Yes,71.9%,75%,91%,90%
No,28.1%,10%,0%,2.2%
No information,0%,15%,9%,7.8%
Works as a prophylactic,,,,
Yes,71.9%,22.5%,66.9%,25.6%
No,28.1%,5%,0%,1.1%
No information,0%,72.5%,33.1%,73.3%


In [87]:
def calculate_stats_by_rating(df, columns_to_analyze, medicine_name, ratings, sources_to_exclude=None):
    """Calculate statistics for specific ratings of a medicine."""
    rating_dfs = {}
    
    for rating in ratings:
        filtered_df = df[(df['Medicine'] == medicine_name) & (df['Stars'] == rating)]
        
        if sources_to_exclude:
            filtered_df = filtered_df[~filtered_df['Source'].isin(sources_to_exclude)]
        
        summary = pd.DataFrame({
            'Medicine': [f'{medicine_name} ({rating}-star reviews)'],
            'total_reviews': [len(filtered_df)]
        })
        
        for col in columns_to_analyze:
            summary[f'{col}_positive'] = [sum(
                1 for val in filtered_df[col] if val == 1 or (isinstance(val, str) and val.strip())
            )]
            summary[f'{col}_negative'] = [sum(
                1 for val in filtered_df[col] if val == 0
            )]
            
            total = summary['total_reviews'].iloc[0]
            if total > 0:
                summary[f'{col}_positive_pct'] = (summary[f'{col}_positive'] / total * 100).round(1)
                summary[f'{col}_negative_pct'] = (summary[f'{col}_negative'] / total * 100).round(1)
                summary[f'{col}_no_info_pct'] = (
                    (total - summary[f'{col}_positive'] - summary[f'{col}_negative']) 
                    / total * 100
                ).round(1)
            else:
                summary[f'{col}_positive_pct'] = 0
                summary[f'{col}_negative_pct'] = 0
                summary[f'{col}_no_info_pct'] = 0
                
        rating_dfs[rating] = summary
    
    transposed_data = {
        ' ': ['<b>Total Reviews</b>'] + 
            sum([[f'<b>{col}</b>', 
                 '    Yes', 
                 '    No', 
                 '    No information'] 
                for col in columns_to_analyze], [])
    }
    
    for rating in ratings:
        summary = rating_dfs[rating]
        transposed_data[summary['Medicine'].iloc[0]] = [
            summary['total_reviews'].iloc[0]] + sum([[
                '',
                summary[f'{col}_positive_pct'].iloc[0],
                summary[f'{col}_negative_pct'].iloc[0],
                summary[f'{col}_no_info_pct'].iloc[0]
            ] for col in columns_to_analyze], [])
    
    return pd.DataFrame(transposed_data).set_index(' ')


In [88]:
sources_to_exclude = ['Alerna: Chanca Piedra 500mg', 'EU Natural: "Stone Breaker" chanca piedra', 'Herb Pharm Chanca Piedra Liquid Extract 1Fl Oz', 'Peruvian Naturals Chanca Piedra Capsules', 'Activa Naturals Chanca Piedra', 'Carlyle Chanca Piedra', 'CNP Organic Chanca Piedra Concentrate & Extract', 'Hanan Chanca Piedra Tea']
# sources_to_exclude = ['Alerna: Chanca Piedra 500mg', 'NaturalisimoLife Chanca Piedra 1600 mg', 'Herb Pharm Chanca Piedra Liquid Extract 1Fl Oz', 'Peruvian Naturals Chanca Piedra Capsules', 'Carlyle Chanca Piedra', 'CNP Organic Chanca Piedra Concentrate & Extract', 'Hanan Chanca Piedra Tea']
# sources_to_exclude = ['Alerna: Chanca Piedra 500mg', 'NaturalisimoLife Chanca Piedra 1600 mg', 'EU Natural: "Stone Breaker" chanca piedra', 'Peruvian Naturals Chanca Piedra Capsules', 'Carlyle Chanca Piedra', 'CNP Organic Chanca Piedra Concentrate & Extract', 'Hanan Chanca Piedra Tea']
# sources_to_exclude = []

ratings_display_df = calculate_stats_by_rating(
    dfa, 
    columns_to_analyze, 
    'Chanca piedra', 
    [1, 2, 3, 4, 5],
    sources_to_exclude=sources_to_exclude
)
display(style_dataframe(ratings_display_df))

Unnamed: 0,Chanca piedra (1-star reviews),Chanca piedra (2-star reviews),Chanca piedra (3-star reviews),Chanca piedra (4-star reviews),Chanca piedra (5-star reviews)
,,,,,
Total Reviews,0.0,0.0,0.0,0.0,0.0
Helps overall with kidney stones,,,,,
Yes,0.0,0.0,0.0,0.0,0.0
No,0.0,0.0,0.0,0.0,0.0
No information,0.0,0.0,0.0,0.0,0.0
Works as a prophylactic,,,,,
Yes,0.0,0.0,0.0,0.0,0.0
No,0.0,0.0,0.0,0.0,0.0
No information,0.0,0.0,0.0,0.0,0.0


# 3. Reddit posts

In [89]:
dfr = pd.read_csv('csv-files/Kidney Stone Reviews - Reviews - Reddit.csv')
print(f"Successfully loaded {len(dfr)} rows of data")

Successfully loaded 2308 rows of data


In [92]:
reddit_medicine_mapping = {
    'Hydrochlorothiazide': 'HCTZ',
}

reddit_medicines_to_exclude = []

# Generate and display statistics
display_df = calculate_stats(
    dfr, 
    columns_to_analyze,
    medicines_to_exclude=reddit_medicines_to_exclude,
    medicine_mapping=reddit_medicine_mapping
)
display(style_dataframe(display_df))
tablename = "reddit-table-full-analysis"
display_df.to_html(figfolder + tablename + ".html")

Unnamed: 0,Allopurinol,Black seed,Chanca piedra,Flomax,Garcinia,HCTZ,Phosfood,Potassium citrate,Rowatinex
,,,,,,,,,
Total Reviews,49,17,492,1134,38,46,2,509,21
Helps overall with kidney stones,,,,,,,,,
Yes,20.4%,23.5%,35.2%,24.6%,42.1%,32.6%,100%,22%,23.8%
No,2%,5.9%,9.8%,11.7%,0%,4.3%,0%,8.6%,14.3%
No information,77.6%,70.6%,55.1%,63.7%,57.9%,63%,0%,69.4%,61.9%
Works as a prophylactic,,,,,,,,,
Yes,18.4%,11.8%,4.5%,0.5%,13.2%,23.9%,0%,11.2%,0%
No,0%,0%,0.4%,0.1%,0%,0%,0%,2.4%,9.5%
No information,81.6%,88.2%,95.1%,99.4%,86.8%,76.1%,100%,86.4%,90.5%


# 4. Side effects

In [93]:
def create_category_distribution(df, use_counts=False):
    """Create a grouped bar plot comparing side effect category distributions across medicines.
    
    Args:
        df: DataFrame with the side effects data
        use_counts: If True, show actual counts. If False, show percentages.
    """
    all_data = []
    for product in sorted(df['Product'].unique()):
        product_data = df[df['Product'] == product]
        category_counts = product_data['Category'].value_counts()
        
        if use_counts:
            values = category_counts.values
        else:
            values = (category_counts / len(product_data)) * 100
            
        category_df = pd.DataFrame({
            'Category': category_counts.index,
            'Value': values,
            'Product': product
        })
        all_data.append(category_df)

    plot_df = pd.concat(all_data)
    
    fig = px.bar(plot_df, 
                 x='Category', 
                 y='Value',
                 color='Product',
                 barmode='group',
                 width=PLOT_PARAMS['width'] * 2.0,
                 height=PLOT_PARAMS['height'] * 2.0)

    # Determine title and axis labels based on the mode
    metric = "total count" if use_counts else "percentage"
    y_axis_label = "Number of Side Effects" if use_counts else "Percentage of Side Effects"
    
    fig.update_layout(
        title=dict(
            text=f'Distribution of side effect categories by product ({metric})',
            y=0.95,
            yanchor='bottom'
        ),
        xaxis_title= None,
        yaxis_title=y_axis_label,
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        # margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=dict(tickangle=45),
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-.7,
            xanchor='center',
            x=0.5,
            title=None
        )
    )

    for product_name in plot_df['Product'].unique():
        product_data = plot_df[plot_df['Product'] == product_name]
        hover_value = "Count: %{y}" if use_counts else "Percentage: %{y:.1f}%"
        fig.update_traces(
            customdata=product_data[['Product', 'Category']].values,
            hovertemplate=(
                "%{customdata[0]}<br>" +
                "Category: %{customdata[1]}<br>" +
                hover_value +
                "<extra></extra>"
            ),
            selector=dict(name=product_name)
        )
    
    return fig

def create_subcategory_distribution(df, category, use_counts=False):
    """Create a grouped bar plot for subcategories within a specific category.
    
    Args:
        df: DataFrame with the side effects data
        category: The main category to analyze
        use_counts: If True, show actual counts. If False, show percentages.
    """
    category_data = df[df['Category'] == category]
    
    all_data = []
    for product in sorted(category_data['Product'].unique()):
        product_data = category_data[category_data['Product'] == product]
        subcategory_counts = product_data['Subcategory'].value_counts()
        
        if use_counts:
            values = subcategory_counts.values
        else:
            # Calculate percentage relative to total side effects for the product
            values = (subcategory_counts / len(df[df['Product'] == product])) * 100
            
        subcategory_df = pd.DataFrame({
            'Subcategory': subcategory_counts.index,
            'Value': values,
            'Product': product
        })
        all_data.append(subcategory_df)

    plot_df = pd.concat(all_data)
    
    fig = px.bar(plot_df, 
                 x='Subcategory', 
                 y='Value',
                 color='Product',
                 barmode='group',
                 width=PLOT_PARAMS['width'] * 2.0,
                 height=PLOT_PARAMS['height'] * 2.0)

    # Determine title and axis labels based on the mode
    metric = "total count" if use_counts else "percentage"
    y_axis_label = "Number of Side Effects" if use_counts else "Percentage of Side Effects"
    
    fig.update_layout(
        title=dict(
            text=f'Distribution of {category} subcategories by product ({metric})',
            y=0.95,
            yanchor='bottom'
        ),
        xaxis_title=None,
        yaxis_title=y_axis_label,
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        # margin=PLOT_PARAMS['margin'],
        title_font_size=PLOT_PARAMS['title_font_size'],
        xaxis=dict(tickangle=45),
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-0.7,
            xanchor='center',
            x=0.5,
            title=None
        )
    )

    for product_name in plot_df['Product'].unique():
        product_data = plot_df[plot_df['Product'] == product_name]
        hover_value = "Count: %{y}" if use_counts else "Percentage: %{y:.1f}%"
        fig.update_traces(
            customdata=product_data[['Product', 'Subcategory']].values,
            hovertemplate=(
                "%{customdata[0]}<br>" +
                "Subcategory: %{customdata[1]}<br>" +
                hover_value +
                "<extra></extra>"
            ),
            selector=dict(name=product_name)
        )
    
    return fig

def sanitize_filename(category):
    """Convert category name to filename-friendly format."""
    # Replace forward slashes and other special characters with hyphens
    sanitized = category.lower().replace('/', '-').replace(' ', '-')
    # Remove any other potentially problematic characters
    sanitized = ''.join(c for c in sanitized if c.isalnum() or c == '-')
    return sanitized

PLOT_PARAMS = {
    'width': 400,
    'height': 200,
    'margin': dict(l=50, r=50, t=50, b=50),
    'title_font_size': 14
}

# Usage:
df = pd.read_csv('csv-files/Kidney Stone Reviews - Summary - Side effects.csv')
# Create main category distribution
use_counts = True
metric = "total-count" if use_counts else "percentage"
fig_categories = create_category_distribution(df, use_counts=use_counts)
fig_categories.show()
fig_categories.write_html(figfolder + "side-effects-category-distribution-" + metric + ".html")

for use_counts in [True, False]:
    metric = "total-count" if use_counts else "percentage"
    
    # Create and save main category distribution
    fig_categories = create_category_distribution(df, use_counts=use_counts)
    fig_categories.write_html(figfolder + f"side-effects-category-distribution-{metric}.html")
    
    # Get all unique categories and create plots for each
    categories = sorted(df['Category'].unique())
    
    for category in categories:
        # Create filename-friendly version of category name
        category_filename = sanitize_filename(category)
        
        # Create and save subcategory distribution
        fig_subcategories = create_subcategory_distribution(df, category, use_counts=use_counts)
        filename = f"side-effects-{category_filename}-subcategories-{metric}.html"
        # fig_subcategories.write_html(
        #     figfolder + f"side-effects-{category_filename}-subcategories-{metric}.html"
        # )

In [None]:
def create_severity_distribution(df):
    """Create a grouped bar plot comparing severity distributions across medicines.
    
    Args:
        df: DataFrame with the side effects data
    """
    all_data = []
    for product in sorted(df['Product'].unique()):
        product_data = df[df['Product'] == product]
        severity_counts = product_data['Severity'].value_counts()
        
        # Calculate percentages
        values = (severity_counts / len(product_data)) * 100
            
        severity_df = pd.DataFrame({
            'Severity': severity_counts.index,
            'Value': values,
            'Product': product
        })
        all_data.append(severity_df)

    plot_df = pd.concat(all_data)
    
    # Sort severity values to ensure consistent ordering
    plot_df['Severity'] = plot_df['Severity'].astype(str)
    
    fig = px.bar(plot_df, 
                 x='Severity', 
                 y='Value',
                 color='Product',
                 barmode='group',
                 width=PLOT_PARAMS['width'] * 2.0,
                 height=PLOT_PARAMS['height'] * 2.0)

    fig.update_layout(
        title=dict(
            text='Distribution of side effect severity by product (percentage)',
            y=0.95,
            yanchor='bottom'
        ),
        xaxis_title='Severity Level',
        yaxis_title='Percentage of Side Effects',
        template='plotly_white',
        bargap=0.15,
        bargroupgap=0.1,
        title_font_size=PLOT_PARAMS['title_font_size'],
        legend=dict(
            orientation='h',
            yanchor='top',
            y=-.7,
            xanchor='center',
            x=0.5,
            title=None
        )
    )

    for product_name in plot_df['Product'].unique():
        product_data = plot_df[plot_df['Product'] == product_name]
        fig.update_traces(
            customdata=product_data[['Product', 'Severity']].values,
            hovertemplate=(
                "%{customdata[0]}<br>" +
                "Severity: %{customdata[1]}<br>" +
                "Percentage: %{y:.1f}%" +
                "<extra></extra>"
            ),
            selector=dict(name=product_name)
        )
    
    return fig

# Usage:
df = pd.read_csv('csv-files/Kidney Stone Reviews - Summary - Side effects.csv')
fig_severity = create_severity_distribution(df)
fig_severity.show()
# fig_severity.write_html(figfolder + "side-effects-severity-distribution.html")

In [10]:
def create_sunburst_distribution(df, product, use_counts=False):
    """Create a sunburst chart showing the distribution of side effects for a single product."""
    # Filter data for the specific product
    product_data = df[df['Product'] == product]
    
    # Create hierarchy data
    categories = product_data.groupby('Category')['Side effect'].count().reset_index()
    subcategories = product_data.groupby(['Category', 'Subcategory'])['Side effect'].count().reset_index()
    
    # Calculate values based on use_counts parameter
    total_effects = len(product_data)
    if not use_counts:
        categories['Side effect'] = categories['Side effect'] / total_effects * 100
        subcategories['Side effect'] = subcategories['Side effect'] / total_effects * 100
    
    # Prepare data for sunburst chart, ensuring unique names
    # center_label = f"{product} side effects"
    center_label = ""
    
    # Modify labels to ensure uniqueness between categories and subcategories
    subcategories_modified = subcategories.copy()
    subcategories_modified.loc[subcategories_modified['Subcategory'] == 'Other', 'Subcategory'] = \
        subcategories_modified.loc[subcategories_modified['Subcategory'] == 'Other', 'Category'] + ' - Other'
    
    labels = [center_label] + \
            categories['Category'].tolist() + \
            subcategories_modified['Subcategory'].tolist()
            
    parents = [''] + \
             [center_label] * len(categories) + \
             subcategories_modified['Category'].tolist()
             
    values = [total_effects if use_counts else 100] + \
            categories['Side effect'].tolist() + \
            subcategories['Side effect'].tolist()
    
    # Create figure
    fig = go.Figure(go.Sunburst(
        labels=labels,
        parents=parents,
        values=values,
        branchvalues='total',
        hovertemplate=('Category: %{label}<br>' +
                      ('Count: %{value:,.0f}' if use_counts else 'Percentage: %{value:.1f}%') +
                      '<extra></extra>'),
    ))
    
    metric = "counts" if use_counts else "percentage"
    
    fig.update_layout(
        title=dict(
            # text=f'Distribution of side effects for {product}',
            text=None,
            y=0.95,
            yanchor='bottom'
        ),
        width=PLOT_PARAMS['width'] * 2.0,
        height=PLOT_PARAMS['height'] * 2.0,
        margin=PLOT_PARAMS['margin'],
        template='plotly_white',
        title_font_size=PLOT_PARAMS['title_font_size'],
        sunburstcolorway=px.colors.qualitative.Prism
    )
    
    return fig

PLOT_PARAMS = {
    'width': 400,
    'height': 400,
    'margin': dict(l=50, r=50, t=50, b=50),  # increased top margin from 50 to 100
    'title_font_size': 14
}
# Usage:
for product in df['Product'].unique():
    fig = create_sunburst_distribution(df, product, use_counts=True)
    if fig is not None:
        product_file = product.lower().replace(' ', '-')
        filename = f"side-effects-sunburst-counts-{product_file}.html"
        # fig.show()
        fig.write_html(figfolder + filename)

In [71]:
# Create the data
data_10 = {
    'Name': [
        'Cetylated Fatty Acids (CFAs)',
        'Banaba',
        'Black Walnut',
        'Cascara Sagrada',
        'Holy Basil',
        'Marshmallow',
        'Palmitoylethanolamide (PEA)',
        'Apricot Kernel',
        'Arrowroot',
        'Carbon 60 (C60)'
    ],
    'Overall Rating': [5.0, 4.9, 4.9, 4.8, 4.8, 4.8, 4.8, 4.7, 4.7, 4.7],
    'Num. Reviews': [13, 11, 11, 31, 50, 13, 18, 37, 10, 21]
}

data_50 = {
    'Name': [
        'Holy Basil',
        'Elderberry',
        'Lysine',
        'Witch Hazel',
        'Chanca Piedra',
        'Colloidal Silver',
        'Kratom',
        'Mullein',
        'Taurine',
        'Butterbur'
    ],
    'Overall Rating': [4.8, 4.7, 4.7, 4.7, 4.6, 4.6, 4.6, 4.6, 4.6, 4.5],
    'Num. Reviews': [50, 50, 419, 60, 113, 185, 1727, 60, 56, 74]
}

data_100 = {
    'Name': [
        'Lysine',
        'Chanca Piedra',
        'Colloidal Silver',
        'Kratom',
        'D-Mannose',
        'Evening Primrose Oil',
        'Methylsulfonylmethane (MSM)',
        'Oscillococcinum',
        'Dimethylsulfoxide (DMSO)',
        'N-Acetyl Cysteine (NAC)'
    ],
    'Overall Rating': [4.7, 4.6, 4.6, 4.6, 4.5, 4.5, 4.5, 4.5, 4.4, 4.4],
    'Num. Reviews': [419, 113, 185, 1727, 238, 132, 197, 203, 103, 120]
}

min_reviews = 100
variable_name = f"data_{min_reviews}"  # This creates the string "data_10"
df = pd.DataFrame(globals()[variable_name]).reset_index(drop=True)  # This looks up the variable named "data_10"

def style_dataframe(df):
    """Apply consistent styling to the dataframe."""
    return df.style\
        .set_table_styles([
            {'selector': '',
             'props': [('color', '#000000')]},  # Using black text like your other tables
        ])\
        .format({
            'Overall Rating': '{:.1f}',  # Format to 1 decimal place
            'Num. Reviews': '{:,}'       # Add thousands separator if needed
        }).hide(axis='index')

# Style and save the table
styled_df = style_dataframe(df)
styled_df.to_html(figfolder + 'supplements-rating-table-' + str(min_reviews) + '.html', index=False)
display(styled_df)

Name,Overall Rating,Num. Reviews
Lysine,4.7,419
Chanca Piedra,4.6,113
Colloidal Silver,4.6,185
Kratom,4.6,1727
D-Mannose,4.5,238
Evening Primrose Oil,4.5,132
Methylsulfonylmethane (MSM),4.5,197
Oscillococcinum,4.5,203
Dimethylsulfoxide (DMSO),4.4,103
N-Acetyl Cysteine (NAC),4.4,120
