In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Read the raw semicolon-delimited export with header=None so that every line,
# including the leading header rows, is kept as an ordinary data row.
file_path = "/Users/javanmardi/Work/IGSB/Bone2Gene_Survey_Study/3_progressive_results/B2G Survey_9_29_2024.csv"
initial_df = pd.read_csv(
    file_path,
    header=None,
    sep=';',
)

In [None]:
# Row 0 of the raw export holds the column codes and row 1 the matching text
# (named `questions` here); pair them up so a code can be looked up later.
headers, questions = initial_df.iloc[0], initial_df.iloc[1]
header_question_dict = {code: text for code, text in zip(headers, questions)}

In [None]:
# Re-read the CSV for analysis: the first line becomes the header, and the
# second and third lines of the file (row indices 1 and 2) are skipped.
data_df = pd.read_csv(
    file_path,
    sep=';',
    header=0,
    skiprows=[1, 2],
)

# Preview the first rows to verify the load
data_df.head()

In [None]:
# Keep only the rows whose 'Finished' value equals True
finished_mask = data_df['Finished'].eq(True)
finished_df = data_df[finished_mask]

# (rows, columns) of the filtered frame
finished_df.shape

In [None]:
# Boolean Series indexed by column name: True where every value is missing
empty_columns = finished_df.isna().all()

# Remove those all-missing columns; `data` is the frame used from here on
columns_to_drop = [name for name, is_empty in empty_columns.items() if is_empty]
data = finished_df.drop(columns=columns_to_drop)

In [None]:
# Show the second-row text stored for the 'Q19' column code
header_question_dict['Q19']

In [None]:
# Histogram over the raw Q19 values (each distinct answer string is its own bar)
fig = px.histogram(data_frame=data, x='Q19')
fig.show()

In [None]:
def calculate_frequency(df, column_name):
    """Count how often each individual option occurs in a multi-select column.

    Each cell of ``df[column_name]`` is treated as a comma-separated list of
    options. The cells are split, surrounding whitespace is removed from every
    option, and the occurrences are tallied across all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the answers.
    column_name : str
        Name of the column holding the comma-separated answers.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Option'`` and ``'Frequency'``, ordered from the most
        to the least frequent option. Missing cells and empty options are
        not counted. The frame is empty when there is nothing to count.

    Notes
    -----
    An option whose own label contains a comma cannot be told apart from two
    separate options and will be split as well.
    """
    # Drop missing answers first so an all-NaN column does not break `.str`
    answers = df[column_name].dropna().astype(str)

    # One row per selected option; strip so "a, b" and "a,b" agree
    all_options = answers.str.split(',').explode().str.strip()

    # Ignore empty fragments such as those produced by "" or a trailing comma
    all_options = all_options[all_options.ne('')]

    # value_counts() already orders from most to least frequent
    frequency = (
        all_options
        .value_counts()
        .rename_axis('Option')
        .reset_index(name='Frequency')
    )
    return frequency


# Tally the individual Q19 options and order them from most to least frequent
frequency_df = (
    calculate_frequency(data, 'Q19')
    .sort_values(by='Frequency', ascending=False)
)

In [None]:
# Display the per-option frequency table
frequency_df

In [None]:
# Horizontal bar chart: one bar per option, bar length = its frequency
fig = px.bar(
    data_frame=frequency_df,
    x="Frequency",
    y="Option",
    orientation='h',
)

# Order the y-axis categories by their bar totals and fix the figure size
fig.update_layout(
    yaxis_categoryorder='total ascending',
    width=800,
    height=600,
)

fig.show()

# Plot

In [None]:
import plotly.graph_objects as go

In [None]:
# Horizontal bars of the option frequencies, with each count printed on its bar
age_group_bars = go.Bar(
    x=frequency_df['Frequency'],
    y=frequency_df['Option'],
    orientation='h',
    showlegend=False,
    text=frequency_df['Frequency'],
    textposition='auto',
    textfont=dict(size=24),
)
fig = go.Figure(age_group_bars)

# Explicit category order for the y axis (instead of ordering by frequency)
bar_chart_1_order = [
    'Greater than 18 years old',
    '10 to 18 years old',
    '1 to 10 years old',
    'Neonates and infants (0-1 year old)',
]

# The x axis is hidden (visible=False); the counts are shown on the bars instead
fig.update_xaxes(
    showgrid=True,
    zeroline=True,
    showline=True,
    linewidth=1,
    linecolor='black',
    mirror=False,
    tickfont=dict(size=24),
    visible=False,
)
fig.update_yaxes(
    categoryorder='array',
    categoryarray=bar_chart_1_order,
    showgrid=True,
    zeroline=True,
    showline=False,
    linewidth=1,
    linecolor='black',
    mirror=False,
    tickfont=dict(size=24),
)

# Figure size, white background and a centred title
fig.update_layout(
    width=800,
    height=600,
    paper_bgcolor="white",
    plot_bgcolor="white",
    title_text="Patients Age Groups",
    title_x=0.5,
    title_font=dict(size=30),
    margin=dict(t=50, l=50, r=50, b=50),
)

fig.show()

# Export a static copy of the figure next to the notebook
fig.write_image("Patients_age_group.png", width=800, height=600, scale=6)

# Other approaches

In [None]:
# Count how often each distinct, unsplit Q19 answer string occurs
Q19_value_counts = (
    data['Q19']
    .value_counts()
    .rename_axis('Value')
    .reset_index(name='Count')
)
Q19_value_counts

In [None]:
# Save the Q19 answer counts as comma-separated text. The target folder is
# created first so the write does not fail when 'Scratch' does not exist yet.
q19_counts_path = Path('Scratch/patient_age_group_responses.txt')
q19_counts_path.parent.mkdir(parents=True, exist_ok=True)
Q19_value_counts.to_csv(q19_counts_path, index=False)

In [None]:
# import plotly.io as pio

# # Set the renderer to 'notebook' or 'iframe' for Jupyter
# pio.renderers.default = 'png'

### Try Treemap 

In [None]:
import pandas as pd
import plotly.express as px

# Hard-coded answer combinations for the age-group question and the number of
# respondents per combination.
# NOTE(review): presumably a snapshot of `Q19_value_counts` -- confirm, or build
# this frame from `Q19_value_counts` so it cannot drift from the survey data.
# Stored under its own name so that `data` (the cleaned survey DataFrame used by
# the earlier cells) is not replaced by a dict when this cell runs.
treemap_counts = {
    'Value': [
        "Neonates and infants (0-1 year old),1 to 10 years old,10 to 18 years old,Greater than 18 years old",
        "Neonates and infants (0-1 year old),1 to 10 years old,10 to 18 years old",
        "Greater than 18 years old",
        "10 to 18 years old,Greater than 18 years old",
        "1 to 10 years old,10 to 18 years old",
        "Neonates and infants (0-1 year old),Greater than 18 years old",
        "1 to 10 years old,10 to 18 years old,Greater than 18 years old",
        "Neonates and infants (0-1 year old)",
        "10 to 18 years old"
    ],
    'Count': [50, 34, 7, 5, 4, 1, 1, 1, 1]
}

df = pd.DataFrame(treemap_counts)

# Turn each combination string into a list of age groups; the position of an
# age group in the list becomes its level in the treemap hierarchy.
df['Value'] = df['Value'].str.split(',')

# One column per hierarchy level. Combinations with fewer than four age groups
# are padded with '' so that every row has a value at every level.
level_columns = ['Level_1', 'Level_2', 'Level_3', 'Level_4']
for depth, level_column in enumerate(level_columns):
    df[level_column] = df['Value'].apply(
        # Bind `depth` as a default so each lambda keeps its own level index
        lambda groups, depth=depth: groups[depth] if len(groups) > depth else ''
    )

# Build the treemap: nesting follows the level columns, area follows 'Count'
fig = px.treemap(df,
                 path=level_columns,
                 values='Count',
                 title="Age Group Combinations in Survey Responses")

# Show the plot and export a static copy next to the notebook
fig.show()
fig.write_image("Patients_age_group_Treemap.png", width=800, height=600, scale=6)