In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import plugins
import branca.colormap as cm
import json
import matplotlib.colors as mcolors
import ipywidgets as widgets
from IPython.display import display
import plotly.graph_objects as go
from scipy import stats


In [56]:
df=pd.read_csv('../data/etw-gosaba.csv',encoding='utf-8')

In [57]:

with open('../data/pp_villages_pc11.geojson', 'r') as f:
    geojson_data = json.load(f)

In [58]:
import matplotlib.colors as mcolors

def rgba_to_hex(rgba):
    """Convert RGBA tuple to hex color string."""
    return mcolors.to_hex(rgba)

In [59]:


exclude_cols = ['_বাড়ির অবস্থান _latitude', '_বাড়ির অবস্থান _longitude','_বাড়ির অবস্থান _precision', '_বাড়ির অবস্থান _altitude',
                '_id', '_uuid', '_submission_time', '_validation_status',
                '_notes', '_status', '_submitted_by', '__version__', '_tags', '_index','start','end']
plot_columns = [col for col in df.columns if col not in exclude_cols]

def create_color_map(data):
    """Create a color map based on data type"""
    if pd.api.types.is_numeric_dtype(data):
        # Calculate quartiles and IQR
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1

        # Define outlier boundaries
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Create bounds using a combination of outlier boundaries and percentiles
        bounds = [
            data.min(),  # Minimum value
            lower_bound,  # Lower outlier boundary
            Q1,          # 25th percentile
            data.median(),  # Median
            Q3,          # 75th percentile
            upper_bound, # Upper outlier boundary
            data.max()   # Maximum value
        ]
        bounds = sorted(list(set([round(b, 2) for b in bounds])))  # Remove duplicates and round

        # Create colormap with more distinct colors
        colormap = cm.StepColormap(
            colors=['#313695', '#4575b4', '#74add1', '#abd9e9', '#fdae61', '#f46d43', '#d73027'],
            vmin=bounds[0],
            vmax=bounds[-1],
            index=bounds,
            caption=(f'Min: {bounds[0]:.2f} | '
                    f'Q1: {Q1:.2f} | '
                    f'Median: {data.median():.2f} | '
                    f'Q3: {Q3:.2f} | '
                    f'Max: {bounds[-1]:.2f}\n'
                    f'(IQR: {IQR:.2f})')
        )
        return colormap
    else:
        # For categorical data, create a categorical color map
        unique_values = data.value_counts()
        if len(unique_values) <= 20:
            top_categories = unique_values
        else:
            top_categories = unique_values.head(20)

        distinct_colors = [
            '#e6194B', '#3cb44b', '#4363d8', '#f58231', '#911eb4',
            '#42d4f4', '#f032e6', '#bfef45', '#fabed4', '#469990',
            '#dcbeff', '#9A6324', '#fffac8', '#800000', '#aaffc3',
            '#808000', '#ffd8b1', '#000075', '#a9a9a9', '#000000'
        ]

        color_dict = dict(zip(top_categories.index, distinct_colors[:len(top_categories)]))
        if len(unique_values) > 20:
            color_dict['Others'] = '#808080'

        return color_dict, top_categories

def update_map(column):
    # Create base map centered on mean coordinates
    df_cleaned = df.dropna(subset=[column])
    m = folium.Map(
        location=[df_cleaned['_টিউবওয়েলের/ নলকূপের অবস্থান_latitude'].mean(),
                 df_cleaned['_টিউবওয়েলের/ নলকূপের অবস্থান_longitude'].mean()],
        zoom_start=12,
        prefer_canvas=True
    )
    # Add GeoJSON layer
    style_function = lambda x: {
        'fillColor': 'none',
        'color': 'black',
        'weight': 1,
        'fillOpacity': 0
    }

    tooltip = folium.GeoJsonTooltip(
        fields=['tv_name'],
        aliases=['Village:'],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
    )

    geojson_layer = folium.GeoJson(
        geojson_data,
        name='Village Boundaries',
        style_function=style_function,
        tooltip=tooltip
    )
    geojson_layer.add_to(m)

    # Create color map
    if pd.api.types.is_numeric_dtype(df_cleaned[column]):
        color_map = create_color_map(df_cleaned[column])
        top_categories = None  # Not needed for numeric data
    else:
        color_map, top_categories = create_color_map(df_cleaned[column])

    # Add points to map
    for idx, row in df_cleaned.iterrows():
        if pd.api.types.is_numeric_dtype(df_cleaned[column]):
            color = color_map(row[column])
        else:
            # Use get() method with 'Others' as default for any category not in top 20
            color = color_map.get(row[column], '#808080')

        popup_text = f"UUID : {row['_uuid']}<br>{column}: {row[column]}"

        folium.CircleMarker(
            location=[row['_টিউবওয়েলের/ নলকূপের অবস্থান_latitude'],
                     row['_টিউবওয়েলের/ নলকূপের অবস্থান_longitude']],
            radius=5,
            popup=popup_text,
            color=color,
            fill=True
        ).add_to(m)

    # Add color map to the map if numeric
    if pd.api.types.is_numeric_dtype(df_cleaned[column]):
        # Create a custom legend for numeric data
        legend_html = f'''
            <div style="position: fixed; 
                        bottom: 50px; right: 50px; 
                        width: 200px; 
                        border:2px solid grey; 
                        z-index:9999; 
                        background-color:white;
                        padding:10px; 
                        font-size:12px;">
            <p style="font-size:14px"><b>{column}</b></p>
            '''
        
        # Add the numeric ranges to the legend
        for i in range(len(color_map.index)-1):
            start = color_map.index[i]
            end = color_map.index[i+1]
            color = rgba_to_hex(color_map.colors[i])
            legend_html += f'<p><span style="color:{color};">●</span> {start:.2f} - {end:.2f}</p>'
        
        legend_html += '</div>'
        m.get_root().html.add_child(folium.Element(legend_html))
        color_map.add_to(m)
    else:
        # Add legend for categorical data
        legend_html = f'''
            <div style="position: fixed;
                        bottom: 50px; right: 50px; width: 200px; max-height: 500px;
                        border:2px solid grey; z-index:9999; background-color:white;
                        padding:10px; overflow-y: auto; font-size:12px;">
            <p style="font-size:14px"><b>{column}</b></p>
            <p style="font-size:10px">Showing top 20 categories by frequency</p>
            '''

        # Add top categories with counts
        for value, count in top_categories.items():
            color = color_map[value]
            legend_html += f'<p><span style="color:{color};">●</span> {value} ({count})</p>'

        # Add 'Others' category if there are more categories
        if len(df_cleaned[column].unique()) > 20:
            others_count = df_cleaned[column].value_counts().iloc[20:].sum()
            legend_html += f'<p><span style="color:#808080;">●</span> Others ({others_count})</p>'

        legend_html += '</div>'
        m.get_root().html.add_child(folium.Element(legend_html))

    # Add layer control
    folium.LayerControl().add_to(m)

    display(m)

# Create dropdown widget
dropdown = widgets.Dropdown(
    options=plot_columns,
    description='Color by:',
    style={'description_width': 'initial'}
)

# Connect the dropdown to the function
widgets.interactive(update_map, column=dropdown)

interactive(children=(Dropdown(description='Color by:', options=('ব্লকের নাম', 'গ্রাম পঞ্চায়েতের নাম', 'প্রত্…

In [60]:
# Get numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Create two dropdown widgets
dropdown1 = widgets.Dropdown(
    options=numeric_cols,
    description='X-axis:',
    style={'description_width': 'initial'}
)

dropdown2 = widgets.Dropdown(
    options=numeric_cols,
    description='Y-axis:',
    style={'description_width': 'initial'}
)

# Create an Output widget to display the plot
output = widgets.Output()

# Initialize the figure
fig = go.FigureWidget()
fig.update_layout(
    width=800,
    height=600,
    showlegend=True
)

def update_plot(change):
    x_col = dropdown1.value
    y_col = dropdown2.value

    with output:
        output.clear_output(wait=True)

        # Remove NaN values
        mask = ~(df[x_col].isna() | df[y_col].isna())
        df_clean = df[mask]

        # Calculate regression line
        slope, intercept, r_value, p_value, std_err = stats.linregress(df_clean[x_col], df_clean[y_col])

        # Create line of best fit points
        x_range = df_clean[x_col]
        y_range = slope * x_range + intercept

        # Update the figure data
        fig.data = []  # Clear existing traces

        # Add scatter points
        fig.add_trace(
            go.Scatter(
                x=df_clean[x_col],
                y=df_clean[y_col],
                mode='markers',
                name='Data Points',
                marker=dict(
                    size=8,
                    opacity=0.6,
                    color='blue'
                ),
                hovertemplate=
                f"UUID: %{{customdata}}<br>" +
                f"{x_col}: %{{x}}<br>" +
                f"{y_col}: %{{y}}<br>" +
                "<extra></extra>",  # This removes the secondary box
                customdata=df_clean['_uuid']
            )
        )

        # Add regression line
        fig.add_trace(
            go.Scatter(
                x=x_range,
                y=y_range,
                mode='lines',
                name=f'Regression Line (R² = {r_value**2:.3f})',
                line=dict(color='red'),
                hovertemplate=f"{x_col}: %{{x}}<br>{y_col}: %{{y}}<extra></extra>"
            )
        )

        # Update layout
        fig.update_layout(
            title=f'Scatter Plot: {x_col} vs {y_col}',
            xaxis_title=x_col,
            yaxis_title=y_col,
            hovermode='closest'
        )

        # Print regression statistics
        print(f"Regression Statistics:")
        print(f"Number of points: {len(df_clean)}")
        print(f"Slope: {slope:.3f}")
        print(f"Intercept: {intercept:.3f}")
        print(f"R-squared: {r_value**2:.3f}")
        print(f"P-value: {p_value:.3e}")

        display(fig)

# Connect the update function to both dropdowns
dropdown1.observe(update_plot, names='value')
dropdown2.observe(update_plot, names='value')

# Create the initial plot
update_plot(None)

# Display the widgets and output
display(widgets.VBox([dropdown1, dropdown2, output]))

VBox(children=(Dropdown(description='X-axis:', options=(' সংসদ নম্বর', '_টিউবওয়েলের/ নলকূপের অবস্থান_latitude…

In [54]:
import requests

url = 'https://kf.kobotoolbox.org/api/v2/assets/awnkrAQo57AG8PeDLFQSc8/data.json'
headers = {'Authorization': 'Token a8e01315f733f8f47972dd31be8a1d98c4f37eb2'}
response = requests.get(url, headers=headers)
data = response.json()
print(json.dumps(data, indent=4, ensure_ascii=False))  # Use ensure_ascii=False for non-ASCII characters


ConnectionError: HTTPSConnectionPool(host='kf.kobotoolbox.org', port=443): Max retries exceeded with url: /api/v2/assets/awnkrAQo57AG8PeDLFQSc8/data.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x10adec050>: Failed to resolve 'kf.kobotoolbox.org' ([Errno 8] nodename nor servname provided, or not known)"))