---

# V-Med Pro Analytics Demo Pipeline (Self-Contained Streamlit Version)
File Name: analytics_app.py


-----

In [None]:
%%writefile analytics_app.py

import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr, ttest_ind
import warnings

# Silence library warnings so the demo UI stays uncluttered.
# NOTE: this filter is process-wide and hides *every* warning category,
# including deprecation notices from pandas / seaborn / scikit-learn.
warnings.filterwarnings("ignore")

# Configure the page before any other Streamlit output is emitted.
st.set_page_config(page_title="V-Med Pro Analytics Demo", layout="wide")
# The title/subtitle literals were mis-decoded (UTF-8 read as Mac Roman);
# the intended en dash, arrows and emoji are restored here.
st.title("VIZITECH–ASU Analytics Demo Pipeline 🧠")
st.markdown("### Automated Data Cleaning → Descriptive → Hypothesis → Predictive Flow")

---

# 1. DATA GENERATION (Replaced Load)

----

In [None]:
def generate_synthetic_data(n_rows=100):
    """Generate a reproducible synthetic survey dataset.

    Args:
        n_rows: Number of rows (survey responses) to generate.

    Returns:
        pandas.DataFrame with ``n_rows`` rows and ten columns:
        two categorical columns (``Institution_Type``, ``Region``), five
        feature columns (``Q1_*`` .. ``Q5_*``), and the analysis columns
        ``Satisfaction``, ``Likelihood_to_Adopt`` and ``Group``.
        About 5% of ``Q1_Infrastructure_Score`` and ``Q4_Budget_Allocation``
        and about 3% of ``Region`` are set to NaN.

    Note:
        The function reseeds NumPy's global random state with 42 on every
        call, so repeated calls with the same ``n_rows`` return identical
        frames. The row sampling below has no ``random_state`` of its own
        and therefore draws from that same seeded global state.
    """
    np.random.seed(42)

    # The order of the entries matters: each call consumes the seeded random
    # stream, so reordering them would change the generated values.
    data = {
        # General Survey Info
        'Institution_Type': np.random.choice(['University', 'Community College', 'Vocational School'], n_rows),
        'Region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
        # Numeric Feature Columns (scaled 1-5 or 0-1)
        'Q1_Infrastructure_Score': np.random.randint(1, 6, n_rows),
        'Q2_Usability_Score': np.random.randint(1, 6, n_rows),
        'Q3_Adoption_Experience': np.random.randint(1, 6, n_rows),
        'Q4_Budget_Allocation': np.random.uniform(0, 1, n_rows),
        'Q5_Training_Hours': np.random.uniform(10, 50, n_rows),
        # Hypothesis & Prediction Target Columns
        'Satisfaction': np.random.uniform(3, 5, n_rows),
        'Likelihood_to_Adopt': np.random.uniform(0, 1, n_rows),
        'Group': np.random.choice([0, 1], n_rows, p=[0.6, 0.4])  # 0=HEIs, 1=Professionals
    }

    df = pd.DataFrame(data)

    # randint produces an integer column, which cannot hold NaN. Cast it to
    # float explicitly instead of relying on pandas' implicit upcast during
    # assignment (deprecated behaviour); the resulting values are identical.
    df['Q1_Infrastructure_Score'] = df['Q1_Infrastructure_Score'].astype(float)

    # Introduce some realistic missing data for the cleaning step to work
    for col in ['Q1_Infrastructure_Score', 'Q4_Budget_Allocation']:
        df.loc[df.sample(frac=0.05).index, col] = np.nan
    df.loc[df.sample(frac=0.03).index, 'Region'] = np.nan

    return df

st.subheader("📥 Step 1: Load/Generate Data")

# Only the generator call is guarded, so the "Error generating dataset"
# message is never shown for an unrelated rendering failure.
try:
    # Use synthetic data instead of file path
    df = generate_synthetic_data(n_rows=200)
except Exception as e:
    # Top-level boundary of the script: report the failure in the UI and
    # halt, because every later step needs `df`.
    st.error(f"Error generating dataset: {e}")
    st.stop()
else:
    st.write("**Synthetic Survey Data (first few rows):**")
    st.dataframe(df.head())
    st.info("💡 Data loaded from a synthetic generator for universal execution.")


---

# 2. DATA CLEANING

----

In [None]:
st.subheader("🧹 Step 2: Data Cleaning")

st.markdown("**Handling Missing Values (Mean / Median / Mode Imputation)**")

# Plain int so the metric widgets receive a built-in number type.
missing_before = int(df.isna().sum().sum())

# Impute column by column: numeric -> median, everything else -> mode
# (falling back to "Unknown" when the column has no mode, i.e. is all-NaN).
# The result is assigned back with `df[col] = ...` rather than calling
# `df[col].fillna(..., inplace=True)`: the latter operates on an intermediate
# object and leaves `df` untouched under pandas Copy-on-Write.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else "Unknown"
        df[col] = df[col].fillna(fill_value)

missing_after = int(df.isna().sum().sum())

col1, col2 = st.columns(2)
col1.metric("Missing Values (Before)", missing_before)
col2.metric("Missing Values (After)", missing_after)

# Outlier handling (IQR method): clip each numeric column to
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
st.markdown("**Outlier Detection (IQR Method)**")
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # With a zero IQR both fences collapse onto a single value, so
        # clipping would flatten the whole column (e.g. a heavily imbalanced
        # 0/1 flag). Leave such columns unchanged.
        continue
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower, upper=upper)

# Encoding categorical variables
st.markdown("**Encoding Categorical Columns**")
label_enc = LabelEncoder()
# Encode every non-numeric column (not only dtype `object`), because the
# scaler below requires an all-numeric frame.
categorical_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
for col in categorical_cols:
    # Ensure all values are strings before encoding; the encoder is refit
    # for each column.
    df[col] = label_enc.fit_transform(df[col].astype(str))

# Normalize the entire dataframe, including the newly encoded columns.
# The original index is kept so rows of `df_scaled` line up with `df`.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

st.success("✅ Data cleaned, encoded, and normalized successfully!")
st.dataframe(df_scaled.head())

---

# 3. DESCRIPTIVE ANALYTICS

----

In [None]:
st.subheader("📊 Step 3: Descriptive Analytics")

st.markdown("**Central Tendency & Dispersion Statistics**")
st.dataframe(df_scaled.describe().T)

# -----
# Visualizations
# -----
st.markdown("**Visualizations (ASU Maroon & Gold Theme)**")

# Take the first 5 numeric columns (in column order) for demo plotting,
# leaving out the target/group columns used by the later steps.
numeric_cols_all = df_scaled.select_dtypes(include=np.number).columns.tolist()
feature_cols = [col for col in numeric_cols_all if col not in ['Satisfaction', 'Likelihood_to_Adopt', 'Group']][:5]

if feature_cols:
    # Overview figure: boxplots on the left, overlaid histograms on the right.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Alternate maroon / gold, with exactly one colour per plotted column so
    # the palette length always matches the number of boxes.
    box_palette = [("#8C1D40", "#FFC627")[i % 2] for i in range(len(feature_cols))]
    sns.boxplot(data=df_scaled[feature_cols], ax=axes[0], palette=box_palette)
    axes[0].set_title("Boxplot: Outlier Spread (Top 5 Variables)", fontsize=10)
    axes[0].tick_params(axis='x', rotation=45)

    # Histogram
    df_scaled[feature_cols].plot(kind='hist', bins=10, alpha=0.7, ax=axes[1],
                                 legend=False, color="#8C1D40")
    axes[1].set_title("Histogram: Frequency Distribution", fontsize=10)
    axes[1].set_xlabel("Scaled Values")

    st.pyplot(fig)
    # Streamlit re-executes the script on every interaction; close the figure
    # so pyplot does not keep accumulating open figures across reruns.
    plt.close(fig)

    # Optional: Interactive single variable view
    st.markdown("**Interactive Variable Visualization**")
    selected_col = st.selectbox("Choose a numeric column to visualize:", feature_cols)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    sns.boxplot(y=df_scaled[selected_col], color="#FFC627", ax=ax[0])
    ax[0].set_title(f"Boxplot: {selected_col}")

    sns.histplot(df_scaled[selected_col], kde=True, color="#8C1D40", ax=ax[1])
    ax[1].set_title(f"Histogram: {selected_col}")

    st.pyplot(fig)
    plt.close(fig)
else:
    st.warning("Not enough numeric columns available for visualization after encoding.")

# Correlation heatmap
st.markdown("**Correlation Heatmap**")
if not df_scaled.empty:
    # The figure is created only on the branch that renders it, so the empty
    # case does not leave an unused figure open.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(df_scaled.corr(), annot=True, cmap="YlOrRd", fmt=".2f", ax=ax)
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)
else:
    st.warning("Cannot generate heatmap, scaled data is empty.")

----

# 4. HYPOTHESIS TESTING

----

In [None]:
st.subheader("📈 Step 4: Hypothesis Testing (Demo)")

# The two trailing spaces before the first "\n" are a Markdown hard line
# break; they are written explicitly so an editor that strips trailing
# whitespace cannot silently merge the two hypotheses onto one line.
st.markdown(
    "\n"
    "**H₁:** Satisfaction ↔ Likelihood_to_Adopt (Spearman Correlation)  \n"
    "**H₂:** HEIs vs Professionals (Independent t-Test on Adoption Likelihood)\n"
)

# Check for existence of required columns
required_cols = ['Satisfaction', 'Likelihood_to_Adopt', 'Group']
missing_cols = [col for col in required_cols if col not in df_scaled.columns]
if not missing_cols:
    # H1: rank correlation between the two (scaled) columns.
    corr, p_corr = spearmanr(df_scaled['Satisfaction'], df_scaled['Likelihood_to_Adopt'])
    st.write(f"**Spearman Correlation (H₁):** ρ = {corr:.3f}, p = {p_corr:.3f}")

    # H2: independent t-test on Likelihood_to_Adopt between the rows where
    # Group equals 0 and the rows where Group equals 1.
    group0 = df_scaled[df_scaled['Group'] == 0]['Likelihood_to_Adopt']
    group1 = df_scaled[df_scaled['Group'] == 1]['Likelihood_to_Adopt']

    # Each group needs at least two observations for the test to run.
    if len(group0) > 1 and len(group1) > 1:
        tstat, pval = ttest_ind(group0, group1)
        st.write(f"**t-Test (H₂):** t = {tstat:.3f}, p = {pval:.3f}")

        if pval < 0.05:
            st.success("✅ Statistically significant difference detected (p < 0.05).")
        else:
            st.warning("⚠️ No significant difference detected (p ≥ 0.05).")
    else:
        st.warning("⚠️ Insufficient data in both groups (HEIs and Professionals) for t-Test.")
else:
    # Report only the columns that are actually absent, not the full
    # required list.
    st.error(f"Hypothesis testing skipped. Required columns missing: {missing_cols}")

----

# 5. PREDICTIVE MODEL (Demo)

----

In [None]:
st.subheader("🤖 Step 5: Predictive Modeling (Demo)")

# Define target and features
target_col = 'Likelihood_to_Adopt'

if target_col in df_scaled.columns and len(df_scaled) > 2:
    # Binary target: 1 when the value is strictly above the column median,
    # otherwise 0 (High vs Low adoption likelihood).
    y = (df_scaled[target_col] > df_scaled[target_col].median()).astype(int)

    # Features: every other column of the scaled frame.
    X = df_scaled.drop(columns=[target_col])

    # Drop constant columns: keep a column only if at least one value
    # differs from the value in its first row.
    X = X.loc[:, (X != X.iloc[0]).any()]

    if y.nunique() < 2:
        # A constant target leaves only one class after the median split,
        # and logistic regression cannot be fitted on a single class.
        st.warning("⚠️ Target has a single class after the median split; cannot train the predictive model.")
    elif X.shape[1] > 0 and len(X) >= 2:
        # Hold out 30% of the rows; fixed seed for a repeatable split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Train Logistic Regression Model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of class 1 ("high").
        proba = model.predict_proba(X_test)[:, 1]

        # Display results, indexed by the original row label of each test sample.
        st.write("**Predicted High Adoption Likelihood Probabilities (Test Set):**")
        prediction_df = pd.DataFrame({
            "Test_Sample_Index": X_test.index,
            "Predicted_Prob": np.round(proba, 3)
        }).set_index("Test_Sample_Index")
        st.dataframe(prediction_df.head(10))

        # Bar chart visualization. Row labels are plotted as strings so the
        # 20 bars are evenly spaced; plotting the raw integer labels would
        # position each bar at its row number and spread thin bars across
        # the whole index range.
        first_20 = prediction_df.head(20)
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.bar(first_20.index.astype(str), first_20['Predicted_Prob'], color='#8C1D40')
        ax.set_title("Predicted Adoption Probability for First 20 Test Samples")
        ax.set_ylabel("Probability")
        ax.set_xlabel("Test Sample Index")
        ax.tick_params(axis='x', rotation=45)
        st.pyplot(fig)
        # Close the figure so reruns of the script do not accumulate figures.
        plt.close(fig)
        st.success("✅ Predictive model executed successfully!")
    else:
        st.warning("⚠️ Not enough unique features or data points to train the predictive model.")
else:
    st.warning(f"⚠️ Target column '{target_col}' is missing or dataset is too small to train the model.")

-----

# 6. PIPELINE SUMMARY

---

In [None]:
st.subheader("🚀 End-to-End Pipeline Completed")
# The two trailing spaces before "\n" on the first two bullets are Markdown
# hard line breaks; they are spelled out explicitly so they survive editors
# that strip trailing whitespace.
st.markdown(
    "\n"
    "This demo shows an **automated, modular pipeline** covering:\n"
    "- Data Generation/Loading → Cleaning → Descriptive → Hypothesis → Predictive  \n"
    "- Designed for continuous integration of new data (HEIs + Professionals).  \n"
    "- Fully scalable for Power BI / Airflow integration in production.\n"
)

# Celebratory balloon animation marking the end of the run.
st.balloons()

------

## Running from the terminal

pip install streamlit pandas numpy matplotlib seaborn scikit-learn scipy

cd Downloads
streamlit run analytics_app.py

----