In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [5]:
def analyze_survey_data(csv_file_path):
    """
    Analyze survey data using Simple Random Sampling and Clustered Random Sampling
    
    Parameters:
    csv_file_path (str): Path to the CSV file containing survey data
    """
    # Load the data from CSV file
    try:
        data = pd.read_csv(csv_file_path)
        print(f"Successfully loaded data with {len(data)} records")
        print("First few rows:")
        print(data.head())
    except Exception as e:
        print(f"Error loading CSV file: {e}")
        return
    
    # Check if the data contains a column for survey responses and cluster IDs
    if 'response' not in data.columns:
        print("Warning: 'response' column not found. Using first numeric column as response data.")
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            print("Error: No numeric columns found in the dataset.")
            return
        response_col = numeric_cols[0]
        data['response'] = data[response_col]
        print(f"Using column '{response_col}' as response data.")
    
    if 'cluster_id' not in data.columns:
        print("Warning: 'cluster_id' column not found. Using first categorical column as cluster IDs.")
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) == 0:
            print("Note: No categorical columns found. Creating a default cluster column with 8 clusters.")
            data['cluster_id'] = np.random.randint(1, 9, size=len(data))
        else:
            cluster_col = categorical_cols[0]
            data['cluster_id'] = data[cluster_col]
            print(f"Using column '{cluster_col}' as cluster identifiers.")

In [6]:
    # 1. Simple Random Sampling (SRS)
    print("\n==== Simple Random Sampling Analysis ====")
    
    # Calculate mean for SRS
    srs_mean = data['response'].mean()
    print(f"1) Mean (SRS): {srs_mean:.2f}")
    
    # Calculate standard error for SRS
    n = len(data)
    srs_std = data['response'].std(ddof=1)  # Sample standard deviation
    srs_se = srs_std / np.sqrt(n)
    print(f"2) Standard Error (SRS): {srs_se:.4f}")
    
    # Calculate 95% confidence interval
    t_value = 2.04  # As specified in the assignment
    margin_of_error = t_value * srs_se
    ci_upper = srs_mean + margin_of_error
    ci_lower = srs_mean - margin_of_error
    print(f"3) 95% Confidence Interval (SRS):")
    print(f"   Upper limit: {ci_upper:.4f}")
    print(f"   Lower limit: {ci_lower:.4f}")


==== Simple Random Sampling Analysis ====


NameError: name 'data' is not defined

In [None]:
    # Calculate additional statistics
    d_squared = d_value ** 2
    print(f"4) d-squared: {d_squared:.4f}")
    
    n_avg = data.groupby('cluster_id').size().mean()
    roh = (d_squared - 1) / (n_avg - 1) if n_avg > 1 else 0
    print(f"5) Intraclass correlation (roh): {roh:.4f}")
    
    Neff = n / d_squared
    print(f"6) Effective sample size (Neff): {Neff:.4f}")
    
    # Create visualization
    plt.figure(figsize=(10, 6))
    plt.bar(['SRS Mean', 'Clustered Mean'], [srs_mean, crs_mean])
    plt.errorbar(['SRS Mean', 'Clustered Mean'], [srs_mean, crs_mean], 
                 yerr=[srs_se, crs_se], fmt='o', color='black')
    plt.title('Comparison of SRS and Clustered Random Sampling')

SyntaxError: unterminated string literal (detected at line 17) (3356041533.py, line 17)

In [7]:
# Example usage with results display
results = analyze_survey_data('Question1_Final_CP.xlsx')
if results:
    print("\n==== Summary of Results (Rounded as Required) ====")
    for key, value in results.items():
        print(f"{key}: {value}")

Error loading CSV file: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte
