In [None]:
import numpy as np
import pandas as pd

In [None]:
# Import modules for the project
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
import statsmodels.formula.api as smf

In [None]:
# Data Cleaning/Conditioning

# Load the class survey responses from the CSV file
df_survey = pd.read_csv(r'survey.csv')

# Split the responses by the 'Music' flag: 'N' rows (no music) and 'Y' rows (music)
df_no_music = df_survey.loc[df_survey['Music'].eq('N')]
df_music = df_survey.loc[df_survey['Music'].eq('Y')]

# Impute missing values for Opinion and Hungriness
def impute_opinion_demographics(df, demo_cols, rng=None):
    """Fill missing 'Opinion' and 'Hungriness' values by random donor sampling.

    For every row with a missing value in either column, a replacement is
    drawn at random from other participants (different 'Name') in the same
    'Music' condition whose values equal the row's on every column in
    ``demo_cols``. If none of those donors has an observed value, the draw
    falls back to every observed value in the same 'Music' condition. If that
    pool is empty as well, the cell is left missing instead of raising.

    Only cells that are actually missing are written; observed values are
    never replaced. Donor pools are built from the input frame, so a value
    imputed for one row is never reused as a donor for another.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Name', 'Music', 'Opinion', 'Hungriness' and every
        column listed in ``demo_cols``.
    demo_cols : iterable of str
        Columns on which a donor must match the row being imputed.
    rng : object with a ``choice(array)`` method, optional
        Random source, e.g. ``numpy.random.default_rng(seed)``. When None,
        NumPy's global random state is used, so ``np.random.seed`` still
        controls the draws.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing cells filled where a donor value
        exists. ``df`` itself is not modified.
    """
    targets = ('Opinion', 'Hungriness')
    sampler = np.random if rng is None else rng

    imputed = df.copy()
    rows_with_gaps = df.index[df[list(targets)].isna().any(axis=1)]

    for idx in rows_with_gaps:
        # Read from the untouched input so earlier imputations cannot leak
        # into this row's donor pools.
        row = df.loc[idx]

        # Everyone measured under the same Music condition (fallback pool)
        same_condition = df[df['Music'] == row['Music']]

        # Preferred donors: other participants matching on all demographics
        donors = same_condition[same_condition['Name'] != row['Name']]
        for col in demo_cols:
            donors = donors[donors[col] == row[col]]

        for target in targets:
            if not pd.isna(row[target]):
                # Observed value: keep it as is
                continue

            pool = donors[target].dropna()
            if pool.empty:
                pool = same_condition[target].dropna()
            if pool.empty:
                # Nothing to sample from; leave the cell missing
                continue

            imputed.loc[idx, target] = sampler.choice(pool.to_numpy())

    return imputed

In [None]:
# Demographic columns a donor must match on during imputation
cols = ['Age', 'Gender', 'Major', 'Year']

# The imputation draws donor values at random from NumPy's global random
# state; seed it so the imputed data (and every statistic computed from it)
# is the same on each Restart & Run All.
IMPUTATION_SEED = 42
np.random.seed(IMPUTATION_SEED)

df_survey_imp = impute_opinion_demographics(df_survey, cols)
df_survey_imp

In [None]:
# Matched Pairs T-Test

# Significance threshold used for the test decision below
ALPHA = 0.05

# Convert the dataframe to wide format: one row per participant with one
# Opinion column per Music condition
df_wide = (
    df_survey_imp
    .pivot(index='Name', columns='Music', values='Opinion')
    .rename(columns={'N': 'Opinion_no', 'Y': 'Opinion_yes'})
    .reset_index()
)

# Calculate the differences with and without music
df_wide['D'] = df_wide['Opinion_yes'] - df_wide['Opinion_no']

# Display the differences in opinion
print(df_wide[['Name','Opinion_no','Opinion_yes','D']])

# A paired test needs both measurements for a participant. A NaN in either
# column would make t and p NaN, and the comparison below would then fall
# through to the "no significant difference" branch, so incomplete pairs are
# excluded explicitly and reported.
df_pairs = df_wide.dropna(subset=['Opinion_no', 'Opinion_yes'])
n_excluded = len(df_wide) - len(df_pairs)
if n_excluded:
    print(f"Excluded {n_excluded} participant(s) missing an opinion in one condition")

# Run a paired t-test
t_stat, p_value = ttest_rel(df_pairs['Opinion_yes'], df_pairs['Opinion_no'])
print(f"Paired t-test results:\nt = {t_stat:.3f}, p = {p_value:.3f}")

# Determine statistical significance
if np.isnan(p_value):
    # An undefined statistic must not be reported as a test decision
    print("Paired t-test is undefined for this data (too few complete pairs or no variation in the differences)")
elif p_value < ALPHA:
    print("Significant difference in opinions with vs. without music (reject H0)")
else:
    print("No significant difference in opinions (fail to reject H0)")

In [None]:
# Create a mixed effects model
df_mixed = df_survey_imp.copy()

# Convert columns to binary and dummy encode
df_mixed["Music"] = df_mixed["Music"].map({"N":0,"Y":1})
df_mixed["Gender"] = df_mixed["Gender"].map({"M":0,"F":1})
df_mixed["Major"] = df_mixed["Major"].map({"N":0,"Y":1})
# pandas >= 2.0 returns boolean dummy columns by default; request integers so
# the Year indicators are numeric 0/1 like Music, Gender and Major above.
df_mixed = pd.get_dummies(df_mixed, columns=["Year"], drop_first=True, dtype=int)

# Drop only the Age column; Name is kept because it is the grouping variable
# for the random effect below.
df_mixed = df_mixed.drop(columns=['Age'])

# NOTE(review): the formula hard-codes Year_Sophomore and Year_Senior. Any
# other Year level left after drop_first is omitted from the model and is
# absorbed into the baseline -- confirm against the levels present in the data.
model = smf.mixedlm(
    "Opinion ~ Music + Gender + Major + Hungriness + Year_Sophomore + Year_Senior",
    df_mixed,
    groups=df_mixed["Name"]  # random intercept per participant
)
result = model.fit()
print(result.summary())

In [None]:
# Confidence Intervals

In [None]:
# Correction Analysis

In [None]:
# Data Visualizations