# Answers

## A1 – Load Tips Dataset

_Import seaborn's `tips` dataset, calculate tip percentage (`tip_pct`), and display the first rows._

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Use seaborn's whitegrid theme for any figures drawn later in the notebook.
sns.set_theme(style='whitegrid')

# Load the dataset and derive tip_pct (tip as a fraction of the bill) in one chain,
# so re-running this cell always rebuilds `tips` from the raw data.
tips = sns.load_dataset('tips').assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)
tips.head()
# Observation: tip_pct is stored as a fraction, so 0.15 corresponds to a 15% tip.
# NOTE(review): an earlier note said the previewed rows cluster around 15–20%; confirm against the displayed head.


## A2 – Summary Statistics

_Generate descriptive statistics (count, mean, std, quartiles) for `total_bill`, `tip`, and `tip_pct`._

In [None]:
# Columns to summarise: bill amount, tip amount, and tip as a fraction of the bill.
summary_cols = ['total_bill', 'tip', 'tip_pct']
# describe() reports count, mean, std, min, quartiles, and max; round to 3 decimals for display.
tips.loc[:, summary_cols].describe().round(3)
# Observation: Tip percentage has a mean near 0.16 with modest dispersion (check the std row).


## A3 – Bootstrap CI

_Estimate a 95% bootstrap confidence interval for the mean tip percentage using 5,000 resamples._

In [None]:
# Seeded generator so the bootstrap interval is identical on every run.
rng = np.random.default_rng(42)
tip_pct_values = tips['tip_pct'].to_numpy()
n_obs = len(tips)
# 5,000 resamples: each draws n_obs values with replacement and records the resample mean.
boot_means = [
    rng.choice(tip_pct_values, size=n_obs, replace=True).mean()
    for _ in range(5000)
]
# Percentile method: the 2.5th and 97.5th percentiles of the resample means bound the 95% interval.
ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])
(ci_lower, ci_upper)
# Observation: The bootstrap interval is expected to fall within roughly the 15–18% tipping range.


## A4 – Dinner vs Lunch T-Test

_Use Welch's t-test to compare mean tip percentage between dinner and lunch meals._

In [None]:
# Boolean masks for the two meal times.
is_dinner = tips['time'].eq('Dinner')
is_lunch = tips['time'].eq('Lunch')
dinner_tip = tips.loc[is_dinner, 'tip_pct']
lunch_tip = tips.loc[is_lunch, 'tip_pct']
# equal_var=False selects Welch's t-test, which does not assume equal group variances.
welch_result = stats.ttest_ind(dinner_tip, lunch_tip, equal_var=False)
t_stat, p_value = welch_result.statistic, welch_result.pvalue
print(f't-statistic: {t_stat:.3f}, p-value: {p_value:.4f}')
# Observation: A positive t-statistic means dinner has the higher mean tip percentage (dinner is the first sample);
# treat the difference as significant at the 5% level only if the printed p-value is below 0.05.
# NOTE(review): an earlier note asserted p < 0.05 in favour of dinner — confirm against the printed output.


## A5 – Smoker Mann–Whitney

_Compare tip percentages for smokers vs non-smokers using the Mann–Whitney U test._

In [None]:
# Split tip percentages by smoker status.
smoker_mask = tips['smoker'].eq('Yes')
non_smoker_mask = tips['smoker'].eq('No')
smoker_tip = tips.loc[smoker_mask, 'tip_pct']
non_smoker_tip = tips.loc[non_smoker_mask, 'tip_pct']
# Rank-based two-sided comparison; makes no normality assumption about tip_pct.
mwu_result = stats.mannwhitneyu(smoker_tip, non_smoker_tip, alternative='two-sided')
stat, p_value = mwu_result.statistic, mwu_result.pvalue
print(f'U-statistic: {stat:.3f}, p-value: {p_value:.4f}')
# Observation: The non-parametric test is expected to show no dramatic difference between smoker groups
# (read the conclusion from the printed p-value).


## A6 – Smoker vs Time Chi-Square

_Construct a contingency table for `smoker` versus `time` and run a chi-square independence test._

In [None]:
# Cross-tabulate counts: rows are smoker status, columns are meal time.
contingency = pd.crosstab(index=tips['smoker'], columns=tips['time'])
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected counts);
# for a 2x2 table SciPy applies Yates' continuity correction by default.
chi2_result = stats.chi2_contingency(contingency)
chi2, p_value, dof, expected = chi2_result
print(f'Chi-square: {chi2:.2f}, p-value: {p_value:.4f}, dof: {dof}')
# Observation: Smoker status and meal time can be called dependent only if the printed p-value is below 0.05.
# NOTE(review): an earlier note described the dependence as "modest but noticeable" — confirm against the output.


## A7 – Pearson Correlation

_Compute the Pearson correlation and p-value between `total_bill` and `tip`._

In [None]:
# Linear association between the bill amount and the tip amount.
bill_amounts = tips['total_bill']
tip_amounts = tips['tip']
pearson_result = stats.pearsonr(bill_amounts, tip_amounts)
# The result is indexable as (correlation coefficient, two-sided p-value).
corr_coef, p_value = pearson_result[0], pearson_result[1]
print(f'Correlation: {corr_coef:.3f}, p-value: {p_value:.4e}')
# Observation: Tip size correlates positively and strongly with bill size, and the relationship is highly significant.


## A8 – OLS Regression

_Fit an OLS regression predicting `tip` from `total_bill` and `size`, then print the model summary (abridged)._

In [None]:
# Model the tip amount as a linear function of the bill total and the party size.
tip_formula = 'tip ~ total_bill + size'
ols_model = smf.ols(formula=tip_formula, data=tips).fit()
# tables[0] is the first table of the summary (model and fit statistics), giving the abridged view.
ols_summary = ols_model.summary()
ols_summary.tables[0]
# Observation: The model shows a respectable fit; coefficient-level significance is read from the
# coefficient table (tables[1]) rather than from this header table.


## A9 – Regression Coefficients

_Display the estimated coefficients with 95% confidence intervals from the OLS model._

In [None]:
# Build a tidy coefficient table: 95% interval bounds first, then the point estimate.
coef_table = (
    ols_model.conf_int(alpha=0.05)                 # columns 0 and 1 hold the lower/upper bounds
    .rename(columns={0: 'lower', 1: 'upper'})
    .assign(coef=ols_model.params)                 # aligned on the parameter-name index
)
coef_table
# Observation: Intervals that exclude zero reinforce the positive slopes.


## A10 – Residual Normality

_Apply the Shapiro–Wilk test to the OLS residuals to assess normality._

In [None]:
# Residuals (observed minus fitted tip) from the OLS model fitted earlier.
residuals = ols_model.resid
# Shapiro-Wilk: the null hypothesis is that the residuals are normally distributed.
shapiro_result = stats.shapiro(residuals)
shapiro_stat, shapiro_p = shapiro_result[0], shapiro_result[1]
print(f'Shapiro-Wilk W: {shapiro_stat:.3f}, p-value: {shapiro_p:.4f}')
# Observation: A p-value below 0.05 rejects normality of the residuals.
# NOTE(review): an earlier note said deviations rarely reach strong significance — confirm against the printed p-value.


## A11 – Cohen's d by Sex

_Compute Cohen's d effect size for tip percentage differences between male and female patrons._

In [None]:
# Tip percentages for each sex.
male = tips.loc[tips['sex'].eq('Male'), 'tip_pct']
female = tips.loc[tips['sex'].eq('Female'), 'tip_pct']
n_male, n_female = len(male), len(female)
# Pooled standard deviation: sample variances (ddof=1) weighted by their degrees of freedom.
weighted_var_sum = (n_male - 1) * male.var() + (n_female - 1) * female.var()
pooled_std = np.sqrt(weighted_var_sum / (n_male + n_female - 2))
# Cohen's d = difference in means (male minus female) in units of the pooled standard deviation.
mean_gap = male.mean() - female.mean()
cohens_d = mean_gap / pooled_std
print(f"Cohen's d: {cohens_d:.3f}")
# Observation: The gender effect size on tipping is small in magnitude; the sign shows which group tips more.


## A12 – Load Penguins Dataset

_Load seaborn's `penguins` dataset, drop rows missing key numeric fields, and preview the head._

In [None]:
# Fields that must be present for the analyses below; rows missing any of them are dropped.
required_cols = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'species',
    'sex',
]
# Load and clean in one chain so re-running the cell always starts from the raw data.
penguins = sns.load_dataset('penguins').dropna(subset=required_cols)
penguins.head()
# Observation: Cleaning is expected to remove only a small number of rows while preserving all three species.


## A13 – Numeric Summary

_Generate descriptive statistics for the numeric penguin measurements._

In [None]:
# The four numeric body measurements.
measurement_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# Count, mean, std, min, quartiles, and max for each measurement, rounded to 2 decimals.
penguins.loc[:, measurement_cols].describe().round(2)
# Observation: Flipper lengths center around 201 mm with a standard deviation of roughly 15 mm
# (see the displayed table for exact values).


## A14 – Body Mass Confidence Intervals

_Compute mean body mass with 95% confidence intervals for each species using standard error estimates._

In [None]:
# Per-species mean, sample standard deviation, and sample size of body mass,
# plus the standard error of the mean (std / sqrt(n)).
body_mass_summary = (
    penguins.groupby('species')['body_mass_g']
    .agg(['mean', 'std', 'count'])
    .assign(sem=lambda df_: df_['std'] / np.sqrt(df_['count']))
)
# Two-sided 95% critical value from Student's t with n - 1 degrees of freedom for each
# species. Unlike a fixed normal quantile of 1.96, this accounts for the differing
# group sizes and widens the interval for the smaller groups.
t_crit = stats.t.ppf(0.975, df=body_mass_summary['count'] - 1)
body_mass_summary = body_mass_summary.assign(
    ci_lower=lambda df_: df_['mean'] - t_crit * df_['sem'],
    ci_upper=lambda df_: df_['mean'] + t_crit * df_['sem']
).round(1)
body_mass_summary[['mean', 'ci_lower', 'ci_upper']]
# Observation: Gentoo mass averages above 5 kg with tight confidence bounds.


## A15 – Flipper Length ANOVA

_Run a one-way ANOVA to test whether mean flipper length differs across species._

In [None]:
# One array of flipper lengths per species.
groups = [
    flipper.to_numpy()
    for _, flipper in penguins.groupby('species')['flipper_length_mm']
]
# One-way ANOVA: the null hypothesis is that all species share the same mean flipper length.
anova_result = stats.f_oneway(*groups)
stat, p_value = anova_result.statistic, anova_result.pvalue
print(f'F-statistic: {stat:.2f}, p-value: {p_value:.4e}')
# Observation: An extremely small p-value confirms species-level differences in flipper length.


## A16 – Tukey HSD

_Perform Tukey's Honest Significant Difference test for flipper length across species._

In [None]:
# Pairwise comparison of mean flipper length between species with a 5% family-wise
# error rate (alpha=0.05 is the function's default, written out for clarity).
tukey = pairwise_tukeyhsd(
    endog=penguins['flipper_length_mm'],
    groups=penguins['species'],
    alpha=0.05,
)
print(tukey)
# Observation: Every species pair is expected to show significant flipper-length separation
# (the "reject" column of the printed table).


## A17 – Spearman Correlations

_Compute the Spearman correlation matrix for the penguin numeric features._

In [None]:
# Rank-based (Spearman) correlation between every pair of numeric measurements.
rank_corr_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
spearman_corr = (
    penguins[rank_corr_cols]
    .corr(method='spearman')
    .round(3)
)
spearman_corr
# Observation: Body mass tracks most strongly with flipper length under Spearman's rho.


## A18 – Regression on Body Mass

_Fit a linear model predicting `body_mass_g` from `flipper_length_mm`, `bill_depth_mm`, and species indicators using statsmodels._

In [None]:
# Body mass explained by two numeric measurements plus species;
# C(species) makes the formula treat species as categorical (indicator terms).
mass_formula = 'body_mass_g ~ flipper_length_mm + bill_depth_mm + C(species)'
lm_model = smf.ols(formula=mass_formula, data=penguins).fit()
# tables[1] is the second table of the summary, used here to show the coefficients.
lm_model.summary().tables[1]
# Observation: Flipper length keeps a positive coefficient after controlling for species.
# NOTE(review): an earlier note called it the "largest" coefficient; coefficients are per unit of each
# predictor (mm vs. species indicator), so magnitudes are not directly comparable — confirm against the table.


## A19 – Regression Diagnostics

_Apply the Shapiro–Wilk test to the residuals of the body mass regression model._

In [None]:
# Residuals from the body-mass regression fitted earlier.
lm_residuals = lm_model.resid
# Shapiro-Wilk: the null hypothesis is that the residuals are normally distributed.
lm_shapiro_result = stats.shapiro(lm_residuals)
lm_shapiro, lm_shapiro_p = lm_shapiro_result[0], lm_shapiro_result[1]
print(f'Shapiro-Wilk W: {lm_shapiro:.3f}, p-value: {lm_shapiro_p:.4f}')
# Observation: Residuals are expected to be reasonably symmetric with mild departures from perfect normality;
# a p-value below 0.05 would indicate a statistically detectable departure.


## A20 – Logistic Model Setup

_Prepare a logistic regression dataset by encoding `sex` as 1 for male and 0 for female, creating dummy variables for species._

In [None]:
# Work on a copy so the cleaned `penguins` frame is left untouched.
penguins_logit = penguins.copy()
# Binary target: 1 where sex is 'Male', 0 otherwise.
penguins_logit['sex_male'] = (penguins_logit['sex'] == 'Male').astype(int)
# One-hot encode species, dropping the first level as the reference category.
# dtype=float is required: pandas >= 2.0 emits boolean dummies by default, and a frame
# mixing bool and float columns converts to an object array that statsmodels rejects
# ("Pandas data cast to numpy dtype of object").
X = pd.get_dummies(
    penguins_logit[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'species']],
    drop_first=True,
    dtype=float,
)
# statsmodels does not add an intercept automatically; add the constant column explicitly.
X = sm.add_constant(X)
y = penguins_logit['sex_male']
X.head()
# Observation: The design matrix now includes intercept, numeric predictors, and two species dummies.


## A21 – Fit Logistic Regression

_Fit a statsmodels Logit model predicting male penguins and report pseudo R-squared based on model deviance._

In [None]:
# Fit the logistic regression; disp=False silences the optimizer's convergence printout.
logit_model = sm.Logit(y, X).fit(disp=False)
# Logit results expose log-likelihoods (llf, llnull) rather than the GLM-style
# `deviance` / `null_deviance` attributes. For a binary outcome the deviance is
# -2 * log-likelihood, so compute both deviances from the log-likelihoods.
model_deviance = -2 * logit_model.llf
null_deviance = -2 * logit_model.llnull
# Deviance-based pseudo R^2; equivalent to McFadden's 1 - llf / llnull.
pseudo_r2 = 1 - model_deviance / null_deviance
print(f'Pseudo R^2: {pseudo_r2:.3f}')
# Observation: Bill and flipper measurements alongside species provide strong separation between sexes.


## A22 – Logistic Confusion Matrix

_Generate predicted probabilities, classify using a 0.5 threshold, and display the confusion matrix with overall accuracy._

In [None]:
# Predicted probability of the positive class (male) for the same rows used to fit the model.
pred_probs = logit_model.predict(X)
# Classify as male when the predicted probability reaches the 0.5 cutoff.
threshold = 0.5
pred_labels = (pred_probs >= threshold).astype(int)
# Rows are the true labels, columns the predicted labels.
conf_matrix = pd.crosstab(
    index=y,
    columns=pred_labels,
    rownames=['Actual'],
    colnames=['Predicted'],
)
# Share of rows whose predicted label equals the true label (in-sample accuracy).
is_correct = pred_labels == y
accuracy = is_correct.mean()
print(conf_matrix)
print(f'Accuracy: {accuracy:.3f}')
# Observation: The classifier is expected to achieve high accuracy with few misclassifications at the 0.5 cutoff;
# this is measured on the training data, so it is an optimistic estimate.
