## **Problem 1**

In [1]:
from scipy.stats import chi2_contingency
import numpy as np

# Observed counts, one list per relief level (a row of the contingency table).
# The three columns presumably correspond to the three cough remedies — confirm against the problem statement.
no_relief = [11, 13, 9]
some_relief = [32, 28, 27]
total_relief = [7, 9, 14]

# Stack the rows into the 3x3 table of observed frequencies
data = np.array([no_relief, some_relief, total_relief])

#### Test the hypothesis, at a 5% level of significance, that the three cough remedies are equally effective.

In [6]:
# Chi-square test of independence on the observed contingency table `data`
chi2_stat, p_value, df, expected = chi2_contingency(data)

# Report the statistic, its degrees of freedom, the p-value and the expected counts
for line in (
    f"Chi-square statistic: {chi2_stat:.2f}",
    f"Degrees of freedom: {df}",
    f"p-value: {p_value:.2f}",
    f"Expected frequencies under the null hypothesis: \n {expected}",
):
    print(line)

# Decision at the 5% significance level: reject only when the p-value is below 0.05
verdict = (
    "Rejects the null hypothesis, suggesting a significant difference in effectiveness among the remedies."
    if p_value < 0.05
    else "Fails to reject the null hypothesis, suggesting no significant difference in effectiveness among the remedies."
)
print(verdict)

Chi-square statistic: 3.81
Degrees of freedom: 4
p-value: 0.43
Expected frequencies under the null hypothesis: 
 [[11. 11. 11.]
 [29. 29. 29.]
 [10. 10. 10.]]
Fails to reject the null hypothesis, suggesting no significant difference in effectiveness among the remedies.


## **Problem 2**

In [29]:
from scipy.stats import pearsonr, norm

# Paired sample measurements (10 observations per variable).
# The problem statement refers to tensile strength and hardness; which array holds which is not shown here.
x_values = [106.2, 106.3, 105.3, 106.1, 105.4, 106.3, 104.7, 105.4, 105.5, 105.1]
y_values = [35.0, 37.2, 39.8, 35.8, 41.3, 40.7, 38.7, 40.2, 38.1, 41.6]

# Convert to float arrays for the correlation analysis
x = np.asarray(x_values, dtype=float)
y = np.asarray(y_values, dtype=float)

#### (a) Find the 95% confidence interval for 𝜌, the correlation coefficient between tensile strength and hardness.

In [31]:
# Sample Pearson correlation coefficient (r) between x and y
r = pearsonr(x, y)[0]

# Fisher Z-transformation: Z = artanh(r) = 0.5 * ln((1 + r) / (1 - r)).
# np.arctanh is the library form of that logarithm expression.
Z = np.arctanh(r)

# Approximate standard error of Z for n paired observations: 1 / sqrt(n - 3)
SE_Z = 1 / np.sqrt(len(x) - 3)

# 95% confidence interval on the Z scale (two-sided, hence the 97.5th normal percentile)
Z_alpha_2 = norm.ppf(0.975)
CI_Z_lower = Z - Z_alpha_2 * SE_Z
CI_Z_upper = Z + Z_alpha_2 * SE_Z

# Inverse Fisher transformation: rho = tanh(Z) = (e^{2Z} - 1) / (e^{2Z} + 1).
# np.tanh stays finite for large |Z|, whereas the explicit ratio overflows to inf/inf = nan.
CI_rho_lower = np.tanh(CI_Z_lower)
CI_rho_upper = np.tanh(CI_Z_upper)

# Print the 95% CI for ρ
print(f"95% confidence interval for ρ: ({CI_rho_lower:.4f}, {CI_rho_upper:.4f})")

95% confidence interval for ρ: (-0.8632, 0.1739)


#### (b) Can you conclude that 𝜌 < 0.3?

In [32]:
# ρ < 0.3 is supported only when the whole interval lies below 0.3,
# i.e. when the upper confidence limit is itself below 0.3
rho_below_threshold = CI_rho_upper < 0.3
print(
    "Yes, can conclude ρ < 0.3 based on the confidence interval."
    if rho_below_threshold
    else "No, cannot conclude ρ < 0.3 based on the confidence interval."
)


Yes, can conclude ρ < 0.3 based on the confidence interval.


#### (c) Can you conclude that 𝜌 ≠ 0?

In [33]:
# ρ ≠ 0 is supported only when 0 falls outside the interval:
# either the whole interval is positive or the whole interval is negative
zero_excluded = CI_rho_lower > 0 or CI_rho_upper < 0
print(
    "Yes, can conclude ρ ≠ 0 based on the confidence interval."
    if zero_excluded
    else "No, cannot definitively conclude ρ ≠ 0 based on the confidence interval."
)


No, cannot definitively conclude ρ ≠ 0 based on the confidence interval.


## **Problem 3**

In [34]:
# Summary statistics supplied for the regression problem
r = 0.86                     # correlation coefficient given for the two variables
mean_x, std_x = 126, 35      # file size: average and standard deviation (Kbytes)
mean_y, std_y = 0.04, 0.01   # transmittance time: average and standard deviation (seconds)

#### (a) Compute the least squares regression line which describes how the time it takes to transmit a file depends on the file size.

In [35]:
# Slope of the least-squares line: b = r * (s_y / s_x)
sd_ratio = std_y / std_x
b = r * sd_ratio

# Intercept chosen so that the line passes through (mean_x, mean_y)
a = mean_y - b * mean_x

# Report the fitted equation
equation = f"Regression line: Y = {a} + {b}X"
print(equation)

Regression line: Y = 0.00904 + 0.00024571428571428574X


#### (b) Use 𝑟^2 to evaluate the goodness of fit. Is this a good model?

In [36]:
# Coefficient of determination: the square of the given correlation coefficient r
r_squared = pow(r, 2)

# Report r^2 to four decimal places
summary = f"Coefficient of determination (r^2): {r_squared:.4f}"
print(summary)


Coefficient of determination (r^2): 0.7396


#### (c) Predict the time it will take to transmit a 400-Kbyte file.

In [37]:
# File size (Kbytes) to predict the transmittance time for
x_predict = 400

# Evaluate the fitted line Y = a + bX at the requested file size.
# Caveat: 400 Kbytes is far above the average file size (126, SD 35), so this is an extrapolation.
y_predict = a + b * x_predict

# Report the prediction. The name must be `y_predict` exactly: Python identifiers are
# case-sensitive, and referencing `Y_predict` raises NameError on a fresh kernel.
print(f"Predicted transmittance time for a {x_predict} Kbytes file: {y_predict:.4f} seconds")


Predicted transmittance time for a 400 Kbytes file: 0.1073 seconds


## **Problem 4**

In [38]:
# Experimental runs: 16 observations of temperature, stirring rate and yield
# Each distinct temperature appears in two consecutive runs
temperature = np.repeat([110, 111, 112, 114, 117, 122, 130, 143], 2)
# Stirring rate rises in steps of 2 from 30 to 60 (inclusive)
stirring_rate = np.arange(30, 62, 2)
# Measured yield (percent) for each run
yield_percent = np.array([
    70.27, 72.29, 72.57, 74.69,
    76.09, 73.14, 75.61, 69.56,
    74.41, 73.49, 79.18, 75.44,
    81.71, 83.03, 76.98, 80.99,
])


#### (a) Compute the correlation coefficient between temperature and yield, between stirring rate and yield, and between temperature and stirring rate.

In [42]:
# Pairwise Pearson correlations; pearsonr returns (coefficient, p-value) and only the coefficient is kept
corr_temp_yield, _p_temp_yield = pearsonr(temperature, yield_percent)
corr_stir_yield, _p_stir_yield = pearsonr(stirring_rate, yield_percent)
corr_temp_stir, _p_temp_stir = pearsonr(temperature, stirring_rate)

# Report each coefficient to four decimal places
for report_line in (
    f"Correlation between temperature and yield: {corr_temp_yield:.4f}",
    f"Correlation between stirring rate and yield: {corr_stir_yield:.4f}",
    f"Correlation between temperature and stirring rate: {corr_temp_stir:.4f}",
):
    print(report_line)

Correlation between temperature and yield: 0.7323
Correlation between stirring rate and yield: 0.7513
Correlation between temperature and stirring rate: 0.9064


#### (b) Do these data provide good evidence that increasing the temperature causes the yield to increase, within the range of the data? Or might the result be due to confounding? Explain.

In [43]:
# Interpret the effect of temperature on yield: a coefficient above 0.7 is treated as a strong correlation
if corr_temp_yield > 0.7:
    print("There's a strong positive correlation between temperature and yield, suggesting that as temperature increases, yield also tends to increase. This could indicate that increasing the temperature causes the yield to increase within the data range.")
else:
    print("The correlation between temperature and yield is not strong enough to conclusively determine a direct causal relationship.")

# Evaluate potential confounding with stirring rate.
# The coefficient is interpolated from corr_temp_stir rather than typed in by hand,
# so the message cannot drift out of sync with the computed value.
if corr_temp_stir > 0.7:
    print(f"However, the high correlation between temperature and stirring rate ({corr_temp_stir:.4f}) indicates potential confounding. This means it's difficult to determine whether the increase in yield is due solely to temperature without considering the influence of stirring rate.")


There's a strong positive correlation between temperature and yield, suggesting that as temperature increases, yield also tends to increase. This could indicate that increasing the temperature causes the yield to increase within the data range.
However, the high correlation between temperature and stirring rate (0.9064) indicates potential confounding. This means it's difficult to determine whether the increase in yield is due solely to temperature without considering the influence of stirring rate.


#### (c) Do these data provide good evidence that increasing the stirring rate causes the yield to increase, within the range of the data? Or might the result be due to confounding? Explain.

In [44]:
# Interpret the effect of stirring rate on yield: a coefficient above 0.7 is treated as a strong correlation
stir_is_strong = corr_stir_yield > 0.7
print(
    "There's a strong positive correlation between stirring rate and yield, suggesting that as stirring rate increases, yield also tends to increase. This could indicate that increasing the stirring rate causes the yield to increase within the data range."
    if stir_is_strong
    else "The correlation between stirring rate and yield is not strong enough to conclusively determine a direct causal relationship."
)

# Flag possible confounding when temperature and stirring rate are themselves strongly correlated
temp_confounds = corr_temp_stir > 0.7
if temp_confounds:
    print("However, considering the high correlation between temperature and stirring rate, it's challenging to isolate the effect of stirring rate on yield without accounting for the potential confounding effect of temperature. This complexity suggests that both factors might be influencing yield, making it difficult to attribute changes in yield solely to changes in stirring rate.")


There's a strong positive correlation between stirring rate and yield, suggesting that as stirring rate increases, yield also tends to increase. This could indicate that increasing the stirring rate causes the yield to increase within the data range.
However, considering the high correlation between temperature and stirring rate, it's challenging to isolate the effect of stirring rate on yield without accounting for the potential confounding effect of temperature. This complexity suggests that both factors might be influencing yield, making it difficult to attribute changes in yield solely to changes in stirring rate.
