# 3. write a python class representing a discreate random variable with methods to calculate its expected value and variance

In [3]:
class DiscreteRandomVariable:  
    def __init__(self, values, probabilities):  
       
        if len(values) != len(probabilities):  
            raise ValueError("Values and probabilities must have the same length.")  
        if not (0.999 < sum(probabilities) < 1.001):  # Allow for a small floating-point error  
            raise ValueError("Probabilities must sum to 1.")  
          
        self.values = values  
        self.probabilities = probabilities  
  
    def expected_value(self):  
        
        return sum(v * p for v, p in zip(self.values, self.probabilities))  
  
    def variance(self):  
        
        mean = self.expected_value()  
        return sum(p * (v - mean) ** 2 for v, p in zip(self.values, self.probabilities))  
  
  
# Example usage  
values = [1, 2, 3, 4, 5]  
probabilities = [0.1, 0.2, 0.3, 0.2, 0.2]  
  
drv = DiscreteRandomVariable(values, probabilities)  
print("Expected Value:", drv.expected_value())  
print("Variance:", drv.variance())  


Expected Value: 3.2
Variance: 1.56


# 4. Implement a program to simulate the rolling of a six-sided die and calculate the expected value and variance of the outcomes

In [17]:
import random  

class DiscreteRandomVariable:  
    def __init__(self, values, probabilities):  
       
        if len(values) != len(probabilities):  
            raise ValueError("Values and probabilities must have the same length.")  
        if not (0.999 < sum(probabilities) < 1.001):  # Allow for a small floating-point error  
            raise ValueError("Probabilities must sum to 1.")  
          
        self.values = values  
        self.probabilities = probabilities  
  
    def expected_value(self):  
       
        return sum(v * p for v, p in zip(self.values, self.probabilities))  
  
    def variance(self):  
       
        mean = self.expected_value()  
        return sum(p * (v - mean) ** 2 for v, p in zip(self.values, self.probabilities))  
  
  
def simulate_die_rolls(num_rolls):  
   
    outcomes = [random.randint(1, 6) for _ in range(num_rolls)]  
    return outcomes  
  
def calculate_empirical_probabilities(outcomes):  
   
    num_rolls = len(outcomes)  
    probabilities = [outcomes.count(i) / num_rolls for i in range(1, 7)]  
    return probabilities  
  
# Simulate rolling a six-sided die 1000 times  
num_rolls = 10 
outcomes = simulate_die_rolls(num_rolls)
 
# Calculate empirical probabilities  
values = [1, 2, 3, 4, 5, 6]  
probabilities = calculate_empirical_probabilities(outcomes)
  
# Create a DiscreteRandomVariable object  
drv = DiscreteRandomVariable(values, probabilities)  
  
# Calculate and print the expected value and variance  
print("Expected Value:", drv.expected_value())  
print("Variance:", drv.variance())  


Expected Value: 4.2
Variance: 3.16


# 5. Create a Python function to generate random samples 6rom a given probabilities distribution (e.g.,binomial, Poisson) and calculate their mean and variance.

In [19]:
import numpy as np  
  
def generate_samples_and_calculate_stats(distribution, params, num_samples):  
    
    if distribution == 'binomial':  
        n, p = params  
        samples = np.random.binomial(n, p, num_samples)  
    elif distribution == 'poisson':  
        lambda_ = params[0]  
        samples = np.random.poisson(lambda_, num_samples)  
    else:  
        raise ValueError("Unsupported distribution. Use 'binomial' or 'poisson'.")  
  
    mean = np.mean(samples)  
    variance = np.var(samples)  
      
    return samples, mean, variance  
  
binomial_samples, binomial_mean, binomial_variance = generate_samples_and_calculate_stats('binomial', (10, 0.5), 1000)  
print("Binomial Distribution:")  
print("Samples:", binomial_samples[:10])  # Print first 10 samples for brevity  
print("Mean:", binomial_mean)  
print("Variance:", binomial_variance)  
  

poisson_samples, poisson_mean, poisson_variance = generate_samples_and_calculate_stats('poisson', (3,), 1000)  
print("\nPoisson Distribution:")  
print("Samples:", poisson_samples[:10])  # Print first 10 samples for brevity  
print("Mean:", poisson_mean)  
print("Variance:", poisson_variance)  


Binomial Distribution:
Samples: [4 3 4 8 6 6 6 5 4 6]
Mean: 5.028
Variance: 2.535216

Poisson Distribution:
Samples: [1 0 2 4 2 2 1 4 3 3]
Mean: 2.984
Variance: 3.155744


# 6. Write a Python script to generate random numbers from a Gaussian (normal) distribution and compute the mean, variance, and standard deviation of the samples.

In [20]:
import numpy as np  
  
def generate_normal_samples(mean, std_dev, num_samples):  
    
    # Generate random samples from a normal distribution  
    samples = np.random.normal(mean, std_dev, num_samples)  
      
    # Calculate statistical measures  
    calculated_mean = np.mean(samples)  
    calculated_variance = np.var(samples)  
    calculated_std_dev = np.std(samples)  
      
    return samples, calculated_mean, calculated_variance, calculated_std_dev  
  
# Example usage:  
# Generate and analyze 1000 samples from a normal distribution with mean=0 and std_dev=1  
mean = 0  
std_dev = 1  
num_samples = 1000  
  
samples, calculated_mean, calculated_variance, calculated_std_dev = generate_normal_samples(mean, std_dev, num_samples)  
  
print("Normal Distribution:")  
print("Samples:", samples[:10])  # Print first 10 samples for brevity  
print("Calculated Mean:", calculated_mean)  
print("Calculated Variance:", calculated_variance)  
print("Calculated Standard Deviation:", calculated_std_dev)  


Normal Distribution:
Samples: [-0.69191998 -0.42690079 -0.82700805  1.59205447  2.16374766 -0.15289059
  0.7440166  -1.01817353 -0.18605312  0.60167988]
Calculated Mean: 0.0205451282624916
Calculated Variance: 1.0698043991992354
Calculated Standard Deviation: 1.0343134917418584


# 8. Write a Python function to calculate the probability density function (PDF) of a continuous random variable for a given normal distribution.

In [21]:
import math  
  
def normal_pdf(x, mu, sigma):  
    
    coefficient = 1 / (sigma * math.sqrt(2 * math.pi))  
    exponent = -((x - mu) ** 2) / (2 * sigma ** 2)  
    return coefficient * math.exp(exponent)  
  
# Example usage:  
x = 1.0  
mu = 0.0  
sigma = 1.0  
pdf_value = normal_pdf(x, mu, sigma)  
print(f"The PDF of the normal distribution at x={x} is {pdf_value}")  


The PDF of the normal distribution at x=1.0 is 0.24197072451914337


# 9. Create a program to calculate the cumulative distribution function (CDF) of exponential distribution.

In [22]:
import math  
  
def exponential_cdf(x, lambd):  
   
    if x < 0:  
        return 0.0  
    return 1 - math.exp(-lambd * x)  
  
# Example usage:  
x = 2.0  
lambd = 1.5  
cdf_value = exponential_cdf(x, lambd)  
print(f"The CDF of the exponential distribution at x={x} with lambda={lambd} is {cdf_value}")  


The CDF of the exponential distribution at x=2.0 with lambda=1.5 is 0.950212931632136


# 10. Write a Python function to calculate the probability mass function (PMF) of Poisson distribution.

In [23]:
import math  
  
def poisson_pmf(k, lambd):    
    if k < 0:  
        return 0.0  
    return (lambd ** k) * math.exp(-lambd) / math.factorial(k)  
  
# Example usage:  
k = 3  
lambd = 2.5  
pmf_value = poisson_pmf(k, lambd)  
print(f"The PMF of the Poisson distribution at k={k} with lambda={lambd} is {pmf_value}")  


The PMF of the Poisson distribution at k=3 with lambda=2.5 is 0.21376301724973645


# 11. A company wants to test in a new website layout leads to a higher conversion rate (percentage oN visitors who make a purchase). They collect data Nrom the old and new layouts to compare

In [24]:
import numpy as np  
from statsmodels.stats.proportion import proportions_ztest  
  
# Generate data  
old_layout = np.array([1] * 50 + [0] * 950)  
new_layout = np.array([1] * 70 + [0] * 930)  
  
# Calculate the number of successes (purchases) and the number of trials (visitors) for both layouts  
successes = np.array([old_layout.sum(), new_layout.sum()])  
nobs = np.array([len(old_layout), len(new_layout)])  
  
# Perform the z-test  
stat, p_value = proportions_ztest(successes, nobs, alternative='larger')  
  
print(f"Z-statistic: {stat}")  
print(f"P-value: {p_value}")  
  
# Interpretation  
alpha = 0.05  # significance level  
if p_value < alpha:  
    print("Reject the null hypothesis: The new layout leads to a higher conversion rate.")  
else:  
    print("Fail to reject the null hypothesis: There is no significant difference in conversion rates.")  


Z-statistic: -1.883108942886774
P-value: 0.9701571972337869
Fail to reject the null hypothesis: There is no significant difference in conversion rates.


# 12. A tutoring service claims that its program improves students' exam scores. A sample of students who participated in the program was taken, and their scores before and after the program were recorded

In [25]:
import numpy as np  
from scipy.stats import norm  
  
# Given data  
before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])  
after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])  
  
# Calculate the differences  
differences = after_program - before_program  
  
# Calculate mean and standard deviation of the differences  
mean_diff = np.mean(differences)  
std_diff = np.std(differences, ddof=1)  # Use ddof=1 for sample standard deviation  
n = len(differences)  
  
# Calculate the z-score  
z_score = mean_diff / (std_diff / np.sqrt(n))  
  
# Calculate the p-value  
p_value = 2 * (1 - norm.cdf(abs(z_score)))  # Two-tailed test  
  
# Output the results  
print("Mean of differences:", mean_diff)  
print("Standard deviation of differences:", std_diff)  
print("Z-score:", z_score)  
print("P-value:", p_value)  
  
# Decision based on p-value  
alpha = 0.05  
if p_value < alpha:  
    print("Reject the null hypothesis: The tutoring program has a significant effect on exam scores.")  
else:  
    print("Fail to reject the null hypothesis: There is no significant effect of the tutoring program on exam scores.")  


Mean of differences: 3.8
Standard deviation of differences: 2.616188916046478
Z-score: 4.593190894944668
P-value: 4.365194105293568e-06
Reject the null hypothesis: The tutoring program has a significant effect on exam scores.


# 13. A pharmaceutical company wants to determine if a new drug is effective in reducing blood pressure. They conduct a study and record blood pressure measurements before and after administering the drug.

In [26]:
import numpy as np  
from scipy.stats import norm  
  
# Given data  
before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])  
after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])  
  
# Calculate the differences  
differences = after_drug - before_drug  
  
# Calculate mean and standard deviation of the differences  
mean_diff = np.mean(differences)  
std_diff = np.std(differences, ddof=1)  # Use ddof=1 for sample standard deviation  
n = len(differences)  
  
# Calculate the z-score  
z_score = mean_diff / (std_diff / np.sqrt(n))  
  
# Calculate the p-value  
p_value = 2 * (1 - norm.cdf(abs(z_score)))  # Two-tailed test  
  
# Output the results  
print("Mean of differences:", mean_diff)  
print("Standard deviation of differences:", std_diff)  
print("Z-score:", z_score)  
print("P-value:", p_value)  
  
# Decision based on p-value  
alpha = 0.05  
if p_value < alpha:  
    print("Reject the null hypothesis: The drug has a significant effect on reducing blood pressure.")  
else:  
    print("Fail to reject the null hypothesis: There is no significant effect of the drug on blood pressure.")  


Mean of differences: -10.1
Standard deviation of differences: 3.178049716414141
Z-score: -10.049875621120888
P-value: 0.0
Reject the null hypothesis: The drug has a significant effect on reducing blood pressure.


# 14. A customer service department claims that their average response time is less than 5 minutes. A sample of recent customer interactions was taken, and the response times were recorded.

In [27]:
import numpy as np  
from scipy.stats import norm  
  
# Given data  
response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])  
  
# Population mean  
mu = 5  
  
# Calculate sample mean and standard deviation  
sample_mean = np.mean(response_times)  
sample_std = np.std(response_times, ddof=1)  # Use ddof=1 for sample standard deviation  
n = len(response_times)  
  
# Calculate the z-score  
z_score = (sample_mean - mu) / (sample_std / np.sqrt(n))  
  
# Calculate the p-value  
p_value = norm.cdf(z_score)  # One-tailed test  
  
# Output the results  
print("Sample mean:", sample_mean)  
print("Sample standard deviation:", sample_std)  
print("Z-score:", z_score)  
print("P-value:", p_value)  
  
# Decision based on p-value  
alpha = 0.05  
if p_value < alpha:  
    print("Reject the null hypothesis: The customer service department's average response time is less than 5 minutes.")  
else:  
    print("Fail to reject the null hypothesis: There is no significant evidence that the average response time is less than 5 minutes.")  


Sample mean: 4.57
Sample standard deviation: 0.4270050741306634
Z-score: -3.184457226042963
P-value: 0.0007251287113068958
Reject the null hypothesis: The customer service department's average response time is less than 5 minutes.


# 15. A company is testing two different website layouts to see which one leads to higher click-through rates. Write a Python function to perform an A/B test analysis, including calculating the t-statistic, degrees of freedom, and p-value.

In [28]:
import numpy as np  
from scipy.stats import t  
  
# Given data  
layout_a_clicks = [28, 32, 33, 29, 31, 34, 30, 35, 36, 37]  
layout_b_clicks = [40, 41, 38, 42, 39, 44, 43, 41, 45, 47]  
  
def ab_test_analysis(layout_a, layout_b):  
    # Convert lists to numpy arrays for easier calculations  
    layout_a = np.array(layout_a)  
    layout_b = np.array(layout_b)  
      
    # Calculate means  
    mean_a = np.mean(layout_a)  
    mean_b = np.mean(layout_b)  
      
    # Calculate standard deviations  
    std_a = np.std(layout_a, ddof=1)  # Use ddof=1 for sample standard deviation  
    std_b = np.std(layout_b, ddof=1)  
      
    # Calculate sample sizes  
    n_a = len(layout_a)  
    n_b = len(layout_b)  
      
    # Calculate the t-statistic  
    t_stat = (mean_a - mean_b) / np.sqrt((std_a**2 / n_a) + (std_b**2 / n_b))  
      
    # Calculate degrees of freedom  
    df = ((std_a**2 / n_a) + (std_b**2 / n_b))**2 / (((std_a**2 / n_a)**2 / (n_a - 1)) + ((std_b**2 / n_b)**2 / (n_b - 1)))  
      
    # Calculate the p-value (two-tailed test)  
    p_value = 2 * (1 - t.cdf(abs(t_stat), df))  
      
    return t_stat, df, p_value  
  
# Perform the A/B test analysis  
t_stat, df, p_value = ab_test_analysis(layout_a_clicks, layout_b_clicks)  
  
# Output the results  
print("T-statistic:", t_stat)  
print("Degrees of freedom:", df)  
print("P-value:", p_value)  
  
# Decision based on p-value  
alpha = 0.05  
if p_value < alpha:  
    print("Reject the null hypothesis: There is a significant difference in click-through rates between the two layouts.")  
else:  
    print("Fail to reject the null hypothesis: There is no significant difference in click-through rates between the two layouts.")  


T-statistic: -7.298102156175071
Degrees of freedom: 17.879871863320876
P-value: 9.19659607134804e-07
Reject the null hypothesis: There is a significant difference in click-through rates between the two layouts.


# 16. A pharmaceutical company wants to determine if a new drug is more effective than an existing drug in reducing cholesterol levels. Create a program to analyze the clinical trial data and calculate the t-statistic and p-value for the treatment effect.

In [29]:
import numpy as np  
from scipy.stats import t  
  
# Given data  
existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]  
new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]  
  
def analyze_clinical_trial(existing_levels, new_levels):  
    # Convert lists to numpy arrays for easier calculations  
    existing_levels = np.array(existing_levels)  
    new_levels = np.array(new_levels)  
      
    # Calculate means  
    mean_existing = np.mean(existing_levels)  
    mean_new = np.mean(new_levels)  
      
    # Calculate standard deviations  
    std_existing = np.std(existing_levels, ddof=1)  # Use ddof=1 for sample standard deviation  
    std_new = np.std(new_levels, ddof=1)  
      
    # Calculate sample sizes  
    n_existing = len(existing_levels)  
    n_new = len(new_levels)  
      
    # Calculate the t-statistic  
    t_stat = (mean_existing - mean_new) / np.sqrt((std_existing**2 / n_existing) + (std_new**2 / n_new))  
      
    # Calculate degrees of freedom  
    df = ((std_existing**2 / n_existing) + (std_new**2 / n_new))**2 / (((std_existing**2 / n_existing)**2 / (n_existing - 1)) + ((std_new**2 / n_new)**2 / (n_new - 1)))  
      
    # Calculate the p-value (one-tailed test)  
    p_value = 1 - t.cdf(t_stat, df)  
      
    return t_stat, df, p_value  
  
# Perform the clinical trial analysis  
t_stat, df, p_value = analyze_clinical_trial(existing_drug_levels, new_drug_levels)  
  
# Output the results  
print("T-statistic:", t_stat)  
print("Degrees of freedom:", df)  
print("P-value:", p_value)  
  
# Decision based on p-value  
alpha = 0.05  
if p_value < alpha:  
    print("Reject the null hypothesis: The new drug is more effective in reducing cholesterol levels than the existing drug.")  
else:  
    print("Fail to reject the null hypothesis: There is no significant evidence that the new drug is more effective in reducing cholesterol levels than the existing drug.")  


T-statistic: 4.140480986208661
Degrees of freedom: 17.866770765582338
P-value: 0.0003114614472734534
Reject the null hypothesis: The new drug is more effective in reducing cholesterol levels than the existing drug.


# 17. A school district introduces an educational intervention program to improve math scores. Write a Python function to analyze pre- and post-intervention test scores, calculating the t-statistic and p-value to determine if the intervention had a significant impact.

In [32]:
from scipy import stats  
  
def analyze_intervention(pre_scores, post_scores):  
    # Check if the lengths of the score lists are equal  
    if len(pre_scores) != len(post_scores):  
        raise ValueError("The lengths of pre-intervention and post-intervention scores must be the same.")  
      
    # Perform paired t-test  
    t_statistic, p_value = stats.ttest_rel(pre_scores, post_scores)  
      
    return t_statistic, p_value  
  
# Data  
pre_intervention_scores = [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]  
post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]  
  
# Analyze the intervention  
t_stat, p_val = analyze_intervention(pre_intervention_scores, post_intervention_scores)  
  
print(f"T-statistic: {t_stat}")  
print(f"P-value: {p_val}")  
  
# Determine significance  
alpha = 0.05  
if p_val < alpha:  
    print("The intervention had a significant impact on the test scores.")  
else:  
    print("The intervention did not have a significant impact on the test scores.")  


T-statistic: -4.42840883965761
P-value: 0.0016509548165795493
The intervention had a significant impact on the test scores.


# 18. An HR department wants to investigate if there's a gender-based salary gap within the company. Develop a program to analyze salary data, calculate the t-statistic, and determine if there's a statistically significant difference between the average salaries of male and female employees.

In [33]:
import numpy as np  
from scipy import stats  
  
# Generate synthetic salary data for male and female employees  
np.random.seed(0)  # For reproducibility  
male_salaries = np.random.normal(loc=50000, scale=10000, size=20)  
female_salaries = np.random.normal(loc=55000, scale=9000, size=20)  
  
# Calculate the means and standard deviations  
mean_male = np.mean(male_salaries)  
mean_female = np.mean(female_salaries)  
std_male = np.std(male_salaries, ddof=1)  
std_female = np.std(female_salaries, ddof=1)  
  
print(f"Mean salary for males: ${mean_male:.2f}")  
print(f"Mean salary for females: ${mean_female:.2f}")  
print(f"Standard deviation for male salaries: ${std_male:.2f}")  
print(f"Standard deviation for female salaries: ${std_female:.2f}")  
  
# Perform a two-sample t-test  
t_statistic, p_value = stats.ttest_ind(male_salaries, female_salaries)  
  
print(f"T-statistic: {t_statistic:.2f}")  
print(f"P-value: {p_value:.4f}")  
  
# Determine if there is a statistically significant difference  
alpha = 0.05  
if p_value < alpha:  
    print("There is a statistically significant difference between male and female salaries.")  
else:  
    print("There is no statistically significant difference between male and female salaries.")  


Mean salary for males: $55693.35
Mean salary for females: $55501.75
Standard deviation for male salaries: $8722.69
Standard deviation for female salaries: $10968.10
T-statistic: 0.06
P-value: 0.9516
There is no statistically significant difference between male and female salaries.


# 19. A manufacturer produces two different versions o a product and wants to compare their quality scores. Create a Python function to analyze quality assessment data, calculate the t-statistic, and decide whether there's a significant difference in quality between the two versions.

In [31]:
from scipy import stats  
  
def analyze_quality(version1_scores, version2_scores):  
    # Perform independent two-sample t-test  
    t_statistic, p_value = stats.ttest_ind(version1_scores, version2_scores)  
      
    return t_statistic, p_value  
  
# Data  
version1_scores = [85, 88, 82, 89, 87, 84, 90, 88, 85, 86, 91, 83, 87, 84, 89, 86, 84, 88, 85, 86, 89, 90, 87, 88, 85]  
version2_scores = [80, 78, 83, 81, 79, 82, 76, 80, 78, 81, 77, 82, 80, 79, 82, 79, 80, 81, 79, 82, 79, 78, 80, 81, 82]  
  
# Analyze the quality scores  
t_stat, p_val = analyze_quality(version1_scores, version2_scores)  
  
print(f"T-statistic: {t_stat}")  
print(f"P-value: {p_val}")  
  
# Determine significance  
alpha = 0.05  
if p_val < alpha:  
    print("There is a significant difference in quality between the two versions.")  
else:  
    print("There is no significant difference in quality between the two versions.")  


T-statistic: 11.325830417646698
P-value: 3.6824250702873965e-15
There is a significant difference in quality between the two versions.


# 20. A restaurant chain collects customer satisfaction scores for two different branches. Write a program to analyze the scores, calculate the t-statistic, and determine if there's a statistically significant difference in customer satisfaction between the branches

In [34]:
import numpy as np  
from scipy import stats  
  
# Customer satisfaction scores  
branch_a_scores = [4, 5, 3, 4, 5, 4, 5, 3, 4, 4, 5, 4, 4, 3, 4, 5, 5, 4, 3, 4, 5, 4, 3, 5, 4, 4, 5, 3, 4, 5, 4]  
branch_b_scores = [3, 4, 2, 3, 4, 3, 4, 2, 3, 3, 4, 3, 3, 2, 3, 4, 4, 3, 2, 3, 4, 3, 2, 4, 3, 3, 4, 2, 3, 4, 3]  
  
# Convert lists to numpy arrays for easier calculations  
branch_a_scores = np.array(branch_a_scores)  
branch_b_scores = np.array(branch_b_scores)  
  
# Calculate the means and standard deviations  
mean_a = np.mean(branch_a_scores)  
mean_b = np.mean(branch_b_scores)  
std_a = np.std(branch_a_scores, ddof=1)  
std_b = np.std(branch_b_scores, ddof=1)  
  
print(f"Mean satisfaction score for Branch A: {mean_a:.2f}")  
print(f"Mean satisfaction score for Branch B: {mean_b:.2f}")  
print(f"Standard deviation for Branch A scores: {std_a:.2f}")  
print(f"Standard deviation for Branch B scores: {std_b:.2f}")  
  
# Perform a two-sample t-test  
t_statistic, p_value = stats.ttest_ind(branch_a_scores, branch_b_scores)  
  
print(f"T-statistic: {t_statistic:.2f}")  
print(f"P-value: {p_value:.4f}")  
  
# Determine if there is a statistically significant difference  
alpha = 0.05  
if p_value < alpha:  
    print("There is a statistically significant difference in customer satisfaction between the branches.")  
else:  
    print("There is no statistically significant difference in customer satisfaction between the branches.")  


Mean satisfaction score for Branch A: 4.13
Mean satisfaction score for Branch B: 3.13
Standard deviation for Branch A scores: 0.72
Standard deviation for Branch B scores: 0.72
T-statistic: 5.48
P-value: 0.0000
There is a statistically significant difference in customer satisfaction between the branches.


# 21. A political analyst wants to determine if there is a significant association between age groups and voter preferences. (Candidate A or Candidate B). They collect data from a sample of 500 voters and classify them into different age groups and candidate preferences. Perform a Chi-Square test to determine if there is a significant association between age groups and voter preferences.

In [35]:
import numpy as np  
import pandas as pd  
from scipy.stats import chi2_contingency  
  
# Generate data  
np.random.seed(0)  
age_groups = np.random.choice(['18-30', '31-50', '51+'], size=500)  
voter_preferences = np.random.choice(['Candidate A', 'Candidate B'], size=500)  
  
# Create a contingency table  
data = pd.DataFrame({'Age Group': age_groups, 'Voter Preference': voter_preferences})  
contingency_table = pd.crosstab(data['Age Group'], data['Voter Preference'])  
  
print("Contingency Table:")  
print(contingency_table)  
  
# Perform the Chi-Square test  
chi2, p, dof, expected = chi2_contingency(contingency_table)  
  
print(f"\nChi-Square Statistic: {chi2:.2f}")  
print(f"P-value: {p:.4f}")  
print(f"Degrees of Freedom: {dof}")  
  
# Determine if there is a statistically significant association  
alpha = 0.05  
if p < alpha:  
    print("There is a statistically significant association between age groups and voter preferences.")  
else:  
    print("There is no statistically significant association between age groups and voter preferences.")  


Contingency Table:
Voter Preference  Candidate A  Candidate B
Age Group                                 
18-30                      95           87
31-50                      87           82
51+                        84           65

Chi-Square Statistic: 0.88
P-value: 0.6447
Degrees of Freedom: 2
There is no statistically significant association between age groups and voter preferences.


# 22. A company conducted a customer satisfaction survey to determine if there is a significant relationship between product satisfaction levels (Satisfied, Neutral, Dissatisfied) and the region where customers are located (East, West, North, South). The survey data is summarized in a contingency table. Conduct a ChiSquare test to determine if there is a significant relationship between product satisfaction levels and customer regions.

In [36]:
import numpy as np  
from scipy.stats import chi2_contingency  
  
# Sample data: Product satisfaction levels (rows) vs. Customer regions (columns)  
data = np.array([[50, 30, 40, 20], [30, 40, 30, 50], [20, 30, 40, 30]])  
  
# Display the contingency table  
print("Contingency Table:")  
print(data)  
  
# Perform the Chi-Square test  
chi2, p, dof, expected = chi2_contingency(data)  
  
print(f"\nChi-Square Statistic: {chi2:.2f}")  
print(f"P-value: {p:.4f}")  
print(f"Degrees of Freedom: {dof}")  
print("\nExpected Frequencies:")  
print(expected)  
  
# Determine if there is a statistically significant relationship  
alpha = 0.05  
if p < alpha:  
    print("There is a statistically significant relationship between product satisfaction levels and customer regions.")  
else:  
    print("There is no statistically significant relationship between product satisfaction levels and customer regions.")  


Contingency Table:
[[50 30 40 20]
 [30 40 30 50]
 [20 30 40 30]]

Chi-Square Statistic: 27.78
P-value: 0.0001
Degrees of Freedom: 6

Expected Frequencies:
[[34.14634146 34.14634146 37.56097561 34.14634146]
 [36.58536585 36.58536585 40.24390244 36.58536585]
 [29.26829268 29.26829268 32.19512195 29.26829268]]
There is a statistically significant relationship between product satisfaction levels and customer regions.


# 23. A company implemented an employee training program to improve job performance (Effective, Neutral, Ineffective). After the training, they collected data from a sample of employees and classified them based on their job performance before and after the training. Perform a Chi-Square test to determine if there is a significant difference between job performance levels before and after the training.

In [37]:
import numpy as np  
from scipy.stats import chi2_contingency  
  
# Sample data: Job performance levels before (rows) and after (columns) training  
data = np.array([[50, 30, 20], [30, 40, 30], [20, 30, 40]])  
  
# Display the contingency table  
print("Contingency Table:")  
print(data)  
  
# Perform the Chi-Square test  
chi2, p, dof, expected = chi2_contingency(data)  
  
print(f"\nChi-Square Statistic: {chi2:.2f}")  
print(f"P-value: {p:.4f}")  
print(f"Degrees of Freedom: {dof}")  
print("\nExpected Frequencies:")  
print(expected)  
  
# Determine if there is a statistically significant difference  
alpha = 0.05  
if p < alpha:  
    print("There is a statistically significant difference between job performance levels before and after the training.")  
else:  
    print("There is no statistically significant difference between job performance levels before and after the training.")  


Contingency Table:
[[50 30 20]
 [30 40 30]
 [20 30 40]]

Chi-Square Statistic: 22.16
P-value: 0.0002
Degrees of Freedom: 4

Expected Frequencies:
[[34.48275862 34.48275862 31.03448276]
 [34.48275862 34.48275862 31.03448276]
 [31.03448276 31.03448276 27.93103448]]
There is a statistically significant difference between job performance levels before and after the training.
