# Statistics_Q8-Q20

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import statistics as st
from scipy.stats import skew

# Q.8  Write a Python function to calculate the probability density function (PDF) of a continuous random variable for a given normal distribution.

In [2]:
import scipy.stats as stats

def normal_pdf(x, mu, sigma):
  """
  Calculates the probability density function (PDF) of a normal distribution.

  Args:
      x: The value of the random variable.
      mu: The mean of the normal distribution.
      sigma: The standard deviation of the normal distribution.

  Returns:
      The probability density function (PDF) value for the given x.
  """

  return stats.norm.pdf(x, loc=mu, scale=sigma)

# Example usage
mu = 50  # Mean of the normal distribution
sigma = 10  # Standard deviation of the normal distribution
x_value = 65  # Value for which to calculate PDF

pdf_value = normal_pdf(x_value, mu, sigma)
print("PDF value for x =", x_value, ":", pdf_value)


PDF value for x = 65 : 0.012951759566589175


# Q.9  Create a program to calculate the cumulative distribution function (CDF) of exponential distribution.

In [5]:

def exponential_cdf(x, lam):
  """
  Calculates the cumulative distribution function (CDF) of the exponential distribution.

  Args:
      x: The value of the random variable.
      lam: The rate parameter of the exponential distribution.

  Returns:
      The CDF value for the given x.
  """

  if x < 0:
    return 0  # CDF is 0 for negative values in exponential distribution
  else:
    return stats.expon.cdf(x, scale=1/lam)  # Scale parameter is 1/lambda

# Example usage
lam = 0.5  # Rate parameter of the exponential distribution
x_value = 2  # Value for which to calculate CDF

cdf_value = exponential_cdf(x_value, lam)
print("CDF value for x =", x_value, ":", cdf_value)


CDF value for x = 2 : 0.6321205588285577


# Q.10  Write a Python function to calculate the probability mass function (PMF) of Poisson distribution

In [6]:
def poisson_pmf(k, lam):
  """
  Calculates the probability mass function (PMF) of the Poisson distribution.

  Args:
      k: The number of events (non-negative integer).
      lam: The average rate of events (lambda).

  Returns:
      The PMF value for the given k.
  """

  # Handle non-negative integer input for k
  if not isinstance(k, int) or k < 0:
    raise ValueError("k must be a non-negative integer.")

  return stats.poisson.pmf(k, lam)

# Example usage
lam = 3  # Average rate of events (lambda)
k = 2  # Number of events

pmf_value = poisson_pmf(k, lam)
print("PMF value for k =", k, ":", pmf_value)


PMF value for k = 2 : 0.22404180765538775


# Q.11  A company wants to test if a new website layout leads to a higher conversion rate (percentage of visitors 
# who make a purchase). They collect data from the old and new layouts to compare.
# To generate the data use the following command:
 ```python
 import numpy as np
 # 50 purchases out of 1000 visitors
 old_layout = np.array([1] * 50 + [0] * 950)
 # 70 purchases out of 1000 visitors  
new_layout = np.array([1] * 70 + [0] * 930)
 ```
# Apply z-test to find which layout is successful.

In [7]:
def generate_purchase_data(conversion_rate, visitors):
  """
  Generates purchase data as an array with 1s for purchases and 0s for no purchases.

  Args:
      conversion_rate: The conversion rate (percentage of purchases / visitors).
      visitors: The total number of visitors.

  Returns:
      A NumPy array representing the purchase data.
  """

  num_purchases = int(conversion_rate * visitors)
  return np.array([1] * num_purchases + [0] * (visitors - num_purchases))

old_layout = generate_purchase_data(0.05, 1000)  # 5% conversion rate for old layout
new_layout = generate_purchase_data(0.07, 1000)  # 7% conversion rate for new layout

# Calculate conversion rates (optional, for clarity)
old_conversion_rate = np.mean(old_layout)
new_conversion_rate = np.mean(new_layout)
print("Old Layout Conversion Rate:", old_conversion_rate)
print("New Layout Conversion Rate:", new_conversion_rate)

# Z-test for proportions
prop_old = np.mean(old_layout)
prop_new = np.mean(new_layout)
n_old = len(old_layout)
n_new = len(new_layout)

pooled_prop = (prop_old * n_old + prop_new * n_new) / (n_old + n_new)
pooled_std = np.sqrt(pooled_prop * (1 - pooled_prop) * (1 / n_old + 1 / n_new))

z_statistic = (prop_new - prop_old) / pooled_std
print("Z-statistic:", z_statistic)

# Significance level (alpha) - adjust based on your desired level (commonly 0.05)
alpha = 0.05

# Calculate p-value using the standard normal cumulative distribution function (cdf)
p_value = 1 - stats.norm.cdf(z_statistic)

# Hypothesis testing
if p_value < alpha:
  print("Reject null hypothesis. New layout has a statistically significant higher conversion rate.")
else:
  print("Fail to reject null hypothesis. There is not enough evidence to conclude the new layout is better.")


Old Layout Conversion Rate: 0.05
New Layout Conversion Rate: 0.07
Z-statistic: 1.883108942886774
Reject null hypothesis. New layout has a statistically significant higher conversion rate.


# Q.12 A tutoring service claims that its program improves students' exam scores. A sample of students who 
# participated in the program was taken, and their scores before and after the program were recorded.
 Use the below code to generate samples of respective arrays of marks:
 ```python
 before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])
 after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])
 ```
 Use z-test to find if the claims made by tutor are true or false

In [9]:
# Sample exam scores (before and after program)
before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])
after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])

# Calculate means and standard deviations
mean_before = np.mean(before_program)
mean_after = np.mean(after_program)
std_before = np.std(before_program)
std_after = np.std(after_program)

# Sample sizes
n_before = len(before_program)
n_after = len(after_program)

# Pooled standard deviation
pooled_std = np.sqrt(((n_before - 1) * std_before**2 + (n_after - 1) * std_after**2) / (n_before + n_after - 2))

# Z-statistic
z_statistic = (mean_after - mean_before) / pooled_std

# Significance level (alpha)
alpha = 0.05  # Common significance level

# Calculate p-value using the standard normal cumulative distribution function (cdf)
p_value = 1 - stats.norm.cdf(z_statistic)

# Hypothesis testing
print("Z-statistic:", z_statistic)
print("p-value:", p_value)

if p_value < alpha:
  print("Reject null hypothesis. The program seems to have a statistically significant positive effect on exam scores.")
else:
  print("Fail to reject null hypothesis. There is not enough evidence to conclude the program improves scores.")


Z-statistic: 0.6411276714876429
p-value: 0.2607198685679841
Fail to reject null hypothesis. There is not enough evidence to conclude the program improves scores.


# Q.13  A pharmaceutical company wants to determine if a new drug is effective in reducing blood pressure. They 
# conduct a study and record blood pressure measurements before and after administering the drug.
 Use the below code to generate samples of respective arrays of blood pressure:
 ```python
 before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])
 after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])
 ```
 Implement z-test to find if the drug really works or not

In [10]:
# Sample blood pressure readings (before and after drug)
before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])
after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])

# Calculate means and standard deviations
mean_before = np.mean(before_drug)
mean_after = np.mean(after_drug)
std_before = np.std(before_drug)
std_after = np.std(after_drug)

# Sample sizes
n_before = len(before_drug)
n_after = len(after_drug)

# Pooled standard deviation
pooled_std = np.sqrt(((n_before - 1) * std_before**2 + (n_after - 1) * std_after**2) / (n_before + n_after - 2))

# Z-statistic
z_statistic = (mean_after - mean_before) / pooled_std

# Significance level (alpha)
alpha = 0.05  # Common significance level

# Calculate p-value using the standard normal cumulative distribution function (cdf)
p_value = 1 - stats.norm.cdf(z_statistic)

# Hypothesis testing
print("Z-statistic:", z_statistic)
print("p-value:", p_value)

if p_value < alpha:
  print("Reject null hypothesis. The drug seems to have a statistically significant effect in reducing blood pressure.")
else:
  print("Fail to reject null hypothesis. There is not enough evidence to conclude the drug is effective.")


Z-statistic: -1.2443085882494078
p-value: 0.893306994290019
Fail to reject null hypothesis. There is not enough evidence to conclude the drug is effective.


# Q.14  A customer service department claims that their average response time is less than 5 minutes. A sample 
# of recent customer interactions was taken, and the response times were recorded.
 Implement the below code to generate the array of response time:
 ```python
 response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])
 ```
 Implement z-test to find the claims made by customer service department are tru or false.

In [11]:
# Sample response times (in minutes)
response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])

# Average response time (claim to be less than 5 minutes)
mean_response_time = 5

# Sample size
n = len(response_times)

# Standard deviation (assuming it's unknown)
std_dev = np.std(response_times)

# Z-statistic for one-tailed test (less than)
z_statistic = (mean_response_time - np.mean(response_times)) / std_dev

# Significance level (alpha)
alpha = 0.05  # Common significance level

# Calculate p-value using the standard normal cumulative distribution function (cdf)
# Since it's a one-tailed test, calculate the probability for the upper tail
p_value = 1 - stats.norm.cdf(z_statistic)

# Hypothesis testing
print("Z-statistic:", z_statistic)
print("p-value:", p_value)

if p_value < alpha:
  print("Reject null hypothesis. There is evidence to suggest the average response time is less than 5 minutes.")
else:
  print("Fail to reject null hypothesis. There is not enough evidence to conclude the claim is true.")


Z-statistic: 1.061485742014321
p-value: 0.14423460458378923
Fail to reject null hypothesis. There is not enough evidence to conclude the claim is true.


# Q.15  A company is testing two different website layouts to see which one leads to higher click-through rates. 
# Write a Python function to perform an A/B test analysis, including calculating the t-statistic, degrees of 
# freedom, and p-value. Use the following data:
 ```python
 layout_a_clicks = [28, 32, 33, 29, 31, 34, 30, 35, 36, 37]
 layout_b_clicks = [40, 41, 38, 42, 39, 44, 43, 41, 45, 47]

In [12]:
def ab_test_analysis(data_a, data_b):
  """
  Performs A/B test analysis for click-through rates.

  Args:
      data_a: An array of click-through counts for layout A.
      data_b: An array of click-through counts for layout B.

  Returns:
      A dictionary containing the t-statistic, degrees of freedom, and p-value.
  """

  # Convert data to NumPy arrays if necessary
  data_a = np.asarray(data_a)
  data_b = np.asarray(data_b)

  # Check if data lengths are equal
  if len(data_a) != len(data_b):
    raise ValueError("Data arrays for layouts A and B must have the same length.")

  # Calculate means and standard deviations
  mean_a = np.mean(data_a)
  mean_b = np.mean(data_b)
  std_a = np.std(data_a)
  std_b = np.std(data_b)

  # Calculate pooled standard deviation
  pooled_std = np.sqrt(((len(data_a) - 1) * std_a**2 + (len(data_b) - 1) * std_b**2) / (len(data_a) + len(data_b) - 2))

  # Calculate degrees of freedom
  df = len(data_a) + len(data_b) - 2

  # Calculate t-statistic
  t_statistic = (mean_a - mean_b) / (pooled_std * np.sqrt(1 / len(data_a) + 1 / len(data_b)))

  # Calculate p-value (two-tailed test)
  p_value = 2 * (1 - stats.t.cdf(abs(t_statistic), df))  # Two-tailed test

  # Return results
  return {
      "t_statistic": t_statistic,
      "degrees_of_freedom": df,
      "p_value": p_value
  }

# Example usage (replace with your actual data if needed)
layout_a_clicks = [28, 32, 33, 29, 31, 34, 30, 35, 36, 37]
layout_b_clicks = [40, 41, 38, 42, 39, 44, 43, 41, 45, 47]

results = ab_test_analysis(layout_a_clicks, layout_b_clicks)
print("T-statistic:", results["t_statistic"])
print("Degrees of freedom:", results["degrees_of_freedom"])
print("p-value:", results["p_value"])


T-statistic: -7.692875136699702
Degrees of freedom: 18
p-value: 4.260288652968569e-07


# Q.16 A pharmaceutical company wants to determine if a new drug is more effective than an existing drug in 
# reducing cholesterol levels. Create a program to analyze the clinical trial data and calculate the t statistic and p-value for the treatment effect.
Use the following data of cholestrol level:
 ```python
 existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]
 new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]

In [13]:
# Existing drug cholesterol levels
existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]

# New drug cholesterol levels
new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]

# Convert data to NumPy arrays for efficient calculations
existing_drug_levels = np.asarray(existing_drug_levels)
new_drug_levels = np.asarray(new_drug_levels)

# Check if data lengths are equal (assumption for independent samples t-test)
if len(existing_drug_levels) != len(new_drug_levels):
  raise ValueError("Sample sizes for existing and new drug groups must be equal.")

# Calculate means and standard deviations
existing_mean = np.mean(existing_drug_levels)
new_mean = np.mean(new_drug_levels)
existing_std = np.std(existing_drug_levels)
new_std = np.std(new_drug_levels)

# Pooled standard deviation (assuming equal variances)
pooled_std = np.sqrt(((len(existing_drug_levels) - 1) * existing_std**2 + (len(new_drug_levels) - 1) * new_std**2) / (len(existing_drug_levels) + len(new_drug_levels) - 2))

# Degrees of freedom (assuming equal variances)
df = len(existing_drug_levels) + len(new_drug_levels) - 2

# T-statistic for independent samples (assuming equal variances)
t_statistic = (existing_mean - new_mean) / (pooled_std * np.sqrt(1 / len(existing_drug_levels) + 1 / len(new_drug_levels)))

# P-value (two-tailed test)
p_value = 2 * (1 - stats.t.cdf(abs(t_statistic), df))  # Two-tailed test

# Print results
print("T-statistic:", t_statistic)
print("Degrees of freedom:", df)
print("p-value:", p_value)

# Interpretation (optional)
if p_value < 0.05:  # Adjust significance level based on your needs
  print("Reject null hypothesis. The new drug seems to have a statistically significant effect in reducing cholesterol levels compared to the existing drug.")
else:
  print("Fail to reject null hypothesis. There is not enough evidence to conclude the new drug is definitively more effective.")


T-statistic: 4.364450175013196
Degrees of freedom: 18
p-value: 0.00037373609400592755
Reject null hypothesis. The new drug seems to have a statistically significant effect in reducing cholesterol levels compared to the existing drug.


# Q.17  A school district introduces an educational intervention program to improve math scores. Write a Python function to analyze pre- and post-intervention test scores, calculating the t-statistic and p-value to determine if the intervention had a significant impact.
 Use the following data of test score:
 ```python
 pre_intervention_scores = [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]
 post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]

In [15]:
def intervention_analysis(pre_scores, post_scores):
  """
  Analyzes pre- and post-intervention test scores to assess the intervention effect.

  Args:
      pre_scores: An array of pre-intervention test scores.
      post_scores: An array of post-intervention test scores.

  Returns:
      A dictionary containing the t-statistic, degrees of freedom, and p-value.
  """

  # Convert data to NumPy arrays if necessary
  pre_scores = np.asarray(pre_scores)
  post_scores = np.asarray(post_scores)

  # Check if data lengths are equal (paired samples assumption)
  if len(pre_scores) != len(post_scores):
    raise ValueError("Sample sizes for pre- and post-intervention scores must be equal.")

  # Calculate means and standard deviations
  pre_mean = np.mean(pre_scores)
  post_mean = np.mean(post_scores)
  pre_std = np.std(pre_scores)
  post_std = np.std(post_scores)

  # Paired samples t-statistic
  t_statistic = (post_mean - pre_mean) / (np.sqrt(0.5 * (pre_std**2 + post_std**2)))  # Assuming equal variances

  # Degrees of freedom (paired samples)
  df = len(pre_scores) - 1

  # P-value (two-tailed test)
  p_value = 2 * (1 - stats.t.cdf(abs(t_statistic), df))  # Two-tailed test

  # Return results
  return {
      "t_statistic": t_statistic,
      "degrees_of freedom": df,
      "p_value": p_value
  }

# Example usage (replace with your actual data if needed)
pre_intervention_scores = [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]
post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]

results = intervention_analysis(pre_intervention_scores, post_intervention_scores)
print("T-statistic:", results["t_statistic"])
print("Degrees of freedom:", results["degrees_of freedom"])
print("p-value:", results["p_value"])


T-statistic: 1.9234978538484915
Degrees of freedom: 9
p-value: 0.08657000537640602


# Q.18  A political analyst wants to determine if there is a significant association between age groups and voter preferences (Candidate A or Candidate B). They collect data from a sample of 500 voters and classify them into different age groups and candidate preferences. Perform a Chi-Square test to determine if there is a significant association between age groups and voter preferences.
 Use the below code to generate data:
 ```python
 np.random.seed(0)
 age_groups = np.random.choice(['18-30', '31-50', '51+', '51+'], size=30)
 voter_preferences = np.random.choice(['Candidate A', 'Candidate B'], size=30)

In [17]:
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

np.random.seed(0)  # For reproducibility

# Generate data (500 voters)
age_groups = np.random.choice(['18-30', '31-50', '51+'], size=500)
voter_preferences = np.random.choice(['Candidate A', 'Candidate B'], size=500)

# Create contingency table using pandas.crosstab
contingency_table = pd.crosstab(age_groups, voter_preferences)
print(contingency_table)  # Print contingency table for reference (optional)

# Perform Chi-Square test
chi2, pval, degrees_of_freedom, expected_table = chi2_contingency(contingency_table)

# Print test results
print("\nChi-Square statistic:", chi2)
print("p-value:", pval)
print("Degrees of freedom:", degrees_of_freedom)

# Interpretation (replace with your own based on significance level)
if pval < 0.05:
    print("There is a statistically significant association between age groups and voter preferences.")
else:
    print("We cannot conclude a statistically significant association based on this sample.")


col_0  Candidate A  Candidate B
row_0                          
18-30           95           87
31-50           87           82
51+             84           65

Chi-Square statistic: 0.8779923945254768
p-value: 0.6446832311860852
Degrees of freedom: 2
We cannot conclude a statistically significant association based on this sample.


# Q.19 A company conducted a customer satisfaction survey to determine if there is a significant relationship between product satisfaction levels (Satisfied, Neutral, Dissatisfied) and the region where customers are located (East, West, North, South). The survey data is summarized in a contingency table. Conduct a ChiSquare test to determine if there is a significant relationship between product satisfaction levels and customer regions.
 Sample data:
 ```python
 ```
 #Sample data: Product satisfaction levels (rows) vs. Customer regions (columns)
 data = np.array([[50, 30, 40, 20], [30, 40, 30, 50], [20, 30, 40, 30]])

In [18]:
import numpy as np
from scipy.stats import chi2_contingency

# Provided data (reshape if necessary)
data = np.array([[50, 30, 40, 20], [30, 40, 30, 50], [20, 30, 40, 30]])

# Perform Chi-Square test
chi2, pval, degrees_of_freedom, expected_table = chi2_contingency(data)

# Print test results
print("Chi-Square statistic:", chi2)
print("p-value:", pval)
print("Degrees of freedom:", degrees_of_freedom)

# Interpretation (replace with your own based on significance level)
if pval < 0.05:
    print("There is a statistically significant association between product satisfaction levels and customer regions.")
else:
    print("We cannot conclude a statistically significant association based on this sample.")


Chi-Square statistic: 27.777056277056275
p-value: 0.00010349448486004387
Degrees of freedom: 6
There is a statistically significant association between product satisfaction levels and customer regions.


# Q.20  A company produces three different versions of a product: Standard, Premium, and Deluxe. The company wants to determine if there is a significant difference in customer satisfaction scores among the three product versions. They conducted a survey and collected customer satisfaction scores for each version from a random sample of customers. Perform an ANOVA test to determine if there is a significant difference in customer satisfaction scores.
 Use the following data:
 ```python
 # Sample data: Customer satisfaction scores for each product version
 standard_scores = [80, 85, 90, 78, 88, 82, 92, 78, 85, 87]
 ```
 premium_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]
 deluxe_scores = [95, 98, 92, 97, 96, 94, 98, 97, 92, 99]