In [None]:
# T-TEST and ANOVA Exercise - Jupyter Notebook

import pandas as pd
import numpy as np
from scipy import stats

# Load the two input tables from their fixed locations.
feedback_df = pd.read_csv('/mnt/data/customer_feedback.csv')
sales_df = pd.read_csv('/mnt/data/sales_data.csv')

# Parse each table's 'date' column into datetime values (feedback first,
# then sales, matching the original statement order).
for _frame in (feedback_df, sales_df):
    _frame['date'] = pd.to_datetime(_frame['date'])

# Preview each table: its shape, then head() and tail().
for _label, _frame in (
    ("Customer Feedback Data Shape:", feedback_df),
    ("\nSales Data Shape:", sales_df),
):
    print(_label, _frame.shape)
    print(_frame.head())
    print(_frame.tail())

# Drop the loop temporaries so they do not linger as notebook globals.
del _label, _frame

# Function 1: Feedback Analysis
def feedback_analysis(df_feedback):
    """Compare iOS and Android feedback scores with a two-sample t-test.

    Rows are split on the 'product' column ('iOS' vs. 'Android'); the
    'feedback_score' values of the two groups are passed to
    ``stats.ttest_ind`` with ``equal_var=False``. Rows with any other
    product value are ignored.

    Args:
        df_feedback: DataFrame with 'product' and 'feedback_score' columns.

    Returns:
        Tuple ``(statistic, p_value)``. Both values are also printed.
    """
    platform = df_feedback['product']
    ios_scores = df_feedback.loc[platform == 'iOS', 'feedback_score'].values
    android_scores = df_feedback.loc[platform == 'Android', 'feedback_score'].values

    # equal_var=False: the two groups are not assumed to share a variance.
    outcome = stats.ttest_ind(ios_scores, android_scores, equal_var=False)
    statistic, p_val = outcome

    print("Feedback analysis t-test statistic:", statistic)
    print("Feedback analysis pvalue:", p_val)

    return statistic, p_val

# Run feedback analysis
feedback_stat, feedback_p = feedback_analysis(feedback_df)

# Interpretation (Markdown Cell)
# The closing sentence is chosen from the computed p-value (0.05 level) so the
# written conclusion cannot contradict the numbers reported above it.
if feedback_p < 0.05:
    feedback_conclusion = (
        "Since the p-value is less than 0.05, we reject the null hypothesis. "
        "This means we **do** have sufficient evidence to say that average "
        "customer satisfaction is significantly different between iOS and "
        "Android apps."
    )
else:
    feedback_conclusion = (
        "Since the p-value is greater than 0.05, we fail to reject the null "
        "hypothesis. This means we do **not** have sufficient evidence to say "
        "that average customer satisfaction is significantly different "
        "between iOS and Android apps."
    )

# The formatted text is left as a bare expression (not printed), as before.
"""
### Interpretation: Feedback Analysis

The t-test compares feedback scores between iOS and Android users. 

- Test Statistic: {:.3f}
- P-value: {:.5f}

{}
""".format(feedback_stat, feedback_p, feedback_conclusion)

# Function 2: Sales Analysis
def sales_analysis(df_sales, campaign_start='2023-03-01', after_start='2023-04-01'):
    """Compare sales before and after the marketing campaign with a t-test.

    Rows dated strictly before ``campaign_start`` form the "before" group and
    rows dated on or after ``after_start`` form the "after" group; rows in
    between (the campaign itself) are left out. The groups are compared with
    ``stats.ttest_ind`` using ``equal_var=False``.

    Args:
        df_sales: DataFrame with a datetime 'date' column and a 'sales'
            column.
        campaign_start: First instant of the campaign. Defaults to
            '2023-03-01', the March 2023 campaign.
        after_start: First instant counted as post-campaign. Defaults to
            '2023-04-01'.

    Returns:
        Tuple ``(statistic, p_value)``. Both values are also printed.
    """
    dates = df_sales['date']

    before = df_sales.loc[dates < campaign_start, 'sales'].values
    # Use ">= first day after the campaign" rather than "> last campaign day":
    # the latter compares against midnight and would count rows stamped later
    # on the campaign's final day as post-campaign.
    after = df_sales.loc[dates >= after_start, 'sales'].values

    # equal_var=False: the two periods are not assumed to share a variance.
    statistic, p_val = stats.ttest_ind(before, after, equal_var=False)

    print("Sales analysis t-test statistic:", statistic)
    print("Sales analysis pvalue:", p_val)

    return statistic, p_val

# Run sales analysis
sales_stat, sales_p = sales_analysis(sales_df)

# Interpretation (Markdown Cell)
# The closing sentence is chosen from the computed p-value (0.05 level) so the
# written conclusion cannot contradict the numbers reported above it. The text
# no longer says "significantly greater", since how far p lies from 0.05 is
# not known when the sentence is written.
if sales_p < 0.05:
    sales_conclusion = (
        "Since the p-value is less than 0.05, we reject the null hypothesis. "
        "There is **statistically significant** evidence that average sales "
        "differ before and after the marketing campaign."
    )
else:
    sales_conclusion = (
        "Since the p-value is greater than 0.05, we fail to reject the null "
        "hypothesis. There is **no statistically significant** evidence that "
        "the marketing campaign had a measurable impact on sales."
    )

# The formatted text is left as a bare expression (not printed), as before.
"""
### Interpretation: Sales Analysis

The t-test compares sales figures before and after the marketing campaign in March 2023.

- Test Statistic: {:.3f}
- P-value: {:.5f}

{}
""".format(sales_stat, sales_p, sales_conclusion)

# Function 3: Seasonal Sales Analysis
def seasonal_analysis(df_sales):
    """Compare summer and winter sales with a two-sample t-test.

    Summer is June-August (months 6, 7, 8) and winter is December-February
    (months 12, 1, 2), taken from the month of the 'date' column. The 'sales'
    values of the two seasons are compared with ``stats.ttest_ind`` using
    ``equal_var=False``; rows from other months are ignored.

    Args:
        df_sales: DataFrame with a datetime 'date' column and a 'sales'
            column.

    Returns:
        Tuple ``(statistic, p_value)``. Both values are also printed.
    """
    month_of_sale = df_sales['date'].dt.month
    in_summer = month_of_sale.isin([6, 7, 8])
    in_winter = month_of_sale.isin([12, 1, 2])

    summer_sales = df_sales.loc[in_summer, 'sales'].values
    winter_sales = df_sales.loc[in_winter, 'sales'].values

    # equal_var=False: the two seasons are not assumed to share a variance.
    outcome = stats.ttest_ind(summer_sales, winter_sales, equal_var=False)
    statistic, p_val = outcome

    print("Seasonal analysis t-test statistic:", statistic)
    print("Seasonal analysis pvalue:", p_val)

    return statistic, p_val

# Run seasonal analysis
seasonal_stat, seasonal_p = seasonal_analysis(sales_df)

# Interpretation (Markdown Cell)
# The closing sentence is chosen from the computed p-value (0.05 level) so the
# written conclusion cannot contradict the numbers reported above it.
if seasonal_p < 0.05:
    seasonal_conclusion = (
        "Since the p-value is less than 0.05, we reject the null hypothesis. "
        "There is **significant seasonal variation** in sales between summer "
        "and winter."
    )
else:
    seasonal_conclusion = (
        "Since the p-value is greater than 0.05, we fail to reject the null "
        "hypothesis. There is **no significant seasonal variation** in sales "
        "between summer and winter."
    )

# The formatted text is left as a bare expression (not printed), as before.
"""
### Interpretation: Seasonal Sales Analysis

This t-test compares sales between the summer months (June-August) and winter months (December-February).

- Test Statistic: {:.3f}
- P-value: {:.5f}

{}
""".format(seasonal_stat, seasonal_p, seasonal_conclusion)

# Function 4: Feedback Consistency Analysis
def consistency_analysis(df_feedback):
    """Compare feedback scores across four months with a one-way ANOVA.

    The 'feedback_score' values recorded in January, May, September and
    December (months 1, 5, 9, 12 of the 'date' column) are passed as four
    groups to ``stats.f_oneway``. Rows from other months are ignored.

    The input DataFrame is not modified: the month is computed into a local
    Series instead of being written back as a new column on the caller's
    frame.

    Args:
        df_feedback: DataFrame with a datetime 'date' column and a
            'feedback_score' column.

    Returns:
        Tuple ``(statistic, p_value)``. Both values are also printed.
    """
    month = df_feedback['date'].dt.month
    scores = df_feedback['feedback_score']

    # One array of scores per month of interest: Jan, May, Sep, Dec.
    groups = [scores[month == month_number].values for month_number in (1, 5, 9, 12)]

    # Perform one-way ANOVA
    statistic, p_val = stats.f_oneway(*groups)

    print("Feedback consistency ANOVA statistic:", statistic)
    print("Feedback consistency pvalue:", p_val)

    return statistic, p_val

# Run consistency analysis
consistency_stat, consistency_p = consistency_analysis(feedback_df)

# Interpretation (Markdown Cell)
# The closing sentence is chosen from the computed p-value (0.05 level) so the
# written conclusion cannot contradict the numbers reported above it.
if consistency_p < 0.05:
    consistency_conclusion = (
        "Since the p-value is less than 0.05, we conclude that there is a "
        "**statistically significant difference** in average feedback scores "
        "across these months."
    )
else:
    consistency_conclusion = (
        "Since the p-value is greater than 0.05, we conclude that there is "
        "**no statistically significant difference** in average feedback "
        "scores across these months."
    )

# The formatted text is left as a bare expression (not printed), as before.
"""
### Interpretation: Feedback Consistency Analysis

The one-way ANOVA compares customer feedback scores across the months of January, May, September, and December.

- ANOVA Statistic: {:.3f}
- P-value: {:.5f}

{}
""".format(consistency_stat, consistency_p, consistency_conclusion)

# Function 5: Sales and Feedback Correlation Analysis
def corr_analysis(df_feedback, df_sales):
    """Test whether sales differ between high- and low-feedback dates.

    Feedback scores are averaged per date and sales are summed per date; the
    two daily tables are joined on 'date'. Dates whose average feedback is
    above the median of those averages form the "high" group, dates at or
    below the median form the "low" group, and the groups' sales are compared
    with ``stats.ttest_ind`` using ``equal_var=False``.

    Args:
        df_feedback: DataFrame with 'date' and 'feedback_score' columns.
        df_sales: DataFrame with 'date' and 'sales' columns.

    Returns:
        Tuple ``(statistic, p_value)``. Both values are also printed.
    """
    # Mean feedback score per date.
    daily_feedback = (
        df_feedback.groupby('date')['feedback_score']
        .mean()
        .rename('avg_feedback_score')
        .reset_index()
    )

    # Total sales per date.
    daily_sales = df_sales.groupby('date')['sales'].sum().reset_index()

    # Keep only dates present in both tables.
    merged = daily_feedback.merge(daily_sales, on='date')

    # Median split on the daily average feedback score. Both masks are spelled
    # out so that a missing average falls into neither group.
    daily_score = merged['avg_feedback_score']
    threshold = daily_score.median()
    high_feedback = merged.loc[daily_score > threshold, 'sales']
    low_feedback = merged.loc[daily_score <= threshold, 'sales']

    outcome = stats.ttest_ind(high_feedback, low_feedback, equal_var=False)
    statistic, p_val = outcome

    print("Correlation analysis t-test statistic:", statistic)
    print("Correlation analysis pvalue:", p_val)

    return statistic, p_val

# Run correlation analysis
corr_stat, corr_p = corr_analysis(feedback_df, sales_df)

# Interpretation (Markdown Cell)
# The closing sentence is chosen from the computed p-value (0.05 level) so the
# written conclusion cannot contradict the numbers reported above it.
if corr_p < 0.05:
    corr_conclusion = (
        "Since the p-value is less than 0.05, we reject the null hypothesis. "
        "There is **significant evidence** that sales differ between dates "
        "with high and low average feedback scores."
    )
else:
    corr_conclusion = (
        "Since the p-value is greater than 0.05, we fail to reject the null "
        "hypothesis. There is **no significant evidence** that higher "
        "feedback scores correlate with higher sales."
    )

# The formatted text is left as a bare expression (not printed), as before.
"""
### Interpretation: Sales and Feedback Correlation Analysis

This t-test evaluates whether higher average feedback scores are associated with significantly different sales performance.

- Test Statistic: {:.3f}
- P-value: {:.5f}

{}
""".format(corr_stat, corr_p, corr_conclusion)

#### Question-3 - Testing your code (5 pts)
**Write at least 5 test cases to test different parts of your implementation. We have provided an example. You will get full points as long as you have 5 test cases and a description of what is being tested. Please be sure to include at least 2 test cases for the perform_analysis method, including at least one test case that checks exceptions. You are welcome to write your test cases using the Python unittest library; however, it is not required.**

In [None]:
# Test that perform_analysis() handles empty data.
# Builds a Project, calls perform_analysis() on it, and prints the pass
# message; the test counts as passed if the call returns without raising.
# NOTE(review): the last constructor argument is the non-empty list [1, 2, 3].
# If that argument is the project's data, this does not exercise the
# empty-data path the test claims to cover — confirm against Project.__init__,
# which is not visible here.
my_project3 = Project(2, "Math", "University of Kansas", 109, [1, 2, 3])
my_project3.perform_analysis()
print("Test Case 7 Passed: No crash on empty data.")

Test Case 7 Passed: No crash on empty data.


### Question 6: Tests if active can be set (5 points) 


### Question 9: Tests if adding non-numerical values to data_points raises an error (3 pts)