## Data Analysis 1: T-test

**Random Sample of Positive and Negative Reviews for T-test**

In [245]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Fixed seed so the random down-sampling below (and every statistic computed
# from it in later cells) is reproducible across kernel restarts.
RANDOM_SEED = 42

# Load the pre-processed review data.
transformed_reviews_df = pd.read_csv("transformed_reviews.csv")

# Balance the two groups: keep every NEGATIVE review and draw an equally sized
# random sample of POSITIVE reviews. Per the original author's note, negative
# reviews are the smaller class; DataFrame.sample (without replacement) raises
# ValueError if more rows are requested than there are positive reviews.
is_negative = transformed_reviews_df['SentimentLabels'] == 'NEGATIVE'
is_positive = transformed_reviews_df['SentimentLabels'] == 'POSITIVE'
neg_reviews = transformed_reviews_df[is_negative]
pos_reviews = transformed_reviews_df[is_positive].sample(
    n=len(neg_reviews), random_state=RANDOM_SEED
)

# Helpfulness numerator values of each group.
pos_numerator = pos_reviews['HelpfulnessNumerator']
neg_numerator = neg_reviews['HelpfulnessNumerator']

**Normality Test**

In [246]:
# Normality test (scipy.stats.normaltest) on the raw helpfulness numerators.
pos_num_normal = stats.normaltest(pos_numerator)
neg_num_normal = stats.normaltest(neg_numerator)
print("Positive reviews normality test statistic:", pos_num_normal.statistic)
print("Positive reviews normality test p-value:", pos_num_normal.pvalue)
print("Negative reviews normality test statistic:", neg_num_normal.statistic)
print("Negative reviews normality test p-value:", neg_num_normal.pvalue)
print("\n")

# Cube-root transform of both groups.
# NOTE: the `_log` suffix is kept only because later cells reference these
# names; the values are cube roots, not logarithms. (A previously considered
# alternative, np.log(x + 1), where the +1 avoids log(0), is no longer used.)
pos_numerator_log = np.cbrt(pos_numerator)
neg_numerator_log = np.cbrt(neg_numerator)

# Repeat the normality test on the transformed values. This overwrites
# pos_num_normal / neg_num_normal with the post-transform results.
pos_num_normal = stats.normaltest(pos_numerator_log)
neg_num_normal = stats.normaltest(neg_numerator_log)
print("Positive reviews normality test statistic after cbrt transform:", pos_num_normal.statistic)
print("Positive reviews normality test p-value after cbrt transform:", pos_num_normal.pvalue)
print("Negative reviews normality test statistic after cbrt transform:", neg_num_normal.statistic)
print("Negative reviews normality test p-value after cbrt transform:", neg_num_normal.pvalue)


Positive reviews normality test statistic: 265.22817072074014
Positive reviews normality test p-value: 2.549379594474762e-58
Negative reviews normality test statistic: 225.06914590583173
Negative reviews normality test p-value: 1.3392323863178282e-49


Positive reviews normality test statistic after log transform: 260.5436468932853
Positive reviews normality test p-value after log transform: 2.652564357093661e-57
Negative reviews normality test statistic after log transform: 506.88766778531544
Negative reviews normality test p-value after log transform: 8.52592421560746e-111


**Equal Variance Test**

In [247]:
# Levene's test for equal variances between the positive and negative groups,
# first on the raw helpfulness numerators.
# NOTE(review): in the recorded run this statistic equals the square of the
# t-test statistic and the two p-values match. That is consistent with e.g.
# both groups sharing a median that is also their minimum, in which case this
# check would reflect the difference in means rather than in spread - confirm.
ev_test = stats.levene(pos_numerator, neg_numerator)
for caption, value in (
    ("Levene's test statistic:", ev_test.statistic),
    ("Levene's test p-value:", ev_test.pvalue),
):
    print(caption, value)
print("\n")

# Same test on the transformed values. Despite the `_log` suffix, these
# variables hold the cube-root-transformed numerators.
ev_test_log = stats.levene(pos_numerator_log, neg_numerator_log)
for caption, value in (
    ("Levene's test statistic:", ev_test_log.statistic),
    ("Levene's test p-value:", ev_test_log.pvalue),
):
    print(caption, value)

Levene's test statistic: 0.10097051477186361
Levene's test p-value: 0.7506866252805784


Levene's test statistic: 0.389436873853599
Levene's test p-value: 0.5326353597681985


**Perform the T-test**

In [248]:
# Independent two-sample t-test on the raw helpfulness numerators
# (positive group as `a`, negative group as `b`; SciPy defaults otherwise).
t_stat, p_val = stats.ttest_ind(a=pos_numerator, b=neg_numerator)

# Mean and standard deviation of each group: positive first, then negative.
for group in (pos_numerator, neg_numerator):
    print(group.mean(), group.std())

print("t_stat:", t_stat)
print("p_val:", p_val)

1.951928451648966 1.2035243377543343
1.9390721073225266 1.2165798273608066
t_stat: 0.3177585793835685
p_val: 0.7506866252807513


**Perform the Mann-Whitney U-test**

In [249]:
# Mann-Whitney U test on the raw helpfulness numerators.
# `alternative` is set explicitly: older SciPy releases defaulted to a
# deprecated mode that reported half of the two-sided p-value, so relying on
# the default makes the result depend on the installed SciPy version. A
# two-sided test also matches the two-sided t-test run on the same data.
# NOTE: `p_val` is reassigned here and no longer holds the t-test p-value.
u_stat, p_val = stats.mannwhitneyu(
    pos_numerator, neg_numerator, alternative="two-sided"
)
print(pos_numerator.mean(), pos_numerator.std()) # Positive numerator mean and std
print(neg_numerator.mean(), neg_numerator.std()) # Negative numerator mean and std
print("u_stat:", u_stat)
print("p_val:", p_val)

1.951928451648966 1.2035243377543343
1.9390721073225266 1.2165798273608066
u_stat: 1627772.5
p_val: 0.33314657800496394
