## Data Analysis 1: T-test

**Random Sample of Positive and Negative Reviews for T-test**

In [17]:
import pandas as pd
import numpy as np

# Fixed seed so the positive subsample drawn below -- and therefore every
# statistic computed from it -- is identical on each Restart & Run All.
RANDOM_SEED = 42

transformed_reviews_df = pd.read_csv("transformed_reviews.csv")  # Load the transformed reviews

# Balance the two groups. All negative reviews are kept, and an equally sized
# random subset of the positive reviews is drawn (this relies on there being
# at least as many positive reviews as negative ones; otherwise .sample raises).
neg_reviews = transformed_reviews_df[transformed_reviews_df['SentimentLabels'] == 'NEGATIVE']
pos_reviews = transformed_reviews_df[transformed_reviews_df['SentimentLabels'] == 'POSITIVE'].sample(
    n=neg_reviews.shape[0],
    random_state=RANDOM_SEED,
)

**Normality Test**

In [18]:
from scipy import stats
import matplotlib.pyplot as plt

# Helpfulness numerator values for each sentiment group
pos_numerator = pos_reviews['HelpfulnessNumerator']
neg_numerator = neg_reviews['HelpfulnessNumerator']


def report_normality(label, values, suffix=""):
    """Run scipy's normality test on `values` and print the outcome.

    Parameters
    ----------
    label : str
        Group name used at the start of each printed line (e.g. "Positive").
    values : array-like
        Sample passed to ``stats.normaltest``.
    suffix : str, optional
        Text appended to the printed description, before the colon.

    Returns
    -------
    The result object returned by ``stats.normaltest`` (exposes
    ``.statistic`` and ``.pvalue``).
    """
    result = stats.normaltest(values)
    print(f"{label} reviews normality test statistic{suffix}:", result.statistic)
    print(f"{label} reviews normality test p-value{suffix}:", result.pvalue)
    return result


# Normality test on the raw values
pos_num_normal = report_normality("Positive", pos_numerator)
neg_num_normal = report_normality("Negative", neg_numerator)
print("\n")

# Log-transform the values. log1p(x) computes log(1 + x); the +1 keeps any
# zero counts finite, since log(0) is -inf.
pos_numerator_log = np.log1p(pos_numerator)
neg_numerator_log = np.log1p(neg_numerator)

# Normality test on the log-transformed values
pos_num_normal = report_normality("Positive", pos_numerator_log, " after log transform")
neg_num_normal = report_normality("Negative", neg_numerator_log, " after log transform")


Positive reviews normality test statistic: 259.95110824650385
Positive reviews normality test p-value: 3.567254232830481e-57
Negative reviews normality test statistic: 225.06914590583173
Negative reviews normality test p-value: 1.3392323863178282e-49


Positive reviews normality test statistic after log transform: 289.8967454588609
Positive reviews normality test p-value after log transform: 1.12129933025864e-63
Negative reviews normality test statistic after log transform: 558.4015432185131
Negative reviews normality test p-value after log transform: 5.554507365146226e-122


**Equal Variance Test**

In [19]:
# Levene's test for equality of variances between the positive and negative
# groups, run on the untransformed helpfulness numerators.
levene_test = stats.levene(pos_numerator, neg_numerator)
levene_stat, levene_p = levene_test  # result unpacks as (statistic, pvalue)
print("Levene's test statistic:", levene_stat)
print("Levene's test p-value:", levene_p)

Levene's test statistic: 0.13455138654200047
Levene's test p-value: 0.7137806098101553


**Perform the T-test**

In [20]:
# Independent two-sample t-test on the untransformed helpfulness numerators.
# ttest_ind is called with its defaults here.
ttest_result = stats.ttest_ind(pos_numerator, neg_numerator)
t_stat, p_val = ttest_result  # result unpacks as (statistic, pvalue)

# Mean and standard deviation per group: positive first, then negative
for group_values in (pos_numerator, neg_numerator):
    print(group_values.mean(), group_values.std())

print("t_stat:", t_stat)
print("p_val:", p_val)

1.9619899385131359 1.1978297199226733
1.9390721073225266 1.2165798273608066
t_stat: 0.5677663491979474
p_val: 0.570229275891512
