In [29]:
import pandas as pd
import scipy.stats as st
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
import numpy as np

In [30]:
all_products = pd.read_csv('all products - Sheet1.csv')

In [31]:
column_mapping = {col: col.replace(' ', '') for col in all_products.columns}

all_products = all_products.rename(columns=column_mapping)

**Null hypothesis ($H_0$):** positive taste comments ratings = negative taste comments rating 

**Alternative hypothesis ($H_1$):** positive taste comments ratings != negative taste comments rating 

In [32]:
good_taste = all_products[all_products['taste'] == 'good']
bad_taste = all_products[all_products['taste'] == 'bad']

In [33]:
st.ttest_ind(good_taste['rating'], bad_taste['rating'])

#p-value is lower than 0.05, so we can say there is significant difference between the ratings on comments about good taste and comments about bad taste


TtestResult(statistic=6.215097394604501, pvalue=1.2862619642825903e-08, df=97.0)

**Null hypothesis ($H_0$):** positive texture comments ratings = negative texture comments rating 

**Alternative hypothesis ($H_2$):** positive texture comments ratings != negative texture comments rating 

In [34]:
good_texture = all_products[all_products['texture'] == 'good']
bad_texture = all_products[all_products['texture'] == 'bad']

In [35]:
st.ttest_ind(good_texture['rating'], bad_texture['rating'])
#p-value is lower than 0.05, so we can say there is significan difference between the ratings on comments about good texture and comments about bad texture

TtestResult(statistic=4.585006939206189, pvalue=1.779967292941973e-05, df=75.0)

**Null hypothesis ($H_0$):** ratings of comments discussing similarity with the original product = ratings of comments that don't discuss the similarity with the original product

**Alternative hypothesis ($H_3$):** ratings of comments discussing similarity with the original product != ratings of comments that don't discuss the similarity with the original product

In [42]:
similarity_comments = all_products[all_products['similarity_with_original_meat_product'].notna()]
no_similarity_comments = all_products[all_products['similarity_with_original_meat_product'].isna()]

In [45]:
st.ttest_ind(similarity_comments['rating'], no_similarity_comments['rating'])

#The p-value is higher than 0.05, so we cannot reject the null hypothesis


TtestResult(statistic=1.3909407907243831, pvalue=0.1664983556401905, df=137.0)

**Null hypothesis ($H_0$):** ratings of comments discussing price = ratings of comments that doesn't discuss price

**Alternative hypothesis ($H_4$):** ratings of comments discussing price != ratings of comments that doesn't discuss price

In [46]:
price_comments = all_products[all_products['price'].notna()]
no_price_comments = all_products[all_products['price'].isna()]

In [47]:
st.ttest_ind(price_comments['rating'], no_price_comments['rating'])

#No significant difference between the ratings on comments that mention price and the comments that don't

TtestResult(statistic=0.19001447396591886, pvalue=0.8495790431849057, df=137.0)

**Null hypothesis ($H_0$):** ratings of comments with improvements insights on taste = ratings of comments with no improvements insights on taste

**Alternative hypothesis ($H_5$):** ratings of comments with improvements insights on taste != ratings of comments with no improvements insights on taste

In [49]:
comments_taste_improvements = all_products[all_products['insights_on_improvements_on_taste'] == True]
comments_without_taste_improvements = all_products[all_products['insights_on_improvements_on_taste'] == False]

In [50]:
mean_rating_taste_improvements = all_products[all_products['insights_on_improvements_on_taste'] == True].rating.mean()
mean_rating_without_taste_improvements = all_products[all_products['insights_on_improvements_on_taste'] == False].rating.mean()

print(mean_rating_taste_improvements)
print(mean_rating_without_taste_improvements)

3.5
4.362903225806452


In [51]:
st.ttest_ind(comments_taste_improvements['rating'], comments_without_taste_improvements['rating'])

#The comments with taste improvements have significant lower rating compared to the ones that don't have taste improvements

TtestResult(statistic=-4.5592150047599835, pvalue=1.1269576061859208e-05, df=137.0)

**Null hypothesis ($H_0$):** ratings of comments with improvements insights on texture = ratings of comments with no improvements insights on texture

**Alternative hypothesis ($H_6$):** ratings of comments with improvements insights on texture != ratings of comments with no improvements insights on texture

In [52]:
comments_texture_improvements = all_products[all_products['insights_on_improvement_on_texture'] == True]
comments_without_texture_improvements = all_products[all_products['insights_on_improvement_on_texture'] == False]

In [53]:
mean_rating_texture_improvements = all_products[all_products['insights_on_improvement_on_texture'] == True].rating.mean()
mean_rating_without_texture_improvements = all_products[all_products['insights_on_improvement_on_texture'] == False].rating.mean()

print(mean_rating_texture_improvements)
print(mean_rating_without_texture_improvements)

3.5833333333333335
4.371900826446281


In [54]:
st.ttest_ind(comments_texture_improvements['rating'], comments_without_texture_improvements['rating'])

#The comments with texture improvements have significant lower rating compared to the ones that don't have taste improvements

TtestResult(statistic=-4.501038296280433, pvalue=1.4317381840392164e-05, df=137.0)

**Null hypothesis ($H_0$):** chance of repurchase of good comments about taste = chance of repurchase of bad comments about taste

**Alternative hypothesis ($H_7$):** chance of repurchase of good comments about taste != chance of repurchase of bad comments about taste

In [78]:
contingency_table = pd.crosstab(all_products['chance_of_repurchase'], all_products['taste'], margins=True, margins_name="Total")

# Extract the values from the contingency table
a = contingency_table.loc[False, 'bad']
b = contingency_table.loc[False, 'good']
c = contingency_table.loc[True, 'bad']
d = contingency_table.loc[True, 'good']

# Output the values
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")
print(f"d: {d}")

a: 2
b: 2
c: 0
d: 16


In [79]:
# Contigency table:

#Chance of repurchase               Taste  
#                               bad        good
# false                         a           b
# true                          c           d

# Replace these values with your actual data
a = 2 # Count in Category1 and Group1
b = 2  # Count in Category1 and Group2
c = 0  # Count in Category2 and Group1
d = 16 # Count in Category2 and Group2

# Create a 2x2 contingency table
contingency_table = [[a, b], [c, d]]

# Perform Fisher's Exact Test
odds_ratio, p_value = fisher_exact(contingency_table)

print("Odds Ratio:", odds_ratio)
print("P-Value:", p_value)

#The p-value is lower tan 0.05, so we can reject the null hypothesis. There is significant difference between 
# chance of repurchase of good comments about taste and bad comments about taste

Odds Ratio: inf
P-Value: 0.031578947368421054


**Null hypothesis ($H_0$):** chance of repurchase of good comments about texture = chance of repurchase of bad comments about texture

**Alternative hypothesis ($H8$):** chance of repurchase of good comments about texture != chance of repurchase of bad comments about texture

In [76]:
contingency_table = pd.crosstab(all_products['chance_of_repurchase'], all_products['texture'], margins=True, margins_name="Total")

# Extract the values from the contingency table
a = contingency_table.loc[False, 'bad']
b = contingency_table.loc[False, 'good']
c = contingency_table.loc[True, 'bad']
d = contingency_table.loc[True, 'good']

# Output the values
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")
print(f"d: {d}")

a: 4
b: 1
c: 2
d: 13


In [77]:
# Contigency table:

#Chance of repurchase               Texture  
#                               bad        good
# false                         a           b
# true                          c           d

# Replace these values with your actual data
a = 4 # Count in Category1 and Group1
b = 1  # Count in Category1 and Group2
c = 2  # Count in Category2 and Group1
d = 23 # Count in Category2 and Group2

# Create a 2x2 contingency table
contingency_table = [[a, b], [c, d]]

# Perform Fisher's Exact Test
odds_ratio, p_value = fisher_exact(contingency_table)

print("Odds Ratio:", odds_ratio)
print("P-Value:", p_value)

#The p-value is lower than 0.05, so there is significant difference between chance of repurchase of good
# and bad comments about texture. 

Odds Ratio: 46.0
P-Value: 0.002568312913140499


**Null hypothesis ($H_0$):** ratings of comments that have a good chance of repurchase = ratings of comments that have a bad chance of repurchase

**Alternative hypothesis ($H9$):** ratings of comments that have a good chance of repurchase = ratings of comments that have a bad chance of repurchase

In [62]:
comments_good_chance_repurchase = all_products[all_products['chance_of_repurchase'] == True]
comments_bad_chance_repurchase = all_products[all_products['chance_of_repurchase'] == False]

In [63]:
mean_rating_good_chance_repurchase = all_products[all_products['chance_of_repurchase'] == True].rating.mean()
mean_rating_bad_chance_repurchase = all_products[all_products['chance_of_repurchase'] == False].rating.mean()

print(mean_rating_good_chance_repurchase)
print(mean_rating_bad_chance_repurchase)

4.5
2.9


In [64]:
st.ttest_ind(comments_good_chance_repurchase['rating'], comments_bad_chance_repurchase['rating'])

#Since the p-value is lower than 0.05, we can say that comments that mencioned a good chance 
# of repurchase have a significant higher rating that the ones that that mencioned a bad chance 
#of repurchase

TtestResult(statistic=6.227991553292183, pvalue=9.931363569046837e-07, df=28.0)

**Null hypothesis ($H_0$):** comments about similarity with the original product have no relation with the type of comment about taste

**Alternative hypothesis ($H10$):** comments about similarity with the original product have association with the type of comment about taste

In [74]:
contingency_table = pd.crosstab(all_products['similarity_with_original_meat_product'], all_products['taste'], margins=True, margins_name="Total")

# Extract the values from the contingency table
a = contingency_table.loc['high', 'bad']
b = contingency_table.loc['high', 'good']
c = contingency_table.loc['low', 'bad']
d = contingency_table.loc['low', 'good']

# Output the values
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")
print(f"d: {d}")

a: 1
b: 33
c: 2
d: 2


In [71]:
# Contigency table:
# Similarity        Taste
#            bad           good            
# high         a             b
# low          c             d

# Replace these values with your actual data
a = 1 # Count in Category1 and Group1
b = 33  # Count in Category1 and Group2
c = 2  # Count in Category2 and Group1
d = 2 # Count in Category2 and Group2

# Create a 2x2 contingency table
contingency_table = [[a, b], [c, d]]

# Perform Fisher's Exact Test
odds_ratio, p_value = fisher_exact(contingency_table)

# Output the results
print("Odds Ratio:", odds_ratio)
print("P-Value:", p_value)

#Since the p-value is lower than 0.05, we can say that comments about similarity with the original product have 
# association with the type of comment about taste. 

Odds Ratio: 0.030303030303030304
P-Value: 0.024656235182550973


**Null hypothesis ($H_0$):** comments about similarity with the original product have no relation with the type of comment about texture

**Alternative hypothesis ($H11$):** comments about similarity with the original product have association with the type of comment about texture

In [85]:
contingency_table = pd.crosstab(all_products['similarity_with_original_meat_product'], all_products['texture'], margins=True, margins_name="Total")

# Extract the values from the contingency table
a = contingency_table.loc['high', 'bad']
b = contingency_table.loc['high', 'good']
c = contingency_table.loc['low', 'bad']
d = contingency_table.loc['low', 'good']

# Output the values
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")
print(f"d: {d}")

a: 0
b: 24
c: 2
d: 1


In [86]:
# Contigency table:
# Similarity        Texture
#            bad           good            
# high         a             b
# low          c             d

# Replace these values with your actual data
a = 0 # Count in Category1 and Group1
b = 24  # Count in Category1 and Group2
c = 2  # Count in Category2 and Group1
d = 1 # Count in Category2 and Group2

# Create a 2x2 contingency table
contingency_table = [[a, b], [c, d]]

# Perform Fisher's Exact Test
odds_ratio, p_value = fisher_exact(contingency_table)

# Output the results
print("Odds Ratio:", odds_ratio)
print("P-Value:", p_value)

#Since the p-value is lower than 0.05, we can say that comments about similarity with the original product have 
# association with the type of comment about texture.  

Odds Ratio: 0.0
P-Value: 0.008547008547008548
