In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind


In [38]:
df = pd.read_csv("data.csv")
df = df.dropna()

In [39]:
# convert the string column to a date object
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

In [40]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', ' Income ',
       'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines',
       'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Response', 'Complain', 'Country'],
      dtype='object')

In [41]:
# calculate the range of years in the date column
min_year = df['Dt_Customer'].dt.year.min()
max_year = df['Dt_Customer'].dt.year.max()
print(min_year,max_year)

2012 2014


In [42]:
# filter the DataFrame to include only rows with dates between 2013 and 2014
mask = (df['Dt_Customer'].dt.year >= 2013) & (df['Dt_Customer'].dt.year <= 2014)
filtered_df = df[mask]

In [43]:
# calculate the average amount spent on gold in the last 2 years
avg_gold_spend = filtered_df["MntGoldProds"].mean()
avg_gold_spend

41.33082271147161

In [44]:
# create a new column indicating whether each customer spent above or below the average amount on gold
filtered_df["above_avg_gold"] = filtered_df["MntGoldProds"] > avg_gold_spend
filtered_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["above_avg_gold"] = filtered_df["MntGoldProds"] > avg_gold_spend


Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country,above_avg_gold
0,1826,1970,Graduation,Divorced,"$84,835.00",0,0,2014-06-16,0,189,...,1,0,0,0,0,0,1,0,SP,True
1,1,1961,Graduation,Single,"$57,091.00",0,0,2014-06-15,0,464,...,5,0,0,0,0,1,1,0,CA,False
2,10476,1958,Graduation,Married,"$67,267.00",0,1,2014-05-13,0,134,...,2,0,0,0,0,0,0,0,US,False
3,1386,1967,Graduation,Together,"$32,474.00",1,1,2014-05-11,0,10,...,7,0,0,0,0,0,0,0,AUS,False
4,5371,1989,Graduation,Single,"$21,474.00",1,0,2014-04-08,0,6,...,7,1,0,0,0,0,1,0,SP,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2232,7232,1973,Graduation,Widow,"$42,429.00",0,1,2014-02-11,99,55,...,5,0,0,0,0,0,0,0,SP,False
2233,7829,1900,2n Cycle,Divorced,"$36,640.00",1,0,2013-09-26,99,15,...,5,0,0,0,0,0,0,1,IND,False
2234,9977,1973,Graduation,Divorced,"$78,901.00",0,1,2013-09-17,99,321,...,4,0,0,0,0,0,0,0,US,False
2235,10142,1976,PhD,Divorced,"$66,476.00",0,1,2013-03-07,99,372,...,4,0,0,0,0,0,0,0,US,True


In [9]:
# group the dataset by the above/below average gold spend and calculate the mean number of in-store purchases for each group
grouped = filtered_df.groupby("above_avg_gold")["NumStorePurchases"].mean()
grouped

above_avg_gold
False    4.747687
True     7.776536
Name: NumStorePurchases, dtype: float64

In [10]:
Total_purchase_amount = filtered_df.groupby("above_avg_gold")["MntGoldProds"].sum()

In [11]:
Total_purchase_amount

above_avg_gold
False    17376
True     53961
Name: MntGoldProds, dtype: int64

In [23]:
# conduct a two-sample t-test to determine whether there is a statistically significant difference in the mean number of in-store purchases between the above and below average gold spend groups
t_stat, p_val = ttest_ind(filtered_df.loc[filtered_df["above_avg_gold"], "NumStorePurchases"], filtered_df.loc[~filtered_df["above_avg_gold"], "NumStorePurchases"])

if p_val < 0.05:
    print("There is a statistically significant difference in the mean number of in-store purchases between the above and below average gold spend groups as the p_value {} is less than 0.05.".format(p_val))
else:
    print("There is no statistically significant difference in the mean number of in-store purchases between the above and below average gold spend groups as the p_value {} is greater than 0.05.".format(p_val))
print(t_stat)

There is a statistically significant difference in the mean number of in-store purchases between the above and below average gold spend groups as the p_value 7.470063213121885e-80 is less than 0.05.
19.951812724542737


In [25]:
from sklearn.feature_selection import f_classif
import scipy.stats as stats


In [26]:
# perform one-way ANOVA test
f_stat, p_val = stats.f_oneway(filtered_df.loc[filtered_df["above_avg_gold"], "NumStorePurchases"], filtered_df.loc[~filtered_df["above_avg_gold"], "NumStorePurchases"])


In [27]:
# Print the results
print("F-test results:")
print("f-statistic: {:.3f}".format(f_stat))
print("P-value: {:.3f}".format(p_val))


F-test results:
f-statistic: 398.075
P-value: 0.000
