In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, pearsonr, f_oneway, chi2_contingency, shapiro
from sklearn.linear_model import LinearRegression

# Apply seaborn's white-grid style to every figure drawn below.
sns.set_style("whitegrid")

# Load the dataset, convert the date column to datetimes, and derive a
# string month label from the monthly period of each date.
df = pd.read_csv("week7_statistical_dataset.csv")
df = df.assign(Date=pd.to_datetime(df['Date']))
df = df.assign(Month=df['Date'].dt.to_period("M").astype(str))

# Quick sanity check: dimensions first, then the leading rows as cell output.
print(df.shape)
df.head()


In [None]:

# Summary statistics with one row per numeric column.
display(df.describe().transpose())

# Per-column count of missing values, preceded by a blank line and a header.
print("\nMissing values:", df.isnull().sum(), sep="\n")


In [None]:

# Histogram of Sales with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(data=df, x='Sales', kde=True, ax=ax)
ax.set_title("Sales Distribution")
fig.tight_layout()
plt.show()


In [None]:

# Pearson correlation between marketing spend and revenue, with its p-value.
spend = df['MarketingSpend']
revenue = df['Revenue']
corr, p_value = pearsonr(spend, revenue)
print(f"Correlation: {corr}")
print(f"P-value: {p_value}")

# Scatter of the same two columns to eyeball the relationship.
ax = sns.scatterplot(x=spend, y=revenue)
ax.set_title("Marketing vs Revenue")
plt.show()


In [None]:

# Split Sales by channel using a single .loc lookup per group.
online = df.loc[df['Channel'].eq('Online'), 'Sales']
offline = df.loc[df['Channel'].eq('Offline'), 'Sales']

# Welch's t-test: equal_var=False does not assume equal group variances.
t_stat, p_val = ttest_ind(online, offline, equal_var=False)
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_val}")


In [None]:

# 95% confidence interval for the mean of Sales (normal approximation).
Z_95 = 1.96  # two-sided 95% critical value of the standard normal distribution

sales = df['Sales']
mean_sales = sales.mean()
std_sales = sales.std()
# Count only non-missing observations. mean() and std() skip NaN, so using
# len(df) would overstate n and make the interval too narrow whenever Sales
# has missing values.
n = sales.count()

margin = Z_95 * (std_sales / np.sqrt(n))
ci_low = mean_sales - margin
ci_high = mean_sales + margin

print("95% CI:", (ci_low, ci_high))


In [None]:

# Simple linear regression: Revenue as a function of MarketingSpend.
X = df[['MarketingSpend']]  # double brackets keep X two-dimensional for sklearn
y = df['Revenue']

lr = LinearRegression()
lr.fit(X, y)

print("Linear Regression Intercept:", lr.intercept_)
print("Slope:", lr.coef_[0])

# In-sample predictions, used only to draw the fitted line over the data.
pred = lr.predict(X)

plt.figure(figsize=(7,5))
sns.scatterplot(x=df['MarketingSpend'], y=df['Revenue'])
plt.plot(df['MarketingSpend'], pred, color='red')
# The arrow is written as an escape so it cannot be mis-decoded again
# (the title previously contained the mojibake "â†’").
plt.title("Linear Regression: Marketing \u2192 Revenue")
plt.show()


In [None]:

# One-way ANOVA: does mean Sales differ across regions?
# Drop missing region labels before building the groups. unique() includes
# NaN, and comparing against NaN matches no rows, so a missing Region would
# otherwise add an empty group and make f_oneway return NaN.
regions = df['Region'].dropna().unique()
groups = [df.loc[df['Region'] == r, 'Sales'] for r in regions]
f_stat, p_val_anova = f_oneway(*groups)

print("ANOVA F-stat:", f_stat)
print("ANOVA P-value:", p_val_anova)


In [None]:

# Chi-square test of independence between Channel and Category.
# The contingency table has Channel values as rows and Category values as columns.
table = pd.crosstab(index=df['Channel'], columns=df['Category'])
chi2, p_chi, dof, expected = chi2_contingency(table)

print(f"Chi-square: {chi2}")
print(f"P-value: {p_chi}")
print(f"Degrees of freedom: {dof}")


In [None]:

# Multiple linear regression: Sales explained by MarketingSpend and Revenue.
features = ['MarketingSpend', 'Revenue']
X = df.loc[:, features]
y = df.loc[:, 'Sales']

mlr = LinearRegression().fit(X, y)  # fit() returns the fitted estimator

print("MLR Intercept:", mlr.intercept_)
# Coefficients come back in the same order as the feature columns.
for name, weight in zip(features, mlr.coef_):
    print(f"{name} coefficient:", weight)


In [None]:

# Shapiro-Wilk normality test on Sales.
sales_values = df['Sales']
stat, p_norm = shapiro(sales_values)
print(f"Shapiro Statistic: {stat}")
print(f"P-value: {p_norm}")


In [None]:

# Cohen's d effect size for Online vs Offline sales.
# Relies on the `online` and `offline` Series built in the t-test cell.
mean1 = online.mean()
mean2 = offline.mean()

# Pooled standard deviation, weighting each group's variance by its degrees
# of freedom. A plain average of the two variances is only correct when both
# groups have the same size. count() ignores NaN, matching mean() and var().
n1 = online.count()
n2 = offline.count()
std_pooled = np.sqrt(
    ((n1 - 1) * online.var() + (n2 - 1) * offline.var()) / (n1 + n2 - 2)
)

cohens_d = (mean1 - mean2) / std_pooled
print("Cohen's d:", cohens_d)
