In [3]:
import numpy as np
import pandas as pd

import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import f_oneway
from scipy.stats import chisquare
from scipy.stats import chi2_contingency

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.descriptivestats import Description

In [4]:
data = pd.read_csv("sales.csv")

Linear Regression

In [3]:
data.head()

Unnamed: 0,date,warehouse,client_type,product_line,quantity,unit_price,total,payment
0,1/6/2021,Central,Retail,Miscellaneous,8,16.85,134.83,Credit card
1,1/6/2021,North,Retail,Breaking system,9,19.29,173.61,Cash
2,1/6/2021,North,Retail,Suspension & traction,8,32.93,263.45,Credit card
3,1/6/2021,North,Wholesale,Frame & body,16,37.84,605.44,Transfer
4,1/6/2021,Central,Retail,Engine,2,60.48,120.96,Credit card


In [4]:
formula = "total ~ unit_price + quantity"
lm = ols(formula, data).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  total   R-squared:                       0.874
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     3465.
Date:                Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                        08:21:45   Log-Likelihood:                -6226.0
No. Observations:                1000   AIC:                         1.246e+04
Df Residuals:                     997   BIC:                         1.247e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -291.2467     10.900    -26.719      0.0

In [5]:
data.dtypes

date             object
warehouse        object
client_type      object
product_line     object
quantity          int64
unit_price      float64
total           float64
payment          object
dtype: object

ANOVA

In [6]:
formula = "total ~ warehouse + client_type + warehouse:client_type"
data_lm = ols(formula,data = data).fit()
table = sm.stats.anova_lm(data_lm, typ=2)
print(table)



                             sum_sq     df           F         PR(>F)
warehouse              1.254027e+05    2.0    0.931220   3.944161e-01
client_type            5.126056e+07    1.0  761.305381  6.945700e-125
warehouse:client_type  6.964879e+05    2.0    5.172008   5.826833e-03
Residual               6.692846e+07  994.0         NaN            NaN


In [7]:
formula = "total ~ C(warehouse)+C(client_type)+C(warehouse):C(client_type)"
data_lm = ols(formula,data = data).fit()
table = sm.stats.anova_lm(data_lm, typ=2)
print(table)


                                   sum_sq     df           F         PR(>F)
C(warehouse)                 1.254027e+05    2.0    0.931220   3.944161e-01
C(client_type)               5.126056e+07    1.0  761.305381  6.945700e-125
C(warehouse):C(client_type)  6.964879e+05    2.0    5.172008   5.826833e-03
Residual                     6.692846e+07  994.0         NaN            NaN


T Test

In [8]:
r = data["warehouse"].unique()
r

array(['Central', 'North', 'West'], dtype=object)

In [9]:
x = data.loc[data["client_type"] == "Retail", "total"]
y = data.loc[data["client_type"] == "Wholesale", "total"]
x, y

(0      134.83
 1      173.61
 2      263.45
 4      120.96
 6       54.41
         ...  
 989    280.27
 993    170.21
 994    184.42
 995    295.83
 998    241.23
 Name: total, Length: 775, dtype: float64,
 3       605.44
 5      1494.80
 8      1579.87
 36      815.22
 40      588.64
         ...   
 991     231.20
 992     920.37
 996     320.96
 997     393.64
 999     548.13
 Name: total, Length: 225, dtype: float64)

In [10]:
stats.ttest_ind(x,y)

Ttest_indResult(statistic=-27.492951889501615, pvalue=2.4937172679086875e-124)

One way anova

In [11]:
data["warehouse"].unique()

array(['Central', 'North', 'West'], dtype=object)

In [12]:
a = data.loc[data["warehouse"] == "Central", "total"]
b = data.loc[data["warehouse"] == "North", "total"]
c = data.loc[data["warehouse"] == "West", "total"]

In [13]:
f_oneway(a, b, c)

F_onewayResult(statistic=0.7439019539119799, pvalue=0.47551947059515853)

Correlation

In [14]:
data.head()

Unnamed: 0,date,warehouse,client_type,product_line,quantity,unit_price,total,payment
0,1/6/2021,Central,Retail,Miscellaneous,8,16.85,134.83,Credit card
1,1/6/2021,North,Retail,Breaking system,9,19.29,173.61,Cash
2,1/6/2021,North,Retail,Suspension & traction,8,32.93,263.45,Credit card
3,1/6/2021,North,Wholesale,Frame & body,16,37.84,605.44,Transfer
4,1/6/2021,Central,Retail,Engine,2,60.48,120.96,Credit card


In [15]:
p = pearsonr(data["total"],data["unit_price"])
p

PearsonRResult(statistic=0.3729422796236053, pvalue=2.36002148210046e-34)

Chi-Square

In [25]:
f = pd.crosstab(index = data["warehouse"], columns = data["client_type"], margins=True)
f

client_type,Retail,Wholesale,All
warehouse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central,371,109,480
North,263,77,340
West,141,39,180
All,775,225,1000


In [14]:
value = np.array(f)

In [16]:
value

array([[ 371,  109,  480],
       [ 263,   77,  340],
       [ 141,   39,  180],
       [ 775,  225, 1000]], dtype=int64)

In [19]:
chisquare(f)

Power_divergenceResult(statistic=array([585.00903226, 171.83111111, 756.8       ]), pvalue=array([1.79128438e-126, 5.12088498e-037, 1.01151283e-163]))

In [24]:
from scipy.stats import chi2_contingency 
chi2_contingency(value)[0:3]

(0.08784875957551479, 0.9999863329643959, 6)