In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
df = pd.read_csv('salespeople.csv')
df.head(10)

Unnamed: 0,promoted,sales,customer_rate,performance
0,0,594.0,3.94,2.0
1,0,446.0,4.06,3.0
2,1,674.0,3.83,4.0
3,0,525.0,3.62,2.0
4,1,657.0,4.4,3.0
5,1,918.0,4.54,2.0
6,0,318.0,3.09,3.0
7,0,364.0,4.89,1.0
8,0,342.0,3.74,3.0
9,0,387.0,3.0,3.0


In [3]:
high_performing = df[df['performance']==4]
low_performing = df[df['performance']==1]

In [4]:
d = high_performing['sales'].mean() - low_performing['sales'].mean()
d

np.float64(154.9742424242424)

In [5]:
t_test = stats.ttest_ind(high_performing.sales, low_performing.sales,equal_var=False, alternative="greater")
print(t_test)

TtestResult(statistic=np.float64(4.629477606844271), pvalue=np.float64(5.466221730788518e-06), df=np.float64(100.9768911762055))


In [6]:
hp_smean = high_performing['sales'].mean()
lp_smean = low_performing['sales'].mean()
hp_svar = high_performing['sales'].var()
lp_svar = low_performing['sales'].var()
sample_error = 2*np.sqrt(hp_svar/high_performing.size + lp_svar/low_performing.size)
sample_error

np.float64(33.47553559717553)

In [7]:
hp_svar/high_performing.size

np.float64(182.16610651974287)

In [8]:
stat = 0
for s in high_performing.sales:
    stat += (s - hp_smean)**2
stat

np.float64(2164133.3454545457)

In [9]:
def welch_dof(sample1, sample2):
    n1, n2 = len(sample1), len(sample2)
    s1, s2 = np.std(sample1, ddof=1), np.std(sample2, ddof=1)

    numerator = (s1 ** 2 / n1 + s2 ** 2 / n2) ** 2
    denominator = (s1 ** 2 / n1) ** 2 / (n1 - 1) + (s2 ** 2 / n2) ** 2 / (n2 - 1)

    return numerator / denominator

In [10]:
dof = welch_dof(high_performing.sales, low_performing.sales)
dof

np.float64(100.97689117620553)

In [11]:
stats.t.cdf(x=0, df=dof, loc=d, scale=sample_error)

np.float64(5.466221730788519e-06)

In [12]:
(0-d)/sample_error

np.float64(-4.629477606844271)

In [13]:
cleansed = df.dropna()
sample_rho = stats.pearsonr(cleansed.sales, cleansed.customer_rate)
print(sample_rho)

PearsonRResult(statistic=np.float64(0.33780504485867807), pvalue=np.float64(8.64795221209228e-11))


In [14]:
contingency = pd.crosstab(df.promoted, df.performance)
contingency

performance,1.0,2.0,3.0,4.0
promoted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,50,85,77,25
1,10,25,48,30


In [15]:
stats.chi2_contingency(contingency)

Chi2ContingencyResult(statistic=np.float64(25.895405268094862), pvalue=np.float64(1.0030629464566802e-05), dof=3, expected_freq=array([[40.62857143, 74.48571429, 84.64285714, 37.24285714],
       [19.37142857, 35.51428571, 40.35714286, 17.75714286]]))