## Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Loading the data

In [2]:
import pandas as pd

# Use a raw string to specify the file path
file_path = r'C:/Users/HP/Downloads/4.10.Hypothesis-testing-section-practical-example.csv'

# Read the CSV file
df = pd.read_csv(file_path, skiprows = 3)

# Now you can work with the 'df' DataFrame
df.drop(['Unnamed: 0','Unnamed: 11'], axis = 1, inplace = True)

In [3]:
df

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Bold,Caroline,63,Female,United States,White,7/2/2012,Executive Office,President & CEO,"$166,400.00"
1,Zamora,Jennifer,38,Female,United States,White,4/10/2010,IT/IS,CIO,"$135,200.00"
2,Houlihan,Debra,51,Female,United States,White,5/5/2014,Sales,Director of Sales,"$124,800.00"
3,Bramante,Elisa,34,Female,United States,Black or African American,1/5/2009,Production,Director of Operations,"$124,800.00"
4,Del Bosque,Keyla,38,Female,United States,Black or African American,1/9/2012,Software Engineering,Software Engineer,"$118,809.60"
...,...,...,...,...,...,...,...,...,...,...
169,Osturnka,Adeel,41,Male,United States,White,9/30/2013,Production,Production Technician I,"$33,280.00"
170,Punjabhi,Louis,56,Male,United States,White,1/6/2014,Production,Production Technician I,"$33,280.00"
171,Cockel,James,40,Male,United States,White,7/8/2013,Production,Production Technician I,"$31,200.00"
172,Gordon,David,38,Male,United States,White,7/2/2012,Production,Production Technician I,"$31,200.00"


## Is the company baised? Is there any wage gap between genders?

#### The company claims that there isn't any wage gap between genders so 
##### Ho : Do = 0
##### H1 : Do is not equal to 0

In [4]:
# Separating the females from males 

females = pd.DataFrame()
females = df[df['Gender'] == 'Female']
print('Total female employee: {0}'.format(len(females)))

males = pd.DataFrame()
males = df[df['Gender'] == 'Male']
print('Total male employee: {0}'.format(len(males)))

Total female employee: 98
Total male employee: 76


In [5]:
# Cleaning the salary column in both new dataframes

females.loc[:,'Salary'] = females['Salary'].str.replace(r'[\$,]', '', regex = True).str.split('.').str.get(0)
males.loc[:,'Salary'] = males['Salary'].str.replace(r'[\$,]', '', regex = True).str.split('.').str.get(0)

In [6]:
# Converting the data type of salary from object to numeric 

females.loc[:,'Salary'] = females['Salary'].astype(float)
males.loc[:,'Salary'] = males['Salary'].astype(float)

In [12]:
# Calculating the mean salary of both the genders 

f_mean = females['Salary'].mean()
m_mean = males['Salary'].mean()
print('The population mean of females: {0}'.format(f_mean))
print('The population mean of males: {0}'.format(m_mean))

The population mean of females: 65736.88775510204
The population mean of males: 72300.51315789473


In [16]:
# Calculating the variances of both the genders

v_female = females['Salary'].var(ddof = 1)
v_male = males['Salary'].var(ddof = 1)
print('The population variance of females: {0}'.format(v_female))
print('The population variance of males: {0}'.format(v_male))

The population variance of females: 1097617147.1109824
The population variance of males: 1241432517.293158


In [20]:
# Calculating the pooled variance 

pooled_variance = ((len(females) -1)*v_female + (len(males) -1)*v_male)/(len(females) + len(males) - 2)
pooled_variance

1160327337.5973961

In [24]:
# Now calculating the t score

t_score = (f_mean - m_mean) / np.sqrt(pooled_variance * (1/len(females) + 1/len(males)))
t_score

-1.2606622108846948

In [25]:
import scipy.stats as stats

In [28]:
degree_of_freedom = len(females) + len(males) - 2
p_value = stats.t.cdf(t_score, df = degree_of_freedom)
print('p_value for this scenario: {0}'.format(p_value))

p_value for this scenario: 0.10456901076481802


#### The value concludes that the null hypothesis is not rejectable as it is greater than any of the testing percentages that is the value of alpha and thus there is not any wage gap in the company

## Lets calculate the confidence interval

In [34]:
# calculating value of t statistics for 95% confidence

t_statistics = stats.t.ppf(1 - (1 - 0.05)/2, df = degree_of_freedom)

lower_bound = (f_mean - m_mean) - (t_statistics * (np.sqrt(pooled_variance * (1/len(females) + 1/len(males)))))
upper_bound = (f_mean - m_mean) + (t_statistics * (np.sqrt(pooled_variance * (1/len(females) + 1/len(males)))))

print('The upper bound of confidence interval is: {0}'.format(upper_bound))
print('The lower bound of confidence interval is: {0}'.format(lower_bound))

The upper bound of confidence interval is: -6236.666426456145
The lower bound of confidence interval is: -6890.584379129238
