In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
# generate BeautifulSoup object 
url = "https://fairygodboss.com/maternity-leave-resource-center"
response = requests.post(url)
page = response.text
soup = BeautifulSoup(page,"html.parser")

In [20]:
# scrape company information
collect_cos = []
for line in soup.findAll('a',attrs = {'class':'comp_page'}):
    
    try:
        #filter out line breaks and remove extra white space
        co = [i.text.strip() for i in line.children if str(i) not in ['\n']] 
        
        #append company to list
        collect_cos.append(co)
        
    except:
        "something's wrong"
        continue
        
print "Number of companies scraped: {0}".format(len(collect_cos))

Number of companies scraped: 712


In [21]:
# create pandas dataframe object
df = pd.DataFrame(collect_cos, columns = ['company','industry','paid','unpaid'])

df.head()

Unnamed: 0,company,industry,paid,unpaid
0,Bill and Melinda Gates Foundation,Nonprofit,52 weeks,
1,Netflix,Technology: Consumer Internet,52 weeks,2 weeks
2,"Automattic, Inc.",,32 weeks,0 weeks
3,Adobe Systems,Technology: Software,26 weeks,6 weeks
4,FireEye Inc.,Technology: Security,22 weeks,


In [22]:
# Question 1: how many different industry types do we have?

print "Industries: {0}".format(len(df.industry.unique()))

Industries: 106


In [23]:
# Question 2: are paid/unpaid time measured in units besides weeks?

for i,row in df.iterrows():
    if row['paid']!="N/A":
        if "weeks" not in row['paid']:
            print "Measured in units that are not weeks!"
    if row['unpaid']!="N/A":
        if "weeks" not in row['unpaid']:
            print "Measured in units that are not weeks!"

In [24]:
# Problem 1: convert paid and unpaid to numeric values

def extract_num(val):
    if val == "N/A":
        return None
    else:
        return int(val[:-6])

df['paid_leave'] = df['paid'].map(lambda x: extract_num(x))
df['unpaid_leave'] = df['unpaid'].map(lambda x: extract_num(x))

In [25]:
# Problem 2: reduce number of industries by merging by base industry 
# eg. Technology: Software -> Technology

def extract_base_industry(val):
    if val == "N/A":
        return None
    else:
        return val.split(':')[0]
    
df['industry_parsed'] = df['industry'].map(lambda x: extract_base_industry(x))

In [26]:
# Question 1 (revisited): how many different industry types do we have?
# does not include null values

print "Industries (parsed): {0}".format(len(df.industry_parsed.unique()))

Industries (parsed): 39


In [27]:
# Industry categories

df.industry_parsed.value_counts(dropna=False)

NaN                             94
Technology                      87
Legal services                  62
Finance                         55
Retail                          46
Natural Resources               32
FMCG                            31
Educational Services            30
Insurance                       28
Healthcare                      25
Media                           25
Consulting services             19
Nonprofit                       17
Pharmaceutical                  17
Industrial                      15
Transportation                  15
Information Services            13
Hospitality                     12
Telecommunications              11
Business Services               11
Advertising                     10
Government                       8
Auto manufacturers               6
Aerospace                        6
Accounting services              5
Conglomerate                     4
Real Estate Company              4
Pharmacies and Drug Stores       4
Publishing          

In [28]:
df.head()

Unnamed: 0,company,industry,paid,unpaid,paid_leave,unpaid_leave,industry_parsed
0,Bill and Melinda Gates Foundation,Nonprofit,52 weeks,,52,,Nonprofit
1,Netflix,Technology: Consumer Internet,52 weeks,2 weeks,52,2.0,Technology
2,"Automattic, Inc.",,32 weeks,0 weeks,32,0.0,
3,Adobe Systems,Technology: Software,26 weeks,6 weeks,26,6.0,Technology
4,FireEye Inc.,Technology: Security,22 weeks,,22,,Technology


In [29]:
# Problem 3: dropping ALL null values decreases observations from 710 to 272!

print len(df)
print len(df.dropna())

712
273


In [30]:
gb = df.groupby('industry_parsed')

In [31]:
gb['paid_leave','unpaid_leave'].mean().to_csv("industry_means.csv")

In [32]:
gb['paid_leave','unpaid_leave'].mean()

Unnamed: 0_level_0,paid_leave,unpaid_leave
industry_parsed,Unnamed: 1_level_1,Unnamed: 2_level_1
Accounting services,10.0,8.5
Advertising,6.8,5.75
Aerospace,6.0,9.0
Auto dealers,8.0,3.5
Auto manufacturers,6.75,16.0
Auto parts,,12.0
Auto parts manufacturing,,12.0
Business Services,5.375,10.111111
Conglomerate,7.75,2.666667
Consulting services,10.421053,10.454545
