In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
# Fetch the maternity-leave resource page and parse it into a BeautifulSoup object.
url = "https://fairygodboss.com/maternity-leave-resource-center"
# NOTE(review): the page is requested with a body-less POST, exactly as before;
# a plain GET may be what was intended -- confirm against the site.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.post(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "html.parser")

In [41]:
# Scrape company information: build one list of text fields per
# <a class="comp_page"> element found in the parsed page.
collect_cos = []
for line in soup.find_all('a', attrs={'class': 'comp_page'}):
    try:
        # Filter out bare line-break children and strip extra white space
        # from the text of every remaining child.
        co = [child.text.strip() for child in line.children if str(child) != '\n']
    except Exception as err:
        # Best-effort scrape: skip an entry that cannot be parsed, but report
        # it so a dropped company is visible instead of silently lost.
        print("Skipping company entry that could not be parsed: {0}".format(err))
        continue

    # Append company to list.
    collect_cos.append(co)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Number of companies scraped: {0}".format(len(collect_cos)))

Number of companies scraped: 727


In [42]:
# Turn the scraped rows into a DataFrame with one named column per field.
df = pd.DataFrame(
    data=collect_cos,
    columns=['company', 'industry', 'paid', 'unpaid'],
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,company,industry,paid,unpaid
0,Bill and Melinda Gates Foundation,Nonprofit,52 weeks,
1,Netflix,Technology: Consumer Internet,52 weeks,2 weeks
2,"Automattic, Inc.",Technology: Consumer Internet,32 weeks,0 weeks
3,Adobe Systems,Technology: Software,26 weeks,6 weeks
4,FireEye Inc.,Technology: Security,22 weeks,


In [43]:
# Question 1: how many different industry types do we have?
# Counts the distinct raw values in the industry column.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Industries: {0}".format(len(df.industry.unique())))

Industries: 107


In [44]:
# Question 2: are paid/unpaid time measured in units besides weeks?
# Prints one warning for every paid/unpaid cell that is neither "N/A" nor
# contains the word "weeks"; no output means every recorded value uses weeks.
for index, row in df.iterrows():
    # Check paid first, then unpaid, for each row.
    for column in ('paid', 'unpaid'):
        value = row[column]
        if value != "N/A" and "weeks" not in value:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Measured in units that are not weeks!")

In [45]:
# Problem 1: convert paid and unpaid to numeric values

def extract_num(val):
    """Convert a leave-duration string such as "52 weeks" to an integer.

    Parameters
    ----------
    val : str or None
        Raw scraped value: "N/A" for missing data, otherwise a number
        followed by a unit (e.g. "12 weeks").

    Returns
    -------
    int or None
        The leading number, or None when the value is missing
        (None, NaN, "N/A" or a blank string).

    Raises
    ------
    ValueError
        If the first whitespace-separated token is not an integer.
    """
    # `val != val` is only true for NaN, which pandas uses for missing cells.
    if val is None or val != val or val == "N/A":
        return None

    # Take the first token instead of slicing off a fixed 6-character suffix,
    # so "1 week" or stray surrounding whitespace no longer breaks parsing.
    tokens = val.split()
    if not tokens:
        return None
    return int(tokens[0])

# Derive numeric leave columns from the textual paid/unpaid columns.
df['paid_leave'] = df['paid'].map(extract_num)
df['unpaid_leave'] = df['unpaid'].map(extract_num)

In [46]:
# Problem 2: clean industry field

def clean_industry(val):
    """Map a raw industry label to a cleaned, coarser industry category.

    Hand-written replacements are applied first; any other label is cut at
    its first colon, e.g. "Technology: Software" -> "Technology".
    """
    # (raw label, replacement) pairs, checked in order.
    replacements = (
        ("N/A", "Unspecified"),  # 'Unspecified' captures all null values
        ("akqaa", "Technology"),  # company AKQA is coded with industry akqaa
        # combine Auto parts and Auto parts manufacturing
        ("Auto parts manufacturing", "Auto manufacturers"),
        ("Auto parts", "Auto manufacturers"),
    )
    for raw_label, cleaned_label in replacements:
        if val == raw_label:
            return cleaned_label

    # Reduce the number of industries by merging into the base industry.
    base_industry = val.partition(':')[0]
    return base_industry
    
# Store the cleaned industry label alongside the raw one.
df['industry_parsed'] = df['industry'].map(clean_industry)

In [47]:
# Question 1 (revisited): how many different industry types do we have?
# Industries inputted as "N/A" are now part of "Unspecified" category
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Industries (parsed): {0}".format(len(df.industry_parsed.unique())))

Industries (parsed): 37


In [48]:
# Industry categories: number of companies per parsed industry,
# keeping null entries in the tally.
df['industry_parsed'].value_counts(dropna=False)

Unspecified                     103
Technology                       92
Legal services                   62
Finance                          55
Retail                           46
Natural Resources                32
FMCG                             31
Educational Services             30
Insurance                        28
Healthcare                       26
Media                            25
Consulting services              19
Pharmaceutical                   17
Nonprofit                        17
Industrial                       15
Transportation                   15
Information Services             13
Hospitality                      12
Telecommunications               11
Business Services                11
Advertising                      10
Auto manufacturers                8
Government                        8
Aerospace                         6
Accounting services               5
Real Estate Company               4
Publishing                        4
Conglomerate                

In [49]:
# Ten companies with the longest paid leave.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
df.sort_values('paid_leave', ascending=False).head(10)

Unnamed: 0,company,industry,paid,unpaid,paid_leave,unpaid_leave,industry_parsed
0,Bill and Melinda Gates Foundation,Nonprofit,52 weeks,,52,,Nonprofit
1,Netflix,Technology: Consumer Internet,52 weeks,2 weeks,52,2.0,Technology
2,"Automattic, Inc.",Technology: Consumer Internet,32 weeks,0 weeks,32,0.0,Technology
3,Adobe Systems,Technology: Software,26 weeks,6 weeks,26,6.0,Technology
4,FireEye Inc.,Technology: Security,22 weeks,,22,,Technology
5,"Orrick, Herrington & Sutcliffe LLP",Legal services,22 weeks,,22,,Legal services
6,Avaya,Technology: B2B Tech Services,22 weeks,,22,,Technology
8,Debevoise & Plimpton LLP,Legal services,20 weeks,,20,,Legal services
9,Microsoft,Technology: Software,20 weeks,0 weeks,20,0.0,Technology
7,Twitter,Technology: Consumer Internet,20 weeks,,20,,Technology


In [50]:
# Problem 3: dropping ALL null values decreases observations from 727 to 325! Let's not do that.
# First line: total rows; second line: rows with no null in any column.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(df))
print(len(df.dropna()))

727
325


In [51]:
# Group the companies by their cleaned industry label.
gb = df.groupby(by='industry_parsed')

In [52]:
# Export average paid/unpaid leave (in weeks) per industry to csv.
# Columns are selected with a list: selecting groupby columns with a bare
# tuple of keys is no longer accepted by current pandas.
gb[['paid_leave', 'unpaid_leave']].mean().to_csv("industry_means.csv")

In [53]:
# Mean leave and number of non-null observations per industry.
# Columns are selected with a list: selecting groupby columns with a bare
# tuple of keys is no longer accepted by current pandas.
gb[['paid_leave', 'unpaid_leave']].agg(['mean', 'count'])

Unnamed: 0_level_0,paid_leave,paid_leave,unpaid_leave,unpaid_leave
Unnamed: 0_level_1,mean,count,mean,count
industry_parsed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Accounting services,10.0,5,8.5,2
Advertising,6.8,10,5.75,4
Aerospace,5.0,6,9.0,4
Auto dealers,8.0,2,3.5,2
Auto manufacturers,6.75,4,14.857143,7
Business Services,5.375,8,10.111111,9
Conglomerate,7.75,4,2.666667,3
Consulting services,10.421053,19,10.454545,11
Diversified industrials,6.0,2,12.0,2
Educational Services,4.24,25,10.782609,23


In [54]:
# include count