In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Fetch the maternity-leave resource page and parse it into a BeautifulSoup tree.
url = "https://fairygodboss.com/maternity-leave-resource-center"
# NOTE(review): the page is requested with POST, exactly as the original analysis
# did; a plain GET may be more appropriate for a read-only fetch -- confirm
# against the site before changing it.
# The timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.post(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "html.parser")

In [3]:
# Scrape company information: each <a class="comp_page"> element is treated as
# one company row, and the stripped text of each of its children as one field.
collect_cos = []
skipped = 0  # anchors whose children could not be converted to text
for line in soup.find_all('a', attrs={'class': 'comp_page'}):
    try:
        # filter out line breaks and remove extra white space
        co = [i.text.strip() for i in line.children if str(i) not in ['\n']]
    except Exception:
        # Best-effort scrape: skip the malformed entry, but keep count so the
        # loss is reported below instead of being silently swallowed.
        skipped += 1
        continue

    # append company to list
    collect_cos.append(co)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Number of companies scraped: {0}".format(len(collect_cos)))
if skipped:
    print("Number of entries skipped: {0}".format(skipped))

Number of companies scraped: 772


In [4]:
# Build the company table: one row per scraped entry, one named column per field.
df = pd.DataFrame(
    data=collect_cos,
    columns=['company', 'industry', 'paid', 'unpaid'],
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,company,industry,paid,unpaid
0,Bill and Melinda Gates Foundation,Nonprofit,52 weeks,
1,Netflix,Technology: Consumer Internet,52 weeks,2 weeks
2,"Automattic, Inc.",Technology: Consumer Internet,32 weeks,0 weeks
3,Adobe Systems,Technology: Software,26 weeks,6 weeks
4,FireEye Inc.,Technology: Security,22 weeks,


In [5]:
# Question 1: how many different industry types do we have?
# Counts the distinct raw labels in the scraped 'industry' column.

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Industries: {0}".format(len(df['industry'].unique())))

Industries: 110


In [6]:
# Question 2: are paid/unpaid time measured in units besides weeks?
# Prints one warning per value that is neither "N/A" nor expressed in weeks;
# no output means every populated value uses weeks.

for column in ('paid', 'unpaid'):
    for value in df[column]:
        # Skip missing entries: "N/A" placeholders and null cells (None/NaN),
        # which would otherwise make the substring test raise a TypeError.
        if pd.isnull(value) or value == "N/A":
            continue
        if "weeks" not in value:
            print("Measured in units that are not weeks!")

In [7]:
# Problem 1: convert paid and unpaid to numeric values

def extract_num(val):
    """Convert a leave duration such as "52 weeks" into an integer.

    Parameters
    ----------
    val : str or None
        Duration text, e.g. "52 weeks", or the placeholder "N/A".

    Returns
    -------
    int or None
        The leading number of the text, or None when the value is missing
        ("N/A", None, NaN, or blank text).

    Raises
    ------
    ValueError
        If the first whitespace-separated token is not an integer.
    """
    # `val != val` is only true for NaN, the marker pandas uses for empty cells.
    if val is None or val != val or val == "N/A":
        return None

    # Parse the first token instead of slicing off a fixed six-character
    # " weeks" suffix, so variants such as "1 week" are handled as well.
    tokens = val.split()
    if not tokens:
        return None
    return int(tokens[0])

# Add numeric counterparts of the text columns by passing each value through extract_num.
df['paid_leave'] = df['paid'].map(extract_num)
df['unpaid_leave'] = df['unpaid'].map(extract_num)

In [8]:
# Problem 2: clean industry field

def clean_industry(val):
    """Map a raw industry label onto a coarser, cleaned-up category.

    A few labels are rewritten outright; everything else is reduced to the
    part before the first colon (e.g. "Technology: Software" -> "Technology").
    """
    # (raw label, replacement) pairs, checked in order.
    special_cases = (
        ("N/A", "Unspecified"),                          # bucket for all null values
        ("akqaa", "Technology"),                         # company AKQA is coded with industry akqaa
        ("Auto parts manufacturing", "Auto manufacturers"),
        ("Auto parts", "Auto manufacturers"),            # merged with Auto parts manufacturing
    )
    for raw_label, replacement in special_cases:
        if val == raw_label:
            return replacement

    # Keep only the base industry, i.e. the text ahead of the first ':'.
    base_industry, _, _ = val.partition(':')
    return base_industry
    
# Store the cleaned industry label next to the raw one.
df['industry_parsed'] = df['industry'].map(clean_industry)

In [9]:
# Question 1 (revisited): how many different industry types do we have?
# Industries inputted as "N/A" are now part of "Unspecified" category

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Industries (parsed): {0}".format(len(df['industry_parsed'].unique())))

Industries (parsed): 39


In [10]:
# Number of companies per cleaned industry category (null labels included).

df['industry_parsed'].value_counts(dropna=False)

Technology                      111
Unspecified                      66
Legal services                   65
Finance                          58
Retail                           53
Healthcare                       35
Educational Services             35
Natural Resources                34
FMCG                             31
Insurance                        30
Media                            26
Consulting services              22
Pharmaceutical                   21
Nonprofit                        19
Industrial                       17
Transportation                   16
Business Services                15
Hospitality                      15
Information Services             14
Telecommunications               11
Advertising                      11
Government                       10
Auto manufacturers                9
Aerospace                         8
Accounting services               5
Services                          5
Conglomerate                      4
Real Estate Company         

In [11]:
# Ten companies with the most paid leave. DataFrame.sort was deprecated and
# later removed from pandas; sort_values is its replacement.
df.sort_values('paid_leave', ascending=False).head(10)

Unnamed: 0,company,industry,paid,unpaid,paid_leave,unpaid_leave,industry_parsed
0,Bill and Melinda Gates Foundation,Nonprofit,52 weeks,,52,,Nonprofit
1,Netflix,Technology: Consumer Internet,52 weeks,2 weeks,52,2.0,Technology
2,"Automattic, Inc.",Technology: Consumer Internet,32 weeks,0 weeks,32,0.0,Technology
3,Adobe Systems,Technology: Software,26 weeks,6 weeks,26,6.0,Technology
4,FireEye Inc.,Technology: Security,22 weeks,,22,,Technology
5,"Orrick, Herrington & Sutcliffe LLP",Legal services,22 weeks,,22,,Legal services
6,Avaya,Technology: B2B Tech Services,22 weeks,,22,,Technology
8,Debevoise & Plimpton LLP,Legal services,20 weeks,,20,,Legal services
9,Microsoft,Technology: Software,20 weeks,0 weeks,20,0.0,Technology
7,Twitter,Technology: Consumer Internet,20 weeks,,20,,Technology


In [12]:
# Problem 3: dropping ALL null values decreases observations from 772 to 349! Let's not do that.
# First line: total rows. Second line: rows left after dropping every row with any null.

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(df))
print(len(df.dropna()))

772
349


In [13]:
# Group the companies by their cleaned industry label for the aggregations below.

gb = df.groupby(by='industry_parsed')

In [14]:
# export average paid/unpaid leave (in weeks) to csv
# The columns are selected with a list: indexing a groupby with a bare tuple of
# column names is deprecated and rejected by newer pandas releases.

gb[['paid_leave', 'unpaid_leave']].mean().to_csv("industry_means.csv")

In [15]:
# include count (number of companies inputted for each industry value)
# The columns are selected with a list: indexing a groupby with a bare tuple of
# column names is deprecated and rejected by newer pandas releases.

df_temp = gb[['paid_leave', 'unpaid_leave']].agg(['mean', 'count'])

# shape it right!
# Move the statistic level ('mean' / 'count') out of the column index and into
# the rows, so each industry gets one row per measure.

df_temp = df_temp.stack(1).reset_index()
df_temp.columns = ['industry', 'measure', 'paid_leave', 'unpaid_leave']
df_temp = df_temp.set_index('industry')

df_temp.to_csv("industry_means_counts.csv")

In [16]:
# Preview the first five rows of the reshaped per-industry summary.
df_temp.head(n=5)

Unnamed: 0_level_0,measure,paid_leave,unpaid_leave
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Accounting services,mean,10.0,8.5
Accounting services,count,5.0,2.0
Advertising,mean,6.363636,5.75
Advertising,count,11.0,4.0
Aerospace,mean,4.285714,9.6
