Q3. Scrape the following static website to extract the given data:

Link: https://devtomanager.com/interviews/


In [1]:
import pandas as pd
import os
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
# changing directory to webdriver file location
os.chdir("D:\Scripting")

In [3]:
url = "https://devtomanager.com/interviews/"

In [4]:
# remotely accessing browser
browser = webdriver.Firefox()

In [5]:
browser.get(url)

In [6]:
# getting the html code from the webpage
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
soup

<html lang="en-US"><head>
<title>
Interviews with Software Managers | Developer to Manager
</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, shrink-to-fit=no" name="viewport"/>
<meta content="Siddhant Goel" name="author"/>
<meta content="career, development, engineering-management, knowledge, leadership, management, platform, software" name="keywords"/>
<meta content="At Developer to Manager, we regularly interview software engineering managers on how they approach management and leadership." name="description"/>
<meta content="summary" name="twitter:card"/>
<meta content="Interviews with Software Managers | Developer to Manager" name="twitter:title"/>
<meta content="About transitioning from development to management" name="twitter:description"/>
<meta content="@devtomanager" name="twitter:site"/>
<meta content="https://devtomanager.com/static/images/logo-wide.png" name="twitter:image"/>
<meta content="https://devtomanager.com/interviews/page/1/" pr

In [7]:
soup.find_all(class_ = "card-title")[1].text.strip().split(",")

['Anand Safi', ' Engineering Manager at Mark43']

In [8]:
# scraping the employee names
emp_name = [i.text.strip().split(",")[0] for i in soup.find_all(class_ = "card-title")]
emp_name

['Aviv Ben-Yosef', 'Anand Safi', 'Shawn Axsom', 'Kevin Doyle', 'Arnab Sen']

In [9]:
len(emp_name)

5

In [10]:
soup.find_all(class_ = "card-title")[0].text.strip().split(",")[1].split("at",1)

[' Tech Executive Consultant']

In [11]:
# scraping the job positions
job_pos = [i.text.strip().split(",")[1].split("at",1)[0] for i in soup.find_all(class_ = "card-title")]
job_pos

[' Tech Executive Consultant',
 ' Engineering Manager ',
 ' Senior Engineering Manager ',
 ' CTO ',
 ' Technical Delivery Manager ']

In [12]:
len(job_pos)

5

In [13]:
soup.find_all(class_ = "card-title")[1].text.strip().split(",")[1].split("at",1)

[' Engineering Manager ', ' Mark43']

In [14]:
#scraping the working company
comp = []
for i in soup.find_all(class_ = "card-title"):
    if len(i.text.strip().split(",")[1].split("at",1))>1:
        comp.append(i.text.strip().split(",")[1].split("at",1)[1])
    else:
        comp.append("Independent")
comp

['Independent', ' Mark43', ' Docker', ' patientMpower', ' AKQA']

In [15]:
len(comp)

5

In [16]:
soup.find_all(class_ = "card-text")[0::2][0].text.strip()

'“I want to set up managers to succeed, no matter how much background they’ve already got.”'

In [17]:
# scraping the interview quotes
quotes = [i.text.strip() for i in soup.find_all(class_ = "card-text")[0::2]]
quotes

['“I want to set up managers to succeed, no matter how much background they’ve already got.”',
 '“I very well understand where my interests lie, which is being an enabler first and then a creator rather than being a creator/ maker 100%.”',
 '“Your focus (and challenges) grows from your direct reports, to teams, to departments, executives, and then external customers and partnerships.”',
 '“If I can provide enough direction to allow people to focus on the jobs they’ll do better than I ever could, everybody wins.”',
 '“The switch to management can be highly rewarding and provide a level of independence, authority, and interactivity that a pure software engineering role may not offer.”']

In [18]:
len(quotes)

5

In [19]:
soup.find_all(class_ = "card-text")[1::2][0].text.split("·\n")[0].strip()

'August 16, 2021'

In [20]:
# scraping the dates
dts = [i.text.split("·\n")[0].strip() for i in soup.find_all(class_ = "card-text")[1::2]]
dts

['August 16, 2021',
 'July 19, 2021',
 'July 05, 2021',
 'June 21, 2021',
 'May 10, 2021']

In [21]:
def dates(x):
    x = x.replace("January","01")
    x = x.replace("February","02")
    x = x.replace("March","03")
    x = x.replace("April","04")
    x = x.replace("May","05")
    x = x.replace("June","06")
    x = x.replace("July","07")
    x = x.replace("August","08")
    x = x.replace("September","09")
    x = x.replace("October","10")
    x = x.replace("November","11")
    x = x.replace("December","12")
    x = x.replace(", ","-").replace(" ","-")
    return x

In [22]:
dates('August 16, 2021')

'08-16-2021'

In [23]:
final_dates = [dates(i) for i in dts]
final_dates

['08-16-2021', '07-19-2021', '07-05-2021', '06-21-2021', '05-10-2021']

In [24]:
len(final_dates)

5

In [25]:
final_dates = pd.to_datetime(final_dates, format="%m-%d-%Y")
final_dates

DatetimeIndex(['2021-08-16', '2021-07-19', '2021-07-05', '2021-06-21',
               '2021-05-10'],
              dtype='datetime64[ns]', freq=None)

In [26]:
soup.find_all(class_ = "card-text")[1::2][0].text.split("·\n")[1].split()

['#coaching', '#consulting']

In [27]:
# scraping the tags
tags = [i.text.split("·\n")[1].split() for i in soup.find_all(class_ = "card-text")[1::2]]
tags

[['#coaching', '#consulting'],
 ['#coaching', '#frontend', '#full-stack', '#public-safety'],
 ['#backend', '#coaching', '#information-systems', '#web'],
 ['#full-stack', '#health-tech'],
 ['#consulting', '#digital-marketing', '#e-commerce', '#product']]

In [28]:
len(tags)

5

In [29]:
# creating a dataframe with the scraped data
df = pd.DataFrame({"Employee Name":emp_name,"Job Position":job_pos,"Working Company":comp,"Interview Quote":quotes,"Date":final_dates,"Tags":tags})
df

Unnamed: 0,Employee Name,Job Position,Working Company,Interview Quote,Date,Tags
0,Aviv Ben-Yosef,Tech Executive Consultant,Independent,"“I want to set up managers to succeed, no matt...",2021-08-16,"[#coaching, #consulting]"
1,Anand Safi,Engineering Manager,Mark43,“I very well understand where my interests lie...,2021-07-19,"[#coaching, #frontend, #full-stack, #public-sa..."
2,Shawn Axsom,Senior Engineering Manager,Docker,“Your focus (and challenges) grows from your d...,2021-07-05,"[#backend, #coaching, #information-systems, #web]"
3,Kevin Doyle,CTO,patientMpower,“If I can provide enough direction to allow pe...,2021-06-21,"[#full-stack, #health-tech]"
4,Arnab Sen,Technical Delivery Manager,AKQA,“The switch to management can be highly reward...,2021-05-10,"[#consulting, #digital-marketing, #e-commerce,..."


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Employee Name    5 non-null      object        
 1   Job Position     5 non-null      object        
 2   Working Company  5 non-null      object        
 3   Interview Quote  5 non-null      object        
 4   Date             5 non-null      datetime64[ns]
 5   Tags             5 non-null      object        
dtypes: datetime64[ns](1), object(5)
memory usage: 368.0+ bytes


In [31]:
# creating a reusable function which returns the dataframe
def soft_managers():
    emp_name = [i.text.strip().split(",")[0] for i in soup.find_all(class_ = "card-title")]
    job_pos = [i.text.strip().split(",")[1].split("at",1)[0] for i in soup.find_all(class_ = "card-title")]
    comp = []
    for i in soup.find_all(class_ = "card-title"):
        if len(i.text.strip().split(",")[1].split("at",1))>1:
            comp.append(i.text.strip().split(",")[1].split("at",1)[1])
        else:
            comp.append("Independent")
    quotes = [i.text.strip() for i in soup.find_all(class_ = "card-text")[0::2]]
    dts = [i.text.split("·\n")[0].strip() for i in soup.find_all(class_ = "card-text")[1::2]]
    final_dates = [dates(i) for i in dts]
    final_dates = pd.to_datetime(final_dates, format="%m-%d-%Y")
    tags = [i.text.split("·\n")[1].split() for i in soup.find_all(class_ = "card-text")[1::2]]
    df = pd.DataFrame({"Employee Name":emp_name,"Job Position":job_pos,"Working Company":comp,"Interview Quote":quotes,"Date":final_dates,"Tags":tags})
    return df

In [32]:
# scraping data from first 5 pages
table = []
stud = "https://devtomanager.com/interviews/page/{}/"

for i in range(1,6):
    url_all = stud.format(i)
    browser.get(url_all)
    html=browser.page_source
    soup = BeautifulSoup(html, "html.parser")
    table.append(soft_managers())


In [33]:
# dataframe of details from first five pages
final_df = pd.concat(table,axis=0)
final_df

Unnamed: 0,Employee Name,Job Position,Working Company,Interview Quote,Date,Tags
0,Aviv Ben-Yosef,Tech Executive Consultant,Independent,"“I want to set up managers to succeed, no matt...",2021-08-16,"[#coaching, #consulting]"
1,Anand Safi,Engineering Manager,Mark43,“I very well understand where my interests lie...,2021-07-19,"[#coaching, #frontend, #full-stack, #public-sa..."
2,Shawn Axsom,Senior Engineering Manager,Docker,“Your focus (and challenges) grows from your d...,2021-07-05,"[#backend, #coaching, #information-systems, #web]"
3,Kevin Doyle,CTO,patientMpower,“If I can provide enough direction to allow pe...,2021-06-21,"[#full-stack, #health-tech]"
4,Arnab Sen,Technical Delivery Manager,AKQA,“The switch to management can be highly reward...,2021-05-10,"[#consulting, #digital-marketing, #e-commerce,..."
0,Nicholas Cobb,Senior Manager of Software Engineering,Aurora Innovation,“Realizing that life as a manager will have it...,2021-04-26,"[#autonomous-driving, #devops, #ios, #self-tau..."
1,Christoph Eicke,Team Lead,Xing GmbH,“Sometimes it's better not to know the details...,2021-04-12,"[#full-stack, #linux, #self-taught, #web]"
2,Sam Phillips,Co-founder & CTO,Residently,“I think leadership and management are roles t...,2021-03-29,"[#coaching, #devops, #self-taught, #startup, #..."
3,Dan Lines,Co-founder & COO,LinearB,“Some sound advice I received is that if your ...,2021-03-15,"[#cybersecurity, #entrepreneurship, #startup]"
4,Duncan Skelton,Executive Coach,Duncan Skelton Coaching,"“If you get the job, or are new to the role, t...",2021-03-01,"[#coaching, #enterprise, #hardware, #systems-p..."


In [34]:
final_df.shape

(25, 6)

In [35]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 4
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Employee Name    25 non-null     object        
 1   Job Position     25 non-null     object        
 2   Working Company  25 non-null     object        
 3   Interview Quote  25 non-null     object        
 4   Date             25 non-null     datetime64[ns]
 5   Tags             25 non-null     object        
dtypes: datetime64[ns](1), object(5)
memory usage: 1.4+ KB


In [36]:
# creating a csv file with the dataframe
final_df.to_csv("Software Managers.csv")
pd.read_csv("Software Managers.csv").head()

Unnamed: 0.1,Unnamed: 0,Employee Name,Job Position,Working Company,Interview Quote,Date,Tags
0,0,Aviv Ben-Yosef,Tech Executive Consultant,Independent,"“I want to set up managers to succeed, no matt...",2021-08-16,"['#coaching', '#consulting']"
1,1,Anand Safi,Engineering Manager,Mark43,“I very well understand where my interests lie...,2021-07-19,"['#coaching', '#frontend', '#full-stack', '#pu..."
2,2,Shawn Axsom,Senior Engineering Manager,Docker,“Your focus (and challenges) grows from your d...,2021-07-05,"['#backend', '#coaching', '#information-system..."
3,3,Kevin Doyle,CTO,patientMpower,“If I can provide enough direction to allow pe...,2021-06-21,"['#full-stack', '#health-tech']"
4,4,Arnab Sen,Technical Delivery Manager,AKQA,“The switch to management can be highly reward...,2021-05-10,"['#consulting', '#digital-marketing', '#e-comm..."
