### Import utility libraries

In [1]:
# importing utility libraries

import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

### Get HTML from the web page

In [2]:
URL = "https://internshala.com/internships/data%20science-internship"
# Request the listing page. The timeout stops a stalled connection from hanging
# the kernel indefinitely, and raise_for_status() fails loudly on an HTTP error
# instead of letting an error page be parsed as if it were the listing.
page = requests.get(URL, timeout=30)
page.raise_for_status()
# Parse the markup with the built-in HTML parser so the individual elements can
# be navigated, rather than treating the page as one long string.
soup = BeautifulSoup(page.text, "html.parser")
# Print the parsed tree in an indented form that is easier to read.
print(soup.prettify())

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="https://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta content="IE=9" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0 user-scalable=0" name="viewport"/>
  <meta content="272234782795210" property="fb:app_id"/>
  <meta content="article" property="og:type"/>
  <meta content="1200" property="og:image:width"/>
  <meta content="630" property="og:image:height"/>
  <meta content="@Internshala" name="twitter:site"/>
  <meta content="summary_large_image" name="twitter:card"/>
  <meta content="@internshala" name="twitter:creator"/>
  <meta content="https://internshala.com//static/images/internships_for_facebook.png" name="twitter:image:src"/>
  <meta content="#ffffff" name="theme-color"/>
  <meta content="#ffffff" name="msapplication-navbutton-color"/>
  <meta content="telephone=no" name="format-detection"/>
  <script defer="" src="/stati

### Extract Job title

In [3]:
def extract_job_title_from_result(job_div, job_post):
    """Append the internship title of one listing card to ``job_post``.

    Walks ``div.company`` -> ``div.profile`` -> first ``<a>`` inside
    ``job_div`` and appends that anchor's text as a plain, stripped ``str``.

    Exactly one value is appended per call. The scraping loop assigns
    ``job_post`` to a DataFrame row with a fixed number of columns, so
    appending zero or several values would misalign or break that row.

    Args:
        job_div: Parsed element for one internship card; must offer a
            BeautifulSoup-style ``find(name=..., attrs=...)``.
        job_post: List collecting the fields of the current posting; it is
            mutated in place.

    Returns:
        None. An empty string is appended when any element on the path is
        missing.
    """
    node = job_div
    for tag_name, tag_attrs in (
        ("div", {"class": "company"}),
        ("div", {"class": "profile"}),
        ("a", {}),
    ):
        node = node.find(name=tag_name, attrs=tag_attrs)
        if node is None:
            # Stop at the first missing element instead of calling .find on None.
            break

    # A space separator keeps words apart if the anchor contains nested tags.
    job_post.append(node.get_text(" ", strip=True) if node is not None else "")

### Company name

In [4]:
def extract_company_from_result(job_div, job_post):
    """Append the company name of one listing card to ``job_post``.

    Walks ``div.individual_internship_header`` -> ``div.company`` ->
    ``div.company_name`` -> first ``<a>`` and appends that anchor's stripped
    text.

    Exactly one value is appended per call so that ``job_post`` keeps one
    entry per column. Previously each child node of the anchor produced its
    own entry, and nested tags were appended as raw markup.

    Args:
        job_div: Parsed element for one internship card; must offer a
            BeautifulSoup-style ``find(name=..., attrs=...)``.
        job_post: List collecting the fields of the current posting; it is
            mutated in place.

    Returns:
        None. An empty string is appended when any element on the path is
        missing.
    """
    path = (
        ("div", {"class": "individual_internship_header"}),
        ("div", {"class": "company"}),
        ("div", {"class": "company_name"}),
        ("a", {}),
    )
    current = job_div
    for tag_name, tag_attrs in path:
        current = current.find(name=tag_name, attrs=tag_attrs)
        if current is None:
            # Element not present on this card: record a blank company name.
            job_post.append("")
            return

    # A space separator keeps words apart if the anchor contains nested tags.
    job_post.append(current.get_text(" ", strip=True))

### Location

In [5]:
def extract_location_from_result(job_div, job_post):
    """Append the location of one listing card to ``job_post``.

    Walks ``div.individual_internship_details`` -> ``div#location_names`` ->
    first ``a.location_link`` and appends that link's stripped text. Only the
    first matching link is used, as in the original lookup.

    Exactly one value is appended per call so that ``job_post`` keeps one
    entry per column, whatever the link contains.

    Args:
        job_div: Parsed element for one internship card; must offer a
            BeautifulSoup-style ``find(name=..., attrs=...)``.
        job_post: List collecting the fields of the current posting; it is
            mutated in place.

    Returns:
        None. An empty string is appended when any element on the path is
        missing.
    """
    link = job_div
    for tag_name, tag_attrs in (
        ("div", {"class": "individual_internship_details"}),
        ("div", {"id": "location_names"}),
        ("a", {"class": "location_link"}),
    ):
        link = link.find(name=tag_name, attrs=tag_attrs)
        if link is None:
            # Stop at the first missing element instead of calling .find on None.
            break

    if link is None:
        job_post.append("")
    else:
        # A space separator keeps words apart if the link contains nested tags.
        job_post.append(link.get_text(" ", strip=True))

### Salaries

In [6]:
def extract_salary_from_result(job_div, job_post):
    """Append the stipend text of one listing card to ``job_post``.

    Walks ``div.individual_internship_details`` ->
    ``div.internship_other_details_container`` -> ``span.stipend`` and reads
    only the span's direct text nodes; child tags are skipped, as before.

    Exactly one value is appended per call. Previously every direct text node
    (including whitespace-only ones) produced its own entry, and a span with
    no direct text produced none, so the row could end up with the wrong
    number of fields.

    Args:
        job_div: Parsed element for one internship card; must offer a
            BeautifulSoup-style ``find(name=..., attrs=...)``.
        job_post: List collecting the fields of the current posting; it is
            mutated in place.

    Returns:
        None. The non-empty text fragments are joined with single spaces; an
        empty string is appended when the span is missing or has no text.
    """
    span = job_div
    for tag_name, tag_attrs in (
        ("div", {"class": "individual_internship_details"}),
        ("div", {"class": "internship_other_details_container"}),
        ("span", {"class": "stipend"}),
    ):
        span = span.find(name=tag_name, attrs=tag_attrs)
        if span is None:
            # Element not present on this card: record a blank stipend.
            job_post.append("")
            return

    fragments = [
        str(child).strip()
        for child in span
        if isinstance(child, bs4.element.NavigableString)
    ]
    # Drop whitespace-only fragments so they do not become stray spaces.
    job_post.append(" ".join(fragment for fragment in fragments if fragment))

### Duration

In [7]:
def extract_duration_from_result(job_div, job_post):
    """Append the direct text nodes of the card's ``span.stipend`` to ``job_post``.

    Looks up ``div.individual_internship_details`` ->
    ``div.internship_other_details_container`` -> ``span.stipend`` and appends
    the stripped text of every direct text child; child tags are ignored.

    NOTE(review): this body is identical to ``extract_salary_from_result`` and
    selects the stipend span, not a duration element. It looks like a
    copy-paste that was never adapted; confirm the intended selector. The
    scraping loop in this notebook does not call this function.

    Args:
        job_div: Parsed element for one internship card.
        job_post: List collecting the fields of the current posting; it is
            mutated in place.
    """
    details = job_div.find(name="div", attrs={"class": "individual_internship_details"})
    other_details = details.find(name="div", attrs={"class": "internship_other_details_container"})
    stipend_span = other_details.find(name="span", attrs={"class": "stipend"})

    job_post.extend(
        str(child).strip()
        for child in stipend_span
        if isinstance(child, bs4.element.NavigableString)
    )

### URL of Internships page

In [8]:
def Get_URL_Of_page(base_url, skill):
    """Build the listing URL for a skill: ``<base_url>/internships/<skill>-internship``.

    Both arguments are inserted verbatim; no URL-encoding or slash
    normalisation is applied.

    Args:
        base_url: Site root without a trailing slash.
        skill: Skill or search term to embed in the path.

    Returns:
        The concatenated URL string.
    """
    parts = [base_url, "/internships/", skill, "-internship"]
    return "".join(parts)

### Total pages

In [9]:
def Get_total_pages(url):
    """Return the number of result pages advertised by a listing page.

    Downloads ``url`` and reads the integer inside ``<span id="total_pages">``.

    Args:
        url: Address of the first listing page.

    Returns:
        The page count as an ``int``. If the page has no ``#total_pages``
        element, 1 is returned so the caller still scrapes the first page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the element's text is not an integer.
    """
    # The timeout stops a stalled connection from hanging the kernel.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    # page.text is already a decoded str. from_encoding only applies to bytes
    # input, and BeautifulSoup warns and ignores it for str, so it is omitted.
    soup = BeautifulSoup(page.text, "html.parser")

    total_pages = soup.find(name="span", attrs={"id": "total_pages"})
    if total_pages is None:
        # No pagination marker: fall back to a single page rather than
        # failing on None.text.
        return 1
    return int(total_pages.text.strip())

### Internship Description page URL

In [10]:
def Get_Internship_Description_Page_Url(job_div, base_url):
    """Return the absolute URL of a card's detail page.

    Finds the card's ``div.button_container`` and takes the first
    ``a.view_detail_button`` that carries an ``href``.

    Args:
        job_div: Parsed element for one internship card.
        base_url: Site root that the link's ``href`` is appended to.

    Returns:
        ``base_url + href`` for the first matching link, or ``None`` when the
        container holds no such link.
    """
    container = job_div.find(name="div", attrs={"class": "button_container"})
    detail_links = container.find_all(
        name="a", attrs={"class": "view_detail_button"}, href=True
    )
    if not detail_links:
        return None
    return base_url + detail_links[0]['href']

### Description

In [11]:
def extract_description_from_result(job_div, job_post):
    """Download a posting's detail page and append its description to ``job_post``.

    The detail URL comes from ``Get_Internship_Description_Page_Url``, using
    the module-level ``base_url`` (this function takes no site-root argument
    of its own). On the detail page, every ``div`` whose class is
    ``section_heading heading_5_5`` and whose text contains one of the
    keywords below is treated as a description heading. The text is taken
    from the node two siblings after the heading.

    Exactly one value is appended per call so that ``job_post`` keeps one
    entry per column. Previously each matching heading appended its own
    entry, and a page with no match appended nothing.

    Args:
        job_div: Parsed element for one internship card.
        job_post: List collecting the fields of the current posting; it is
            mutated in place.

    Returns:
        None. Multiple matching sections are joined with a blank line; an
        empty string is appended when no detail link or no section is found.
    """
    detail_url = Get_Internship_Description_Page_Url(job_div, base_url)
    if detail_url is None:
        # No detail link on this card: nothing to download.
        job_post.append("")
        return

    # The timeout stops a stalled connection from hanging the kernel.
    page = requests.get(detail_url, timeout=30)
    # page.text is already a decoded str. from_encoding only applies to bytes
    # input, and BeautifulSoup warns and ignores it for str, so it is omitted.
    soup_desc = BeautifulSoup(page.text, "html.parser")

    keywords = ["job/internship", "internship", "work from home", "part time", "job"]
    sections = []
    for heading in soup_desc.find_all(name="div", attrs={"class": "section_heading heading_5_5"}):
        heading_text = heading.text.strip()
        if not any(word in heading_text for word in keywords):
            continue
        # Same traversal as before (heading -> next node -> next node), but
        # guarded so a heading at the end of its parent is skipped, not fatal.
        body = heading.next_sibling
        body = body.next_sibling if body is not None else None
        if body is not None:
            sections.append(body.text.strip())

    job_post.append("\n\n".join(sections))

### Scrape data

In [12]:
# Scraping configuration: which site and which skill to search for.
base_url = "https://internshala.com"
skill = "data science"

# One column per field collected for each posting; sample_df starts empty and
# is filled row by row by Scrap_Internshala.
columns = [
    "job_title",
    "company_name",
    "location",
    "salary",
    "description",
]
sample_df = pd.DataFrame(columns=columns)


def Scrap_Internshala(base_url, skill):
    """Scrape every listing page for ``skill`` into the module-level ``sample_df``.

    For each page ``<listing-url>/page-N`` (N from 1 to the reported page
    count), every ``div.individual_internship`` card is turned into one row:
    job title, company, location, salary and description, in that order. The
    row is stored in ``sample_df`` under a 1-based index.

    The accumulated frame is written to ``job_scrapping_internshala.csv``
    after each page, so an interrupted run keeps the pages finished so far.

    Args:
        base_url: Site root, e.g. ``"https://internshala.com"``.
        skill: Skill or search term used to build the listing URL.

    Returns:
        None. Results are left in ``sample_df`` and in the CSV file.
    """
    url = Get_URL_Of_page(base_url, skill)
    total_pages = Get_total_pages(url)
    expected_fields = len(sample_df.columns)

    for page_number in range(1, total_pages + 1):
        # The timeout stops a stalled connection from hanging the kernel.
        page = requests.get(url + "/page-" + str(page_number), timeout=30)
        time.sleep(1)  # ensuring at least 1 second between page grabs
        # page.text is already a decoded str. from_encoding only applies to
        # bytes input, and BeautifulSoup warns and ignores it for str.
        soup = BeautifulSoup(page.text, "html.parser")

        for div in soup.find_all(name="div", attrs={"class": "individual_internship"}):
            # One list per posting; each extractor appends its field in
            # column order.
            job_post = []
            extract_job_title_from_result(div, job_post)
            extract_company_from_result(div, job_post)
            extract_location_from_result(div, job_post)
            extract_salary_from_result(div, job_post)
            extract_description_from_result(div, job_post)

            if len(job_post) != expected_fields:
                # A row of the wrong width would make the .loc assignment
                # raise and abort the whole scrape; skip it and report it.
                print(
                    "Skipping posting on page " + str(page_number)
                    + ": expected " + str(expected_fields)
                    + " fields, got " + str(len(job_post))
                )
                continue

            # Row labels start at 1 and continue from the frame's current size.
            sample_df.loc[len(sample_df) + 1] = job_post

        # Save after every page -- change the path to suit your environment.
        sample_df.to_csv("job_scrapping_internshala.csv", encoding="utf-8")

In [14]:
# Run the scrape for the configured site and skill; rows accumulate in
# sample_df and the CSV file is rewritten as each listing page completes.
Scrap_Internshala(base_url=base_url, skill=skill)

https://internshala.com data science


