In [6]:
import os
import time

import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site; also prepended to every relative tutorial link later on.
base_url = "https://www.tutorialspoint.com"
# Path of the page that lists every tutorial, grouped by category.
library_url = "/tutorialslibrary.htm"
# Without a timeout requests waits indefinitely on a stalled connection,
# which would hang this cell.
r = requests.get(base_url + library_url, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page as if it
# were the tutorial library.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [3]:
# function for getting all the tutorials in one category
def get_topics(data_list,category_list,category_name,completed=False):
    """Append one record per tutorial of a single category to ``data_list``.

    Parameters
    ----------
    data_list : list
        Accumulator that is mutated in place; one dict is appended for every
        ``<li>`` found in ``category_list``.
    category_list : object
        Element exposing ``find_all("li")``; each returned item must expose
        ``find("a")`` yielding a link with ``get_text()`` and an ``"href"``
        entry.
    category_name : object
        Element whose ``get_text()`` gives the category label stored on
        every record.
    completed : bool, optional
        Value stored under the ``"Completed"`` key of every record
        (default ``False``).

    Returns
    -------
    None
        Results are delivered through ``data_list``.

    Notes
    -----
    The URL is built by prefixing the link's ``href`` with the module-level
    ``base_url``.
    """
    # The label is the same for every topic in this list, so read it once.
    category = category_name.get_text()

    for topic in category_list.find_all("li"):
        a_tag = topic.find("a")
        data_list.append({
            "Category": category,
            "Topic": a_tag.get_text(),
            "URL": base_url + a_tag["href"],
            # Honour the caller's flag; it used to be ignored and forced to False.
            "Completed": completed,
        })

In [22]:
# Container element that holds every category column of the library page.
frame = soup.select_one(".row.featured-boxes")
# select_one returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError inside get_all_data.
if frame is None:
    raise RuntimeError("Could not find the '.row.featured-boxes' container on the library page")

def get_all_data(data_list=None):
    """Collect a record for every tutorial found in the module-level ``frame``.

    Each ``div.mui-col-md-3`` column of ``frame`` is scanned for its ``<ul>``
    lists and ``<h4>`` headings; the two sequences are paired in document
    order and every pair is handed to ``get_topics``, which appends the
    tutorial records to ``data_list``.

    Parameters
    ----------
    data_list : list, optional
        List to append the records to. When omitted, a new empty list is
        created for this call.

    Returns
    -------
    list
        The list that was passed in (or the newly created one), with the
        scraped records appended.

    Notes
    -----
    Prints a progress line every 100 categories and the elapsed time at the
    end.
    """
    # A literal ``[]`` default is created once and shared by every call, so
    # repeated calls without an argument would keep accumulating duplicates.
    if data_list is None:
        data_list = []

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    c = 0
    lib_cols = frame.find_all("div",class_="mui-col-md-3")
    for column in lib_cols:
        category_lists = column.find_all("ul")
        category_names = column.find_all("h4")

        # zip stops at the shorter sequence: a column with unequal numbers of
        # lists and headings silently drops the unmatched extras.
        for clist, cname in zip(category_lists, category_names):
            get_topics(data_list,clist,cname)
            c += 1
            if c % 100 == 0:
                print(f"Currently on {c}")

    end = time.perf_counter()
    print(f"It took {end-start:.2f} seconds")
    return data_list

In [35]:
data_list = get_all_data([])

It took 0.10 seconds


In [41]:
# checking if I got any errors while scraping
len(frame.select("li")) == len(data_list)

True

In [40]:
data_list

[{'Category': 'Academic',
  'Topic': 'CBSE Syllabus',
  'URL': 'https://www.tutorialspoint.com/cbse_syllabus/index.htm',
  'Completed': False},
 {'Category': 'Academic',
  'Topic': 'Learn Accounting Basics',
  'URL': 'https://www.tutorialspoint.com/accounting_basics/index.htm',
  'Completed': False},
 {'Category': 'Academic',
  'Topic': 'Auditing',
  'URL': 'https://www.tutorialspoint.com/auditing/index.htm',
  'Completed': False},
 {'Category': 'Academic',
  'Topic': 'Course on Computer Concepts (CCC) Tutorial',
  'URL': 'https://www.tutorialspoint.com/computer_concepts/index.htm',
  'Completed': False},
 {'Category': 'Academic',
  'Topic': 'Learn Financial Accounting',
  'URL': 'https://www.tutorialspoint.com/financial_accounting/index.htm',
  'Completed': False},
 {'Category': 'Academic',
  'Topic': 'Learn Forex Trading',
  'URL': 'https://www.tutorialspoint.com/forex_trading/index.htm',
  'Completed': False},
 {'Category': 'Academic',
  'Topic': 'Learn Statistics',
  'URL': 'https:

In [11]:
# saving the data into a JSON file
import json
def save_json_data(data_list,filename,directory="../data/"):
    """Write ``data_list`` to ``directory/filename`` as UTF-8 JSON.

    Parameters
    ----------
    data_list : object
        JSON-serialisable data to store.
    filename : str
        Name of the file to create or overwrite.
    directory : str, optional
        Folder the file is written to (default ``"../data/"``). It is created
        if it does not exist; an empty string means the current directory.

    Returns
    -------
    None
    """
    # Create the target folder up front so a missing directory does not
    # abort the save.
    if directory:
        os.makedirs(directory, exist_ok=True)
    # os.path.join inserts the separator when needed, so ``directory`` works
    # with or without a trailing slash.
    with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(data_list, f, ensure_ascii=False, indent=4)

# this function returns the data
def load_json_data(filename,directory="../data/"):
    """Read and return the JSON content of ``directory/filename``.

    Parameters
    ----------
    filename : str
        Name of the JSON file to read.
    directory : str, optional
        Folder containing the file (default ``"../data/"``).

    Returns
    -------
    object
        The decoded JSON document.
    """
    # os.path.join inserts the separator when needed, so ``directory`` works
    # with or without a trailing slash.
    with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
        return json.load(f)

In [46]:
save_json_data(data_list,"tutorials.json")

In [12]:
data_list = load_json_data("tutorials.json")

In [13]:
# converting our data list to a pandas dataframe
import pandas as pd
df = pd.DataFrame(data_list)

In [54]:
# viewing the unique categories to sort out the ones we are interested in
df.Category.unique()

array(['Academic', 'Computer Science', 'Digital Marketing', 'Monuments',
       'Machine Learning', 'Mathematics', 'Mobile Development', 'SAP',
       'Software Quality', 'Big Data & Analytics', 'Databases',
       'Engineering Tutorials', 'Mainframe Development',
       'Microsoft Technologies', 'Java Technologies', 'XML Technologies',
       'Python Technologies', 'Sports', 'Computer Programming', 'DevOps',
       'Latest Technologies', 'Telecom', 'Exams Syllabus',
       'UPSC IAS Exams', 'Web Development', 'Scripts', 'Management',
       'Soft Skills', 'Selected Reading', 'Misc'], dtype=object)

In [14]:
# getting the list of all the tutorial URLs
list(df.URL)

['https://www.tutorialspoint.com/cbse_syllabus/index.htm',
 'https://www.tutorialspoint.com/accounting_basics/index.htm',
 'https://www.tutorialspoint.com/auditing/index.htm',
 'https://www.tutorialspoint.com/computer_concepts/index.htm',
 'https://www.tutorialspoint.com/financial_accounting/index.htm',
 'https://www.tutorialspoint.com/forex_trading/index.htm',
 'https://www.tutorialspoint.com/statistics/index.htm',
 'https://www.tutorialspoint.com/adaptive_software_development/index.htm',
 'https://www.tutorialspoint.com/agile/index.htm',
 'https://www.tutorialspoint.com/agile_data_science/index.htm',
 'https://www.tutorialspoint.com/artificial_intelligence/index.htm',
 'https://www.tutorialspoint.com/computer_programming/index.htm',
 'https://www.tutorialspoint.com/inter_process_communication/index.htm',
 'https://www.tutorialspoint.com/learn_c_by_examples/index.htm',
 'https://www.tutorialspoint.com/basics_of_computers/index.htm',
 'https://www.tutorialspoint.com/basics_of_computer_