# Assignment 1: Web Scraping

In [2]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scraping Fast, Lahore website to get Faculty Information

In [132]:
lhr_facultyURL = 'https://lhr.nu.edu.pk/faculty/'
r = requests.get(lhr_facultyURL)

In [133]:
soup = BeautifulSoup(r.content, 'html.parser')

Getting all the information about the faculty of Fast School of Computing

In [134]:
fsc_info = soup.find('div', id= 'fsc')

Getting the department name

In [135]:
fsc_department = fsc_info.find(class_='mb-2 mt-3').text

Getting list of all faculty members

In [136]:
fsc_faculty_list = fsc_info.findAll(class_='col-lg-3 col-md-4 col-sm-6 col-12')

Creating Dataframe for storing faculty members' data of Fast, lahore

In [225]:
lhr_faculty = pd.DataFrame(columns=['Id', 'Name', 'Designation', 'HEC Approved PhD Supervisor', 'Highest Education', 'Email', 'Department', 'Extension', 'ImageURL'])

In [226]:
for member in fsc_faculty_list:
    profileURL = 'https://lhr.nu.edu.pk' + member.find(class_='faculty-link').get('href')
    member_id = int(profileURL.split('/')[-1])
        
    # Sending req to profileURL
    profileURL_res = requests.get(profileURL)
    
    profileURL_soup = BeautifulSoup(profileURL_res.content, 'html.parser')
    
    # Getting the latest education of the faculty member
    degree = profileURL_soup.find(class_='col-lg-8 col-md-6 col-sm-12 text-justify')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
        
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # Getting phone tag to get Ext from it
    phone_tag = profileURL_soup.find('span', class_='small')

    if phone_tag is not None:
        phone_tag_text = phone_tag.text
        if phone_tag_text:
            # Getting the digits after Ext:
            extension_match = re.search(r'Ext:(\d+)$', phone_tag_text)
            if extension_match:
                extension_number = extension_match.group(1)
        else:
            extension_number = None        
    else:
        extension_number = None
    
    # Checking the designation and whether the member is HEC approved Phd advisor or not
    text = member.find('p', class_='small text-center font-italic').text.strip()

    # text is splitted on the basis of \n removing extra spaces between them
    parts = [part.strip() for part in text.split("\n") if part.strip()]

    # first part which is the designation
    designation = parts[0]

    # Checking HEC approval
    hec_approved = False
    if len(parts) > 1 and "HEC Approved PhD Supervisor" in parts[1]:
        hec_approved = True

    new_data = {'Id': member_id,
                'Name': member.find(class_='text-center').text,
                'Designation': designation,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find(class_='mb-0 text-center').text,
                'Department': fsc_department,
                'Extension': extension_number,
                'ImageURL': 'https://lhr.nu.edu.pk' + member.find('img', class_='card-img-top rounded-circle mt-3 mb-0 d-block mx-auto').get('src')}
    
    new_df = pd.DataFrame([new_data])
    lhr_faculty = pd.concat([lhr_faculty, new_df], ignore_index=True)

Getting all the information about the faculty of department of Electrical Engineering

In [227]:
ee_info = soup.find('div', id= 'ee')

Getting department name

In [228]:
ee_department = ee_info.find(class_='mb-2 mt-3').text

Getting list of all faculty members

In [229]:
ee_faculty_list = ee_info.findAll(class_='col-lg-3 col-md-4 col-sm-6 col-12')

In [230]:
for member in ee_faculty_list:
    profileURL = 'https://lhr.nu.edu.pk' + member.find(class_='faculty-link').get('href')
    member_id = int(profileURL.split('/')[-1])
    
    # Sending req to profileURL
    profileURL_res = requests.get(profileURL)
    
    profileURL_soup = BeautifulSoup(profileURL_res.content, 'html.parser')
    
    # Getting the latest education of the faculty member
    degree = profileURL_soup.find(class_='col-lg-8 col-md-6 col-sm-12 text-justify')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
        
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # Getting phone tag to get Ext from it
    phone_tag = profileURL_soup.find('span', class_='small')

    if phone_tag is not None:
        phone_tag_text = phone_tag.text
        if phone_tag_text:
            # Getting the digits after Ext:
            extension_match = re.search(r'Ext:(\d+)$', phone_tag_text)
            if extension_match:
                extension_number = extension_match.group(1)
        else:
            extension_number = None        
    else:
        extension_number = None    
    
    # Checking the designation and whether the member is HEC approved Phd advisor or not
    text = member.find('p', class_='small text-center font-italic').text.strip()

    # text is splitted on the basis of \n removing extra spaces between them
    parts = [part.strip() for part in text.split("\n") if part.strip()]

    # first part which is the designation
    designation = parts[0]

    # Checking HEC approval
    hec_approved = False
    if len(parts) > 1 and "HEC Approved PhD Supervisor" in parts[1]:
        hec_approved = True

    new_data = {'Id': member_id,
                'Name': member.find(class_='text-center').text,
                'Designation': designation,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find(class_='mb-0 text-center').text,
                'Department': ee_department,
                'Extension': extension_number,
                'ImageURL': 'https://lhr.nu.edu.pk' + member.find('img', class_='card-img-top rounded-circle mt-3 mb-0 d-block mx-auto').get('src')}
    
    new_df = pd.DataFrame([new_data])
    lhr_faculty = pd.concat([lhr_faculty, new_df], ignore_index=True)

Getting all the information about the faculty of department of Civil Engineering

In [231]:
cv_info = soup.find('div', id= 'cv')

Getting department name

In [232]:
cv_department = cv_info.find(class_='mb-2 mt-3').text

Getting list of all faculty members

In [233]:
cv_faculty_list = cv_info.findAll(class_='col-lg-3 col-md-4 col-sm-6 col-12')

In [234]:
for member in cv_faculty_list:
    profileURL = 'https://lhr.nu.edu.pk' + member.find(class_='faculty-link').get('href')
    member_id = int(profileURL.split('/')[-1])
    
    # Sending req to profileURL
    profileURL_res = requests.get(profileURL)
    
    profileURL_soup = BeautifulSoup(profileURL_res.content, 'html.parser')
    
    # Getting the latest education of the faculty member
    degree = profileURL_soup.find(class_='col-lg-8 col-md-6 col-sm-12 text-justify')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
        
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # Getting phone tag to get Ext from it
    phone_tag = profileURL_soup.find('span', class_='small')

    if phone_tag is not None:
        phone_tag_text = phone_tag.text
        if phone_tag_text:
            # Getting the digits after Ext:
            extension_match = re.search(r'Ext:(\d+)$', phone_tag_text)
            if extension_match:
                extension_number = extension_match.group(1)
        else:
            extension_number = None        
    else:
        extension_number = None
        
    # Checking the designation and whether the member is HEC approved Phd advisor or not
    text = member.find('p', class_='small text-center font-italic').text.strip()

    # text is splitted on the basis of \n removing extra spaces between them
    parts = [part.strip() for part in text.split("\n") if part.strip()]

    # first part which is the designation
    designation = parts[0]

    # Checking HEC approval
    hec_approved = False
    if len(parts) > 1 and "HEC Approved PhD Supervisor" in parts[1]:
        hec_approved = True

    new_data = {'Id': member_id,
                'Name': member.find(class_='text-center').text,
                'Designation': designation,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find(class_='mb-0 text-center').text,
                'Department': cv_department,
                'Extension': extension_number,
                'ImageURL': 'https://lhr.nu.edu.pk' + member.find('img', class_='card-img-top rounded-circle mt-3 mb-0 d-block mx-auto').get('src')}
    
    new_df = pd.DataFrame([new_data])
    lhr_faculty = pd.concat([lhr_faculty, new_df], ignore_index=True)

Getting all the information about the faculty of Fast School of Management

In [235]:
fsm_info = soup.find('div', id= 'fsm')

Getting department name

In [236]:
fsm_department = fsm_info.find(class_='mb-2 mt-3').text

Getting list of all faculty members

In [237]:
fsm_faculty_list = fsm_info.findAll(class_='col-lg-3 col-md-4 col-sm-6 col-12')

In [238]:
for member in fsm_faculty_list:
    profileURL = 'https://lhr.nu.edu.pk' + member.find(class_='faculty-link').get('href')
    member_id = int(profileURL.split('/')[-1])
    
    # Sending req to profileURL
    profileURL_res = requests.get(profileURL)
    
    profileURL_soup = BeautifulSoup(profileURL_res.content, 'html.parser')
    
    # Getting the latest education of the faculty member
    degree = profileURL_soup.find(class_='col-lg-8 col-md-6 col-sm-12 text-justify')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
        
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # Getting phone tag to get Ext from it
    phone_tag = profileURL_soup.find('span', class_='small')

    if phone_tag is not None:
        phone_tag_text = phone_tag.text
        if phone_tag_text:
            # Getting the digits after Ext:
            extension_match = re.search(r'Ext:(\d+)$', phone_tag_text)
            if extension_match:
                extension_number = extension_match.group(1)
        else:
            extension_number = None        
    else:
        extension_number = None
        
    # Checking the designation and whether the member is HEC approved Phd advisor or not
    text = member.find('p', class_='small text-center font-italic').text.strip()
    
    # text is splitted on the basis of \n removing extra spaces between them
    parts = [part.strip() for part in text.split("\n") if part.strip()]

    # first part which is the designation
    designation = parts[0]

    # Checking HEC approval
    hec_approved = False
    if len(parts) > 1 and "HEC Approved PhD Supervisor" in parts[1]:
        hec_approved = True

    new_data = {'Id': member_id,
                'Name': member.find(class_='text-center').text,
                'Designation': designation,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find(class_='mb-0 text-center').text,
                'Department': fsm_department,
                'Extension': extension_number,
                'ImageURL': 'https://lhr.nu.edu.pk' + member.find('img', class_='card-img-top rounded-circle mt-3 mb-0 d-block mx-auto').get('src')}
    
    new_df = pd.DataFrame([new_data])
    lhr_faculty = pd.concat([lhr_faculty, new_df], ignore_index=True)

Getting all the information about the faculty of Department of Science and Humanities

In [239]:
ss_info = soup.find('div', id= 'ss')

Getting department name

In [240]:
ss_department = ss_info.find(class_='mb-2 mt-3').text

Getting list of all faculty members

In [241]:
ss_faculty_list = ss_info.findAll(class_='col-lg-3 col-md-4 col-sm-6 col-12')

In [242]:
for member in ss_faculty_list:
    profileURL = 'https://lhr.nu.edu.pk' + member.find(class_='faculty-link').get('href')
    member_id = int(profileURL.split('/')[-1])
    
    # Sending req to profileURL
    profileURL_res = requests.get(profileURL)
    
    profileURL_soup = BeautifulSoup(profileURL_res.content, 'html.parser')
    
    # Getting the latest education of the faculty member
    degree = profileURL_soup.find(class_='col-lg-8 col-md-6 col-sm-12 text-justify')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
        
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # Getting phone tag to get Ext from it
    phone_tag = profileURL_soup.find('span', class_='small')

    if phone_tag is not None:
        phone_tag_text = phone_tag.text
        if phone_tag_text:
            # Getting the digits after Ext:
            extension_match = re.search(r'Ext:(\d+)$', phone_tag_text)
            if extension_match:
                extension_number = extension_match.group(1)
        else:
            extension_number = None        
    else:
        extension_number = None
        
    # Checking the designation and whether the member is HEC approved Phd advisor or not
    text = member.find('p', class_='small text-center font-italic').text.strip()
    
    # text is splitted on the basis of \n removing extra spaces between them
    parts = [part.strip() for part in text.split("\n") if part.strip()]

    # first part which is the designation
    designation = parts[0]

    # Checking HEC approval
    hec_approved = False
    if len(parts) > 1 and "HEC Approved PhD Supervisor" in parts[1]:
        hec_approved = True

    new_data = {'Id': member_id,
                'Name': member.find(class_='text-center').text,
                'Designation': designation,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find(class_='mb-0 text-center').text,
                'Department': ss_department,
                'Extension': extension_number,
                'ImageURL': 'https://lhr.nu.edu.pk' + member.find('img', class_='card-img-top rounded-circle mt-3 mb-0 d-block mx-auto').get('src')}
    
    new_df = pd.DataFrame([new_data])
    lhr_faculty = pd.concat([lhr_faculty, new_df], ignore_index=True)

In [243]:
# Changing type of the columns
lhr_faculty[['Id', 'Extension']] = lhr_faculty[['Id', 'Extension']].astype(int)
lhr_faculty['HEC Approved PhD Supervisor'] = lhr_faculty['HEC Approved PhD Supervisor'].astype(bool)
lhr_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']] = lhr_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']].astype('string')

In [244]:
lhr_faculty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Id                           195 non-null    int32 
 1   Name                         195 non-null    string
 2   Designation                  195 non-null    string
 3   HEC Approved PhD Supervisor  195 non-null    bool  
 4   Highest Education            195 non-null    string
 5   Email                        195 non-null    string
 6   Department                   195 non-null    string
 7   Extension                    195 non-null    int32 
 8   ImageURL                     195 non-null    string
dtypes: bool(1), int32(2), string(6)
memory usage: 11.0 KB


In [245]:
lhr_faculty['Department'].value_counts()

Department
FAST School of Computing Faculty                89
FAST School of Management Faculty               32
Department of Science & Humanities Faculty      30
Department of Electrical Engineering Faculty    23
Department of Civil Engineering Faculty         21
Name: count, dtype: Int64

In [246]:
lhr_faculty.sample(4)

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
131,6571,Ms. Maham Akram,Lab Engineer,False,BS (Civil Engineering),maham.akram@nu.edu.pk,Department of Civil Engineering Faculty,447,https://lhr.nu.edu.pk/media/Faculty/90_-_Maham...
122,6839,Mr. Asim Sharif,Lecturer,False,MS (Civil Engineering),asim.sharif@nu.edu.pk,Department of Civil Engineering Faculty,446,https://lhr.nu.edu.pk/media/Faculty/6839.png
16,4335,Dr. Asma Ahmad Farhan,Assistant Professor,True,PhD (CS),asma.ahmad@nu.edu.pk,FAST School of Computing Faculty,212,https://lhr.nu.edu.pk/media/Faculty/54._Dr_Asm...
25,5466,Dr. Zeeshan Ali Rana,Assistant Professor,True,Ph.D (CS),zeeshan.rana@nu.edu.pk,FAST School of Computing Faculty,562,https://lhr.nu.edu.pk/media/Faculty/66_Dr_Zees...


Converting into csv file

In [159]:
lhr_faculty.to_csv('lhr.csv')

# Scraping Fast, Islamabad website to get Faculty Information

In [160]:
import json

Computing Faculty

In [161]:
isb_cs_faculty_URL = 'http://isb.nu.edu.pk/Faculty/GetAllEmp?id=301'

In [162]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

# making req to get info about cs faculty
cs_res = requests.get(isb_cs_faculty_URL, headers=headers)

# printing res
print(cs_res)

<Response [200]>


In [163]:
# converting response from json into dictionary
cs_faculty_info = json.loads(cs_res.content)

In [164]:
isb_faculty_info = []

In [165]:
# storing cs faculty info
for info in cs_faculty_info:
    isb_faculty_info.append({
      'Id': info.get('Emp_ID'),
      'Name': info.get('Name'),
      'Designation': info.get('Designation_Title'),
      'HEC Approved PhD Supervisor': info.get('hec_supervisor'),
      'Highest Education': None,
      'Email': info.get('Email'),
      'Department': info.get('Dept_ID'),
      'Extension': info.get('Extension'),
      'ImageURL': info.get('ImagePath')
    })

Electrical Faculty

In [166]:
isb_ee_faculty_URL = 'http://isb.nu.edu.pk/Faculty/GetAllEmp?id=302'

# making req to get info about ee faculty
ee_res = requests.get(isb_ee_faculty_URL, headers=headers)
print(ee_res)

<Response [200]>


In [167]:
# converting response in json into dictionary
ee_faculty_info = json.loads(ee_res.content)

In [168]:
# storing ee faculty info
for info in ee_faculty_info:
    isb_faculty_info.append({
      'Id': info.get('Emp_ID'),
      'Name': info.get('Name'),
      'Designation': info.get('Designation_Title'),
      'HEC Approved PhD Supervisor': info.get('hec_supervisor'),
      'Highest Education': None,
      'Email': info.get('Email'),
      'Department': info.get('Dept_ID'),
      'Extension': info.get('Extension'),
      'ImageURL': info.get('ImagePath')
    })

Management Faculty

In [169]:
isb_mgmt_faculty_URL = 'http://isb.nu.edu.pk/Faculty/GetAllEmp?id=303'

# making req to get info about mgmt faculty
mgmt_res = requests.get(isb_mgmt_faculty_URL, headers=headers)
print(mgmt_res)

<Response [200]>


In [170]:
# converting response in json into dictionary
mgmt_faculty_info = json.loads(mgmt_res.content)

In [171]:
# storing mgmt faculty info
for info in mgmt_faculty_info:
    isb_faculty_info.append({
      'Id': info.get('Emp_ID'),
      'Name': info.get('Name'),
      'Designation': info.get('Designation_Title'),
      'HEC Approved PhD Supervisor': info.get('hec_supervisor'),
      'Highest Education': None,
      'Email': info.get('Email'),
      'Department': info.get('Dept_ID'),
      'Extension': info.get('Extension'),
      'ImageURL': info.get('ImagePath')
    })

Science and Humanities Faculty

In [172]:
isb_ss_faculty_URL = 'http://isb.nu.edu.pk/Faculty/GetAllEmp?id=313'

# making req to get info about ss faculty
ss_res = requests.get(isb_ss_faculty_URL, headers=headers)
print(ss_res)

<Response [200]>


In [173]:
# converting response in json into dictionary
ss_faculty_info = json.loads(ss_res.content)

In [174]:
# storing ss faculty info
for info in ss_faculty_info:
    isb_faculty_info.append({
      'Id': info.get('Emp_ID'),
      'Name': info.get('Name'),
      'Designation': info.get('Designation_Title'),
      'HEC Approved PhD Supervisor': info.get('hec_supervisor'),
      'Highest Education': None,
      'Email': info.get('Email'),
      'Department': info.get('Dept_ID'),
      'Extension': info.get('Extension'),
      'ImageURL': info.get('ImagePath')
    })

Converting into Dataframe

In [175]:
isb_faculty = pd.DataFrame(isb_faculty_info)

In [176]:
# chaging extension datatype into int
isb_faculty['Extension'] = pd.to_numeric(isb_faculty['Extension'], errors='coerce').fillna(0).astype(int)

In [177]:
# changing id datatype into int
isb_faculty['Id'] = isb_faculty['Id'].astype(int)

In [178]:
# chaging datatypes of Name, desg, email, dept, imageURL into string
isb_faculty['Name'] = isb_faculty['Name'].astype('string')
isb_faculty['Designation'] = isb_faculty['Designation'].astype('string')
isb_faculty['Highest Education'] = isb_faculty['Highest Education'].astype('string')
isb_faculty['Email'] = isb_faculty['Email'].astype('string')
isb_faculty['Department'] = isb_faculty['Department'].astype('string')
isb_faculty['ImageURL'] = isb_faculty['ImageURL'].astype('string')

In [179]:
# chaginging datatype of HEC Approved PhD Supervisor into bool
isb_faculty['HEC Approved PhD Supervisor'] = isb_faculty['HEC Approved PhD Supervisor'].astype(bool)

In [180]:
# removing unwanted spaces from name
isb_faculty['Name'] = isb_faculty['Name'].str.strip()

In [181]:
depts_list = []

for idx, dept in isb_faculty['Department'].items():
    if dept == '301':
        depts_list.append('CS')
    elif dept == '302':
        depts_list.append('EE')
    elif dept == '303':
        depts_list.append('MG')
    elif dept == '313':
        depts_list.append('SS')

In [182]:
isb_faculty['Department'] = depts_list

Accessing education

In [183]:
edu_detail_URL = 'http://isb.nu.edu.pk/Faculty/Details1'

In [184]:
degree_list = []

In [185]:
for idx, ids in isb_faculty['Id'].items():
    data = {'id': int(ids)}
    myres = requests.post(edu_detail_URL, headers=headers, json=data)

    temp = json.loads(myres.content)
    if 'listEEdu' in temp and len(temp['listEEdu']) > 0:
        degree_list.append(temp['listEEdu'][0]['Degree_Name'])
    else:
        degree_list.append('None')

In [186]:
isb_faculty['Highest Education'] = degree_list

In [187]:
isb_faculty.head(5)

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
0,4553,Waseem Shahzad,Director & Professor,True,Ph.D,waseem.shahzad@nu.edu.pk,CS,101,/Images/Profile/CS/4553-3.jpg
1,4551,Hasan Mujtaba Kiyani,"Head, School of Computing & Professor",True,Ph.D,hasan.mujtaba@nu.edu.pk,CS,626,/Images/Profile/CS/4551.jpg
2,4400,Hammad Majeed,HoD (Computer Science) & Professor,True,P.hD,hammad.majeed@nu.edu.pk,CS,672,/Images/Profile/CS/4400.jpg
3,5099,Muhammad Asim,HoD (Cyber Security) & Professor,True,P.hD,muhammad.asim@nu.edu.pk,CS,503,/Images/Profile/CS/5099.jpg
4,5200,Usman Habib,HoD (Software Engineering) & Associate Professor,True,PhD,usman.habib@nu.edu.pk,CS,0,/Images/Profile/CS/5200.jpg


In [188]:
isb_faculty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Id                           240 non-null    int32 
 1   Name                         240 non-null    string
 2   Designation                  240 non-null    string
 3   HEC Approved PhD Supervisor  240 non-null    bool  
 4   Highest Education            240 non-null    object
 5   Email                        240 non-null    string
 6   Department                   240 non-null    object
 7   Extension                    240 non-null    int32 
 8   ImageURL                     237 non-null    string
dtypes: bool(1), int32(2), object(2), string(4)
memory usage: 13.5+ KB


Converting into csv file

In [189]:
isb_faculty.to_csv('isb.csv')

# Scraping Fast, Peshawar website to get Faculty Information

In [190]:
psw_URL = 'http://pwr.nu.edu.pk/'
psw_res = requests.get(psw_URL)

In [191]:
psw_soup = BeautifulSoup(psw_res.content, 'html.parser')

In [192]:
psw_faculty_href = []

In [193]:
# loop through all <a> tags and extract the 'href' attribute
for link in psw_soup.findAll('a'):
    href = link.get('href')
    if href == 'cs-faculty/':
        psw_faculty_href.append(psw_URL + href)
    
    elif href == 'ee-faculty/':
        psw_faculty_href.append(psw_URL + href)
        
    elif href == 'lab-technician-ee-staff/':
        psw_faculty_href.append(psw_URL + href)
    
    elif href == "sh-faculty/":
        psw_faculty_href.append(psw_URL + href)   
        
    elif href == 'hec-faculty/':
        psw_faculty_href.append(psw_URL + href)    

In [194]:
# assigning each link to a variable
if len(psw_faculty_href) == 5:
    psw_cs_faculty_href, psw_ee_faculty_href, psw_ee_lab_faculty_href, psw_sh_faculty_href, psw_hec_faculty_link = psw_faculty_href

In [357]:
# dataframe to store psw faculty data
psw_faculty = pd.DataFrame(columns=['Id', 'Name', 'Designation', 'HEC Approved PhD Supervisor', 'Highest Education', 'Email', 'Department', 'Extension', 'ImageURL'])

HEC approved supervisor list

In [248]:
psw_hec_faculty_res = requests.get(psw_hec_faculty_link)
psw_hec_faculty_res

<Response [200]>

In [249]:
psw_hec_faculty_soup = BeautifulSoup(psw_hec_faculty_res.content, 'html.parser')

In [250]:
psw_hec_faculty_allDivs = psw_hec_faculty_soup.findAll('div', class_='faculty-member')

In [251]:
psw_hec_faculty_list = []

In [252]:
for faculty_member in psw_hec_faculty_allDivs:
    name = faculty_member.find('div', class_='faculty-details').find('h2').text.strip()
    psw_hec_faculty_list.append(name)

Computing Faculty

In [253]:
psw_cs_href_res = requests.get(psw_cs_faculty_href)

In [254]:
psw_cs_soup = BeautifulSoup(psw_cs_href_res.content, 'html.parser')

In [255]:
psw_cs_faculty_list = psw_cs_soup.find_all('div', class_='faculty-member')

In [358]:
for member in psw_cs_faculty_list:
    # id of member
    input_str = member.find('h2').find('a').get('href')
    # Regular expression to extract the `id` value
    match = re.search(r"faculty-profile\.php\?id=(\d+)", input_str)
    if match:
        # ID value
        member_id = match.group(1)
        
    else:
        member_id = None
        
    # name of member    
    name = member.find('h2').text
    
    # designation, email, extension str
    p_list = member.find_all('p')
    if len(p_list) == 3:
        designation_name, email, extension_str = p_list
    
    designation = designation_name.text
    email = email.text
    extension_str = extension_str.text
    
    # Regular expression to find extension number
    match = re.search(r'Ext\.\s*(\d+)', extension_str)
    # If there's a match, extract the number, else set to None
    extension_number = match.group(1) if match else None
    
    # hec_approved or not
    if name in psw_hec_faculty_list:
        hec_approved = True
    else:
        hec_approved = False   
    
    # finding department name
    match = re.search(r'/(\w+)-faculty', psw_cs_faculty_href)

    if match:
        department = match.group(1)
    
    # image
    imageURL = psw_URL + member.find('img').get('src')
    
    # getting highest educaton
    psw_cs_faculty_profile_res = requests.get(psw_URL + input_str)
    psw_cs_soup2 = BeautifulSoup(psw_cs_faculty_profile_res.content, 'html.parser')
    edu_str = psw_cs_soup2.find('div', class_='tab-content').find(class_='tab-pane').find('li').text.strip()
    
    match = re.match(r'^([^,]+)', edu_str)
    if match:
        highest_education = match.group(1)
    else:
        highest_education = None
    
    new_data = {'Id': member_id,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    psw_faculty = pd.concat([psw_faculty, new_df], ignore_index=True)

Electrical Faculty

In [257]:
psw_ee_href_res = requests.get(psw_ee_faculty_href)
psw_ee_href_res

<Response [200]>

In [258]:
psw_ee_soup = BeautifulSoup(psw_ee_href_res.content, 'html.parser')

In [259]:
psw_ee_faculty_list = psw_ee_soup.find_all('div', class_='faculty-member')

In [359]:
for member in psw_ee_faculty_list:
    # id of member
    input_str = member.find('h2').find('a').get('href')
    # Regular expression to extract the `id` value
    match = re.search(r"faculty-profile\.php\?id=(\d+)", input_str)
    if match:
        # ID value
        member_id = match.group(1)
        
    else:
        member_id = None
        
    # name of member    
    name = member.find('h2').text
    
    # designation, email, extension str
    p_list = member.find_all('p')
    if len(p_list) == 3:
        designation_name, email, extension_str = p_list
    
    designation = designation_name.text
    email = email.text
    extension_str = extension_str.text
    
    # Regular expression to find extension number
    match = re.search(r'Ext\.\s*(\d+)', extension_str)
    # If there's a match, extract the number, else set to None
    extension_number = match.group(1) if match else None
    
    # hec_approved or not
    if name in psw_hec_faculty_list:
        hec_approved = True
    else:
        hec_approved = False   
    
    # department name
    # Use regex to find 'cs' followed by a hyphen and any other characters
    match = re.search(r'/(\w+)-faculty', psw_ee_faculty_href)

    if match:
        department = match.group(1)
    
    # image
    imageURL = psw_URL + member.find('img').get('src')
    
    # getting highest educaton
    psw_cs_faculty_profile_res = requests.get(psw_URL + input_str)
    psw_cs_soup2 = BeautifulSoup(psw_cs_faculty_profile_res.content, 'html.parser')
    edu_str = psw_cs_soup2.find('div', class_='tab-content').find(class_='tab-pane').find('li').text.strip()
    
    match = re.match(r'^([^,]+)', edu_str)
    if match:
        highest_education = match.group(1)
    else:
        highest_education = None
    
    new_data = {'Id': member_id,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    psw_faculty = pd.concat([psw_faculty, new_df], ignore_index=True)

In [360]:
psw_faculty

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
0,3704,Fazl-e-Basit,Assistant Professor & HoD,False,M.S(Computer Science),fazl.basit@nu.edu.pk,cs,110.0,http://pwr.nu.edu.pk/images/faculty/thumb3704.jpg
1,9237,Dr. Omar Usman Khan,Professor & Director,True,Ph.D. (Computer And Control Engineering),omar.khan@nu.edu.pk,cs,103.0,http://pwr.nu.edu.pk/images/faculty/thumb9237.jpg
2,9315,Dr. Hafeez Ur Rehman,Associate Professor (On Leave),True,Ph.D. in Computer and Control Engineering,hafeez.urrehman@nu.edu.pk,cs,,http://pwr.nu.edu.pk/images/faculty/thumb9315.jpg
3,9482,Dr. Nouman Azam,Associate Professor,True,Ph.D. (CS),nouman.azam@nu.edu.pk,cs,134.0,http://pwr.nu.edu.pk/images/faculty/thumb9482.jpg
4,6833,Dr. Hafeez Anwar,Associate Professor,False,PhD (Informatics),hafeez.anwar@nu.edu.pk,cs,,http://pwr.nu.edu.pk/images/faculty/thumb6833.jpg
5,3692,Shoaib Muhammad Khan,Assistant Professor,False,M.S(Computer Science),shoaib.khan@nu.edu.pk,cs,144.0,http://pwr.nu.edu.pk/images/faculty/thumb3692.jpg
6,9348,Dr. Taimoor Khan,Assistant Professor (On Leave),True,Ph.D. (Computer Science),taimoor.khan@nu.edu.pk,cs,,http://pwr.nu.edu.pk/images/faculty/thumb9348.jpg
7,5436,Dr. Bilal Khan,Assistant Professor,True,PhD (CE),khan.bilal@nu.edu.pk,cs,,http://pwr.nu.edu.pk/images/faculty/thumb5436.jpg
8,4564,Dr. Muhammad Amin,Assistant Professor,False,PhD (Computer Science),muhammad.amin@nu.edu.pk,cs,127.0,http://pwr.nu.edu.pk/images/faculty/thumb4564.jpg
9,4918,Dr. Waqas Ali,Assistant Professor,False,PhD(Computer Science),waqas.ali@nu.edu.pk,cs,107.0,http://pwr.nu.edu.pk/images/faculty/thumb4918.jpg


Electrical Lab Technician

In [262]:
psw_ee_lab_href_res = requests.get(psw_ee_lab_faculty_href)
psw_ee_lab_href_res

<Response [200]>

In [263]:
psw_ee_lab_soup = BeautifulSoup(psw_ee_lab_href_res.content, 'html.parser')

In [264]:
psw_ee_lab_technician_div = psw_ee_lab_soup.find('div', class_='row mtli-row-clearfix')

In [265]:
psw_ee_lab_technician_list = psw_ee_lab_technician_div.findAll('div', class_='col-xs-12 col-sm-6 col-md-3 sm-text-center mb-30 mb-sm-30')

In [361]:
for member in psw_ee_lab_technician_list:
    # name of member    
    name = member.find('h4', class_='text-uppercase font-raleway font-weight-600 m-0').text
    
    # designation, email, extension str
    designation = member.findAll(class_='text-theme-color')[0].text
    
    extension_str = member.findAll(class_='text-theme-color')[-1].text
    # Regular expression to find extension number
    match = re.search(r'Ext\.\s*(\d+)', extension_str)
    # If there's a match, extract the number, else set to None
    extension_number = match.group(1) if match else None
     
    # department name
    match = re.search(r'/(\w+)-faculty', psw_ee_lab_faculty_href)
    if match:
        department = match.group(1)
    
    # image
    imageURL = psw_URL + member.find('img', class_='img-fullwidth').get('src')
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': None,
            'Highest Education': None,
            'Email': None,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    psw_faculty = pd.concat([psw_faculty, new_df], ignore_index=True)

Science and Humanites Department

In [267]:
psw_sh_href_res = requests.get(psw_sh_faculty_href)
psw_sh_href_res

<Response [200]>

In [268]:
psw_sh_soup = BeautifulSoup(psw_sh_href_res.content, 'html.parser')

In [269]:
psw_sh_faculty_list = psw_sh_soup.find_all('div', class_='faculty-member')

In [362]:
for member in psw_sh_faculty_list:
    # id of member
    input_str = member.find('h2').find('a').get('href')
    # Regular expression to extract the `id` value
    match = re.search(r"faculty-profile\.php\?id=(\d+)", input_str)
    if match:
        # ID value
        member_id = match.group(1)
        
    else:
        member_id = None
        
    # name of member    
    name = member.find('h2').text
    
    # designation, email, extension str
    p_list = member.find_all('p')
    if len(p_list) == 3:
        designation_name, email, extension_str = p_list
    
    designation = designation_name.text
    email = email.text
    extension_str = extension_str.text
    
    # Regular expression to find extension number
    match = re.search(r'Ext\.\s*(\d+)', extension_str)
    # If there's a match, extract the number, else set to None
    extension_number = match.group(1) if match else None
    
    # hec_approved or not
    if name in psw_hec_faculty_list:
        hec_approved = True
    else:
        hec_approved = False   
    
    # department name
    # Use regex to find 'cs' followed by a hyphen and any other characters
    match = re.search(r'/(\w+)-faculty', psw_sh_faculty_href)

    if match:
        department = match.group(1)
    
    # image
    imageURL  = psw_URL + member.find('img').get('src')
    
    # getting highest educaton
    psw_cs_faculty_profile_res = requests.get(psw_URL + input_str)
    psw_cs_soup2 = BeautifulSoup(psw_cs_faculty_profile_res.content, 'html.parser')
    edu_str = psw_cs_soup2.find('div', class_='tab-content').find(class_='tab-pane').find('li').text.strip()
    
    match = re.match(r'^([^,]+)', edu_str)
    if match:
        highest_education = match.group(1)
    else:
        highest_education = None
    
    new_data = {'Id': member_id,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    psw_faculty = pd.concat([psw_faculty, new_df], ignore_index=True)

In [363]:
# Changing type of the columns
psw_faculty[['Id', 'Extension']] = psw_faculty[['Id', 'Extension']].fillna(0).astype(int)
psw_faculty['HEC Approved PhD Supervisor'] = psw_faculty['HEC Approved PhD Supervisor'].astype(bool)
psw_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']] = psw_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']].astype('string')

In [364]:
psw_faculty.head(5)

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
0,3704,Fazl-e-Basit,Assistant Professor & HoD,False,M.S(Computer Science),fazl.basit@nu.edu.pk,cs,110,http://pwr.nu.edu.pk/images/faculty/thumb3704.jpg
1,9237,Dr. Omar Usman Khan,Professor & Director,True,Ph.D. (Computer And Control Engineering),omar.khan@nu.edu.pk,cs,103,http://pwr.nu.edu.pk/images/faculty/thumb9237.jpg
2,9315,Dr. Hafeez Ur Rehman,Associate Professor (On Leave),True,Ph.D. in Computer and Control Engineering,hafeez.urrehman@nu.edu.pk,cs,0,http://pwr.nu.edu.pk/images/faculty/thumb9315.jpg
3,9482,Dr. Nouman Azam,Associate Professor,True,Ph.D. (CS),nouman.azam@nu.edu.pk,cs,134,http://pwr.nu.edu.pk/images/faculty/thumb9482.jpg
4,6833,Dr. Hafeez Anwar,Associate Professor,False,PhD (Informatics),hafeez.anwar@nu.edu.pk,cs,0,http://pwr.nu.edu.pk/images/faculty/thumb6833.jpg


In [365]:
psw_faculty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Id                           45 non-null     int32 
 1   Name                         45 non-null     string
 2   Designation                  45 non-null     string
 3   HEC Approved PhD Supervisor  45 non-null     bool  
 4   Highest Education            41 non-null     string
 5   Email                        41 non-null     string
 6   Department                   45 non-null     string
 7   Extension                    45 non-null     int32 
 8   ImageURL                     45 non-null     string
dtypes: bool(1), int32(2), string(6)
memory usage: 2.6 KB


Converting into CSV file

In [222]:
psw_faculty.to_csv('psw.csv')

# Scraping Fast, Karachi website to get Faculty Information

In [278]:
# URLs for each department
khi_cs_faculty_href =  "https://khi.nu.edu.pk/faculty-php/"
khi_cyber_faculty_href = "https://khi.nu.edu.pk/department-of-cyber-security/"
khi_ai_faculty_href = "https://khi.nu.edu.pk/department-of-artificial-intelligence/"
khi_se_faculty_href = "https://khi.nu.edu.pk/department-of-software-engineering/"
khi_ee_faculty_href = "https://khi.nu.edu.pk/department-of-electrical-engineering/"
khi_mgs_faculty_href = "https://khi.nu.edu.pk/department-of-management-sciences/"
khi_sh_faculty_href = "https://khi.nu.edu.pk/department-of-sciences-humanities/"

In [279]:
# Datafrme to store kharachi faculty data
khi_faculty = pd.DataFrame(columns=['Id', 'Name', 'Designation', 'HEC Approved PhD Supervisor', 'Highest Education', 'Email', 'Department', 'Extension', 'ImageURL'])

Department of Computer Science Faculty

In [280]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_cs_faculty_href_res = requests.get(khi_cs_faculty_href, headers=headers)
khi_cs_faculty_href_res

<Response [200]>

In [281]:
khi_cs_soup = BeautifulSoup(khi_cs_faculty_href_res.content, 'html.parser')

In [282]:
khi_cs_faculty_sections = khi_cs_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [283]:
for member in khi_cs_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_cs_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

Department of Cyber Security Faculty

In [284]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_cyber_faculty_href_res = requests.get(khi_cyber_faculty_href, headers=headers)
khi_cyber_faculty_href_res

<Response [200]>

In [285]:
khi_cyber_soup = BeautifulSoup(khi_cyber_faculty_href_res.content, 'html.parser')

In [286]:
khi_cyber_faculty_sections = khi_cyber_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [287]:
for member in khi_cyber_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_cyber_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

Department of AI Faculty

In [288]:
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_ai_faculty_href_res = requests.get(khi_ai_faculty_href, headers=headers)
khi_ai_faculty_href_res

<Response [200]>

In [289]:
khi_ai_soup = BeautifulSoup(khi_ai_faculty_href_res.content, 'html.parser')

In [290]:
khi_ai_faculty_sections = khi_ai_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [291]:
for member in khi_ai_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_ai_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

Department of SE Faculty

In [292]:
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_se_faculty_href_res = requests.get(khi_se_faculty_href, headers=headers)
khi_se_faculty_href_res

<Response [200]>

In [293]:
khi_se_soup = BeautifulSoup(khi_se_faculty_href_res.content, 'html.parser')

In [294]:
khi_se_faculty_sections = khi_se_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [295]:
for member in khi_se_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_se_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

Department of EE Faculty

In [296]:
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_ee_faculty_href_res = requests.get(khi_ee_faculty_href, headers=headers)
khi_ee_faculty_href_res

<Response [200]>

In [297]:
khi_ee_soup = BeautifulSoup(khi_ee_faculty_href_res.content, 'html.parser')

In [298]:
khi_ee_faculty_sections = khi_ee_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [299]:
for member in khi_ee_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_ee_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

Department of Managemnet Science Faculty

In [300]:
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_mgs_faculty_href_res = requests.get(khi_mgs_faculty_href, headers=headers)
khi_mgs_faculty_href_res

<Response [200]>

In [301]:
khi_mgs_soup = BeautifulSoup(khi_mgs_faculty_href_res.content, 'html.parser')

In [302]:
khi_mgs_faculty_sections = khi_mgs_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [303]:
for member in khi_mgs_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_mgs_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

Department of Science and Humanities Faculty

In [304]:
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
khi_sh_faculty_href_res = requests.get(khi_sh_faculty_href, headers=headers)
khi_sh_faculty_href_res

<Response [200]>

In [305]:
khi_sh_soup = BeautifulSoup(khi_sh_faculty_href_res.content, 'html.parser')

In [306]:
khi_sh_faculty_sections = khi_sh_soup.find_all('div', class_='gdlr-core-personnel-list-column')

In [307]:
for member in khi_sh_faculty_sections:
    # Extracting the faculty details
    name = member.find('h3', class_='gdlr-core-personnel-list-title').get_text(strip=True)
    designation = member.find('div', class_='gdlr-core-personnel-list-position').get_text(strip=True)
    email = member.find('div', class_='kingster-type-email').get_text(strip=True)
    extension_number = member.find('div', class_='kingster-type-phone').get_text(strip=True) if member.find('div', class_='kingster-type-phone') else None
    imageURL = member.find('img')['src']
    faculty_url = member.find('a')['href']
    
    # Finding department
    department = khi_sh_soup.find('h1', class_='kingster-page-title').text
    
    # Check for PhD Supervisor from faculty details (assuming further detail page can have this info)
    hec_approved = 'HEC Approved PhD Supervisor' in requests.get(faculty_url).text

    # Visit individual faculty profile to get highest education
    response3 = requests.get(faculty_url)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    educationMainDiv = soup3.find('div', class_='gdlr-core-pbf-wrapper-container')\
        .find('div', class_='gdlr-core-column-40')\
        .find('div', class_='gdlr-core-pbf-column-content')\
        .findAll('div', class_='gdlr-core-pbf-element')

    highest_education = None

    # Iterate through the education all possible elements
    for k in range(0, len(educationMainDiv), 1):
        highestEduMain = educationMainDiv[k].find('div', class_='gdlr-core-item-pdlr')
        if highestEduMain is not None:
            highest_education = highestEduMain.find('ul')
            if highest_education is not None:
                highest_education = highest_education.find('li')
                if highest_education is not None:
                    highest_education = highest_education.find('div', class_='gdlr-core-icon-list-content-wrap')
                    if highest_education is not None:
                        # Extract the highest education text
                        highest_education = highest_education.find('span').text.strip()
                        break
                    else:
                        highest_education = highestEduMain.find('span', class_='gdlr-core-title-item-caption')
            
            # Check if education is available
            educationAvailable = highestEduMain.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
            if educationAvailable:
                educationAvailable = educationAvailable.text.strip()

            # Exit loop if highest education is found and available
            if highest_education is not None and k > 1 and educationAvailable is not None:
                highest_education = highest_education.strip()
                break
            else:
                highest_education = None
                continue
    
    new_data = {'Id': None,
            'Name': name,
            'Designation': designation,
            'HEC Approved PhD Supervisor': hec_approved,
            'Highest Education': highest_education,
            'Email': email,
            'Department': department,
            'Extension': extension_number,
            'ImageURL': imageURL}
    
    new_df = pd.DataFrame([new_data])
    khi_faculty = pd.concat([khi_faculty, new_df], ignore_index=True)

In [308]:
# Replace non-numeric values with NaN, then fill NaN with 0, and convert to int
khi_faculty[['Id', 'Extension']] = (
    khi_faculty[['Id', 'Extension']]
    .replace('-', pd.NA)
    .fillna(0)
    .infer_objects(copy=False)   # Explicitly infer types after fillna
    .astype(int)
)

# Other conversions
khi_faculty['HEC Approved PhD Supervisor'] = khi_faculty['HEC Approved PhD Supervisor'].astype(bool)
khi_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']] = (
    khi_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']].astype('string')
)

  .fillna(0)


In [309]:
khi_faculty.head(5)

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
0,0,"Prof. Dr. Zulfiqar Ali Memon , PhD",Professor & Director,True,"PhD (Vrije University) Amsterdam, The Netherlands",zulfiqar.memon@nu.edu.pk,Department of Computer Science,213,https://khi.nu.edu.pk/wp-content/uploads/2023/...
1,0,"Dr. Muhammad Atif Tahir, PhD",Professor & Head of School (FSC),False,Deep Learning,atif.tahir@nu.edu.pk,Department of Computer Science,214,https://khi.nu.edu.pk/wp-content/uploads/2023/...
2,0,"Dr. Jawwad A. Shamsi, PhD",Professor & Dean,False,,jawwad.shamsi@nu.edu.pk,Department of Computer Science,218,https://khi.nu.edu.pk/wp-content/uploads/2023/...
3,0,"Dr. Ghufran Ahmed, PhD",Professor,False,"PostDoc (UK), PhD (CS)",ghufran.ahmed@nu.edu.pk,Department of Computer Science,233,https://khi.nu.edu.pk/wp-content/uploads/2023/...
4,0,"Dr. Nouman Durrani, PhD",Associate Professor,False,,muhammad.nouman@nu.edu.pk,Department of Computer Science,133,https://khi.nu.edu.pk/wp-content/uploads/2023/...


In [310]:
khi_faculty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Id                           147 non-null    int32 
 1   Name                         147 non-null    string
 2   Designation                  147 non-null    string
 3   HEC Approved PhD Supervisor  147 non-null    bool  
 4   Highest Education            100 non-null    string
 5   Email                        147 non-null    string
 6   Department                   147 non-null    string
 7   Extension                    147 non-null    int32 
 8   ImageURL                     147 non-null    string
dtypes: bool(1), int32(2), string(6)
memory usage: 8.3 KB


Converting into CSV file

In [130]:
khi_faculty.to_csv('khi.csv')

# Scraping Fast, Chiniot-Faislabad website to get Faculty Information

In [9]:
cfd_depts_URL = 'https://cfd.nu.edu.pk/all-departments/'
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}
cfd_res = requests.get(cfd_depts_URL, headers=headers)

In [10]:
cfd_depts_info = BeautifulSoup(cfd_res.content, 'html.parser')

In [11]:
# getting all divs havings links of each department
all_depts = cfd_depts_info.find('div', class_='kc-elm kc-css-633007 kc_row kc_row_inner')

In [13]:
# finding link tags for each department
all_depts_link_tags = all_depts.find_all('a')

In [14]:
href_links = []

# iterating each a tag in order to get link for each department
for dept in all_depts_link_tags:
    href = dept.get('href')
    # storing the href we get inside a list named as href_links
    href_links.append(href)

In [15]:
# checking whether href_links contain or not; if they exist store each link to respective variables
if len(href_links) == 6:
    cfd_cs_dept_link, cfd_se_dept_link, cfd_ai_dept_link, cfd_ee_dept_link, cfd_sh_dept_link, cfd_fsm_dept_link = href_links   
else:
    print('Error, the department links are not 4')

In [311]:
# dataframe to store cfd faculty data
cfd_faculty = pd.DataFrame(columns=['Id', 'Name', 'Designation', 'HEC Approved PhD Supervisor', 'Highest Education', 'Email', 'Department', 'Extension', 'ImageURL'])

CS Department

In [312]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

cfd_cs_dept_res = requests.get(cfd_cs_dept_link, headers=headers)

In [313]:
cfd_cs_dept = BeautifulSoup(cfd_cs_dept_res.content, 'html.parser')

In [314]:
cfd_cs_faculty_info = cfd_cs_dept.findAll('div', class_='unitech-teacher')

In [315]:
for member in cfd_cs_faculty_info:
    
    # checking whether supervisor is hec approved or not
    hec_element = member.find('ul', class_='unitech-teacher-social-icon').find('p', class_='hec')
    if hec_element:
        hec_approved = True
    else:
        hec_approved = False
    
    # getting department info using parsing
    match = re.search(r'(cs)', cfd_cs_dept_link)
    if match:
        department = match.group(1)
    else:
        department = None
        
    # extracting faculity url    
    cfd_cs_faculty_info_URL = member.find('div', class_='unitech-teacher__thumb').find('a').get('href')
    headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
    }
    # recieving response after requesting faculty url
    cfd_cs_faculty_info_res = requests.get(cfd_cs_faculty_info_URL, headers= headers)
    
    cfd_cs_faculty_info_soup = BeautifulSoup(cfd_cs_faculty_info_res.content, 'html.parser')
    
    degree = cfd_cs_faculty_info_soup.find('div', class_='htc__skill__container progress__bar--2')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
    
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # extracting extension number information
    extension_link =  cfd_cs_faculty_info_soup.find('ul',class_='teacher__address')
    list_items = extension_link.find_all('li')
    ext_text = [li.get_text(strip=True) for li in list_items if 'Ext:' in li.get_text(strip=True)]
    text = [chr.replace('Ext:', '').strip() for chr in ext_text]
    extension_number = ' | '.join(text) if text else None
                            
    new_data = {'Id': None,
                'Name': member.find(class_='unitech-teacher__inner').find('h4').text.strip(),
                'Designation': member.find(class_='unitech-teacher__inner').find('h6').text,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find('ul', class_='unitech-teacher-social-icon').find('p').text,
                'Department': department,
                'Extension': extension_number,
                'ImageURL': member.find('div', class_='unitech-teacher__thumb').find('img').get('src')}
    
    new_df = pd.DataFrame([new_data])
    cfd_faculty = pd.concat([cfd_faculty, new_df], ignore_index=True)

Software Engineering Department

In [316]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

cfd_se_dept_res = requests.get(cfd_se_dept_link, headers=headers)

In [317]:
cfd_se_dept = BeautifulSoup(cfd_se_dept_res.content, 'html.parser')

In [318]:
cfd_se_faculty_info = cfd_se_dept.findAll('div', class_='unitech-teacher')

In [319]:
for member in cfd_se_faculty_info:
    
    # checking whether supervisor is hec approved or not
    hec_element = member.find('ul', class_='unitech-teacher-social-icon').find('p', class_='hec')
    if hec_element:
        hec_approved = True
    else:
        hec_approved = False
    
    # getting department info using parsing
    match = re.search(r'(se)', cfd_se_dept_link)
    if match:
        department = match.group(1)
    else:
        department = None
        
    # extracting faculity url    
    cfd_se_faculty_info_URL = member.find('div', class_='unitech-teacher__thumb').find('a').get('href')
    headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
    }
    # recieving response after requesting faculty url
    cfd_se_faculty_info_res = requests.get(cfd_se_faculty_info_URL, headers= headers)
    
    cfd_se_faculty_info_soup = BeautifulSoup(cfd_se_faculty_info_res.content, 'html.parser')
    
    degree = cfd_se_faculty_info_soup.find('div', class_='htc__skill__container progress__bar--2')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
    
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # extracting extension number information
    extension_link =  cfd_se_faculty_info_soup.find('ul',class_='teacher__address')
    list_items = extension_link.find_all('li')
    ext_text = [li.get_text(strip=True) for li in list_items if 'Ext:' in li.get_text(strip=True)]
    text = [chr.replace('Ext:', '').strip() for chr in ext_text]
    extension_number = ' | '.join(text) if text else None
                            
    new_data = {'Id': None,
                'Name': member.find(class_='unitech-teacher__inner').find('h4').text.strip(),
                'Designation': member.find(class_='unitech-teacher__inner').find('h6').text,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find('ul', class_='unitech-teacher-social-icon').find('p').text,
                'Department': department,
                'Extension': extension_number,
                'ImageURL': member.find('div', class_='unitech-teacher__thumb').find('img').get('src')}
    
    new_df = pd.DataFrame([new_data])
    cfd_faculty = pd.concat([cfd_faculty, new_df], ignore_index=True)

AI Department

In [320]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

cfd_ai_dept_res = requests.get(cfd_ai_dept_link, headers=headers)

In [321]:
cfd_ai_dept = BeautifulSoup(cfd_ai_dept_res.content, 'html.parser')

In [322]:
cfd_ai_faculty_info = cfd_ai_dept.findAll('div', class_='unitech-teacher')

In [323]:
for member in cfd_ai_faculty_info:
    
    # checking whether supervisor is hec approved or not
    hec_element = member.find('ul', class_='unitech-teacher-social-icon').find('p', class_='hec')
    if hec_element:
        hec_approved = True
    else:
        hec_approved = False
    
    # getting department info using parsing
    match = re.search(r'(ai)', cfd_ai_dept_link)
    if match:
        department = match.group(1)
    else:
        department = None
        
    # extracting faculity url    
    cfd_ai_faculty_info_URL = member.find('div', class_='unitech-teacher__thumb').find('a').get('href')
    headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
    }
    # recieving response after requesting faculty url
    cfd_ai_faculty_info_res = requests.get(cfd_ai_faculty_info_URL, headers= headers)
    
    cfd_ai_faculty_info_soup = BeautifulSoup(cfd_ai_faculty_info_res.content, 'html.parser')
    
    degree = cfd_ai_faculty_info_soup.find('div', class_='htc__skill__container progress__bar--2')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
    
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # extracting extension number information
    extension_link =  cfd_ai_faculty_info_soup.find('ul',class_='teacher__address')
    list_items = extension_link.find_all('li')
    ext_text = [li.get_text(strip=True) for li in list_items if 'Ext:' in li.get_text(strip=True)]
    text = [chr.replace('Ext:', '').strip() for chr in ext_text]
    extension_number = ' | '.join(text) if text else None
                            
    new_data = {'Id': None,
                'Name': member.find(class_='unitech-teacher__inner').find('h4').text.strip(),
                'Designation': member.find(class_='unitech-teacher__inner').find('h6').text,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find('ul', class_='unitech-teacher-social-icon').find('p').text,
                'Department': department,
                'Extension': extension_number,
                'ImageURL': member.find('div', class_='unitech-teacher__thumb').find('img').get('src')}
    
    new_df = pd.DataFrame([new_data])
    cfd_faculty = pd.concat([cfd_faculty, new_df], ignore_index=True)

Electrical Department

In [324]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

cfd_ee_dept_res = requests.get(cfd_ee_dept_link, headers=headers)

In [325]:
cfd_ee_dept = BeautifulSoup(cfd_ee_dept_res.content, 'html.parser')

In [326]:
cfd_ee_faculty_info = cfd_ee_dept.findAll('div', class_='unitech-teacher')

In [327]:
for member in cfd_ee_faculty_info:
    
    # checking whether supervisor is hec approved or not
    hec_element = member.find('ul', class_='unitech-teacher-social-icon').find('p', class_='hec')
    if hec_element:
        hec_approved = True
    else:
        hec_approved = False
    
    # getting department info using parsing
    match = re.search(r'(ee)', cfd_ee_dept_link)
    if match:
        department = match.group(1)
    else:
        department = None
        
    # extracting faculity url    
    cfd_ee_faculty_info_URL = member.find('div', class_='unitech-teacher__thumb').find('a').get('href')
    headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
    }
    # recieving response after requesting faculty url
    cfd_ee_faculty_info_res = requests.get(cfd_ee_faculty_info_URL, headers= headers)
    
    cfd_ee_faculty_info_soup = BeautifulSoup(cfd_ee_faculty_info_res.content, 'html.parser')
    
    degree = cfd_ee_faculty_info_soup.find('div', class_='htc__skill__container progress__bar--2')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
    
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # extracting extension number information
    extension_link =  cfd_ee_faculty_info_soup.find('ul',class_='teacher__address')
    list_items = extension_link.find_all('li')
    ext_text = [li.get_text(strip=True) for li in list_items if 'Ext:' in li.get_text(strip=True)]
    text = [chr.replace('Ext:', '').strip() for chr in ext_text]
    extension_number = ' | '.join(text) if text else None
                            
    new_data = {'Id': None,
                'Name': member.find(class_='unitech-teacher__inner').find('h4').text.strip(),
                'Designation': member.find(class_='unitech-teacher__inner').find('h6').text,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find('ul', class_='unitech-teacher-social-icon').find('p').text,
                'Department': department,
                'Extension': extension_number,
                'ImageURL': member.find('div', class_='unitech-teacher__thumb').find('img').get('src')}
    
    new_df = pd.DataFrame([new_data])
    cfd_faculty = pd.concat([cfd_faculty, new_df], ignore_index=True)

Science and Humanities Department

In [328]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

cfd_sh_dept_res = requests.get(cfd_sh_dept_link, headers=headers)

In [329]:
cfd_sh_dept = BeautifulSoup(cfd_sh_dept_res.content, 'html.parser')

In [330]:
cfd_sh_faculty_info = cfd_sh_dept.findAll('div', class_='unitech-teacher')

In [331]:
for member in cfd_sh_faculty_info:
    
    # checking whether supervisor is hec approved or not
    hec_element = member.find('ul', class_='unitech-teacher-social-icon').find('p', class_='hec')
    if hec_element:
        hec_approved = True
    else:
        hec_approved = False
    
    # getting department info using parsing
    match = re.search(r'(sh)', cfd_sh_dept_link)
    if match:
        department = match.group(1)
    else:
        department = None
        
    # extracting faculity url    
    cfd_sh_faculty_info_URL = member.find('div', class_='unitech-teacher__thumb').find('a').get('href')
    headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
    }
    # recieving response after requesting faculty url
    cfd_sh_faculty_info_res = requests.get(cfd_sh_faculty_info_URL, headers= headers)
    
    cfd_sh_faculty_info_soup = BeautifulSoup(cfd_sh_faculty_info_res.content, 'html.parser')
    
    degree = cfd_sh_faculty_info_soup.find('div', class_='htc__skill__container progress__bar--2')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
    
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # extracting extension number information
    extension_link =  cfd_sh_faculty_info_soup.find('ul',class_='teacher__address')
    list_items = extension_link.find_all('li')
    ext_text = [li.get_text(strip=True) for li in list_items if 'Ext:' in li.get_text(strip=True)]
    text = [chr.replace('Ext:', '').strip() for chr in ext_text]
    extension_number = ' | '.join(text) if text else None
                            
    new_data = {'Id': None,
                'Name': member.find(class_='unitech-teacher__inner').find('h4').text.strip(),
                'Designation': member.find(class_='unitech-teacher__inner').find('h6').text,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find('ul', class_='unitech-teacher-social-icon').find('p').text,
                'Department': department,
                'Extension': extension_number,
                'ImageURL': member.find('div', class_='unitech-teacher__thumb').find('img').get('src')}
    
    new_df = pd.DataFrame([new_data])
    cfd_faculty = pd.concat([cfd_faculty, new_df], ignore_index=True)

Management Department

In [332]:
# define header to mimic a chrome browser
headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
}

cfd_fsm_dept_res = requests.get(cfd_fsm_dept_link, headers=headers)

In [333]:
cfd_fsm_dept = BeautifulSoup(cfd_fsm_dept_res.content, 'html.parser')

In [334]:
cfd_fsm_faculty_info = cfd_fsm_dept.findAll('div', class_='unitech-teacher')

In [335]:
for member in cfd_fsm_faculty_info:
    
    # checking whether supervisor is hec approved or not
    hec_element = member.find('ul', class_='unitech-teacher-social-icon').find('p', class_='hec')
    if hec_element:
        hec_approved = True
    else:
        hec_approved = False
    
    # getting department info using parsing
    match = re.search(r'(fsm)', cfd_fsm_dept_link)
    if match:
        department = match.group(1)
    else:
        department = None
        
    # extracting faculity url    
    cfd_fsm_faculty_info_URL = member.find('div', class_='unitech-teacher__thumb').find('a').get('href')
    headers = {
    'User-Agent': 'Chrome/92.0.4515.159 Safari/537.36'
    }
    # recieving response after requesting faculty url
    cfd_fsm_faculty_info_res = requests.get(cfd_fsm_faculty_info_URL, headers= headers)
    
    cfd_fsm_faculty_info_soup = BeautifulSoup(cfd_fsm_faculty_info_res.content, 'html.parser')
    
    degree = cfd_fsm_faculty_info_soup.find('div', class_='htc__skill__container progress__bar--2')
    if degree:
        # Attempt to find the first <li> element
        li_element = degree.find('li')
    
        if li_element:
            li_text = li_element.text.strip()
            
            # Getting the characters before first comma
            match = re.match(r'^[^,]+', li_text)
            if match:
                highest_education = match.group(0)  # Get the matched part
    
    # extracting extension number information
    extension_link =  cfd_fsm_faculty_info_soup.find('ul',class_='teacher__address')
    list_items = extension_link.find_all('li')
    ext_text = [li.get_text(strip=True) for li in list_items if 'Ext:' in li.get_text(strip=True)]
    text = [chr.replace('Ext:', '').strip() for chr in ext_text]
    extension_number = ' | '.join(text) if text else None
                            
    new_data = {'Id': None,
                'Name': member.find(class_='unitech-teacher__inner').find('h4').text.strip(),
                'Designation': member.find(class_='unitech-teacher__inner').find('h6').text,
                'HEC Approved PhD Supervisor': hec_approved,
                'Highest Education': highest_education,
                'Email': member.find('ul', class_='unitech-teacher-social-icon').find('p').text,
                'Department': department,
                'Extension': extension_number,
                'ImageURL': member.find('div', class_='unitech-teacher__thumb').find('img').get('src')}
    
    new_df = pd.DataFrame([new_data])
    cfd_faculty = pd.concat([cfd_faculty, new_df], ignore_index=True)

In [340]:
# Changing type of the columns
cfd_faculty[['Id', 'Extension']] = (
    cfd_faculty[['Id', 'Extension']].fillna(0).infer_objects().astype(int)
)
cfd_faculty['HEC Approved PhD Supervisor'] = cfd_faculty['HEC Approved PhD Supervisor'].astype(bool)
cfd_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']] = cfd_faculty[['Name', 'Designation', 'Highest Education', 'Email', 'Department', 'ImageURL']].astype('string')

In [344]:
cfd_faculty.sample(5)

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
18,0,Mr. Ali Hamza,Lecturer,False,M.S(Computer Science),ali.Hamza@nu.edu.pk,cs,122,https://cfd.nu.edu.pk/wp-content/uploads/2023/...
26,0,Mr. Masood Habib,Lecturer,False,M.S(Computer Science),m.habib@nu.edu.pk,cs,190,https://cfd.nu.edu.pk/wp-content/uploads/2021/...
45,0,Ms. Gul E,Instructor,False,M.S(Computer Science),Gul.Zahra@nu.edu.pk,cs,301,https://cfd.nu.edu.pk/wp-content/uploads/2024/...
27,0,Mr. Mazhar Hussain,Lecturer,False,M.S(Computer Science),mazhar.h@nu.edu.pk,cs,0,https://cfd.nu.edu.pk/wp-content/uploads/2019/...
108,0,Ms. Zainab Mubarik,Lecturer,False,MPhil. (Physics) 2018,zainab.mubarik@nu.edu.pk,sh,281,https://cfd.nu.edu.pk/wp-content/uploads/2023/...


In [342]:
cfd_faculty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Id                           121 non-null    int32 
 1   Name                         121 non-null    string
 2   Designation                  121 non-null    string
 3   HEC Approved PhD Supervisor  121 non-null    bool  
 4   Highest Education            121 non-null    string
 5   Email                        121 non-null    string
 6   Department                   121 non-null    string
 7   Extension                    121 non-null    int32 
 8   ImageURL                     121 non-null    string
dtypes: bool(1), int32(2), string(6)
memory usage: 6.9 KB


Converting into CSV file

In [91]:
cfd_faculty.to_csv('cfd.csv')

# Concatenating each dataframe to form faculty.csv

In [366]:
fast_faculty = pd.concat([lhr_faculty, isb_faculty, psw_faculty, khi_faculty, cfd_faculty], ignore_index=True)

In [369]:
fast_faculty

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
0,1238,Dr. Kashif Zafar,Professor & HOD,True,PhD (CS),kashif.zafar@nu.edu.pk,FAST School of Computing Faculty,569,https://lhr.nu.edu.pk/media/Faculty/01._Dr_Kas...
1,4027,Dr. Aamir Wali,Professor,True,PhD (CS),aamir.wali@nu.edu.pk,FAST School of Computing Faculty,572,https://lhr.nu.edu.pk/media/Faculty/51._Dr_Aam...
2,4391,Dr. Asif Mahmood Gillani,Professor,True,Ph.D(Computing),asif.gilani@nu.edu.pk,FAST School of Computing Faculty,558,https://lhr.nu.edu.pk/media/Faculty/02._Dr_Asi...
3,6113,Dr. Asma Naseer,Professor,True,PhD (CS),asma.naseer@nu.edu.pk,FAST School of Computing Faculty,634,https://lhr.nu.edu.pk/media/Faculty/07._Dr_Asm...
4,4329,Dr. Zareen Alamgir,Professor,True,Ph.D (CS),zareen.alamgir@nu.edu.pk,FAST School of Computing Faculty,612,https://lhr.nu.edu.pk/media/Faculty/03._Dr_Zar...
...,...,...,...,...,...,...,...,...,...
743,0,Mr. Syed Zeeshan,Assistant Professor,False,PhD Business Administration,zeeshan.syed@nu.edu.pk,fsm,266,https://cfd.nu.edu.pk/wp-content/uploads/2019/...
744,0,Mr. Ahmad Salah,Lecturer,False,PhD Business Administration,ahmad.salah@nu.edu.pk,fsm,256,https://cfd.nu.edu.pk/wp-content/uploads/2023/...
745,0,Ms. Amna Babar,Lecturer,False,PhD Business Administration,amna.Tirmizey@nu.edu.pk,fsm,0,https://cfd.nu.edu.pk/wp-content/uploads/2023/...
746,0,Mr. Hafiz Muhammad,Lecturer,False,MBA,Zeeshan.raza@nu.edu.pk,fsm,265,https://cfd.nu.edu.pk/wp-content/uploads/2019/...


In [385]:
fast_faculty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Id                           748 non-null    int32 
 1   Name                         748 non-null    string
 2   Designation                  748 non-null    string
 3   HEC Approved PhD Supervisor  748 non-null    bool  
 4   Highest Education            697 non-null    object
 5   Email                        744 non-null    string
 6   Department                   748 non-null    object
 7   Extension                    748 non-null    int32 
 8   ImageURL                     745 non-null    string
dtypes: bool(1), int32(2), object(2), string(4)
memory usage: 41.8+ KB


In [370]:
fast_faculty.to_csv('fast_faculty.csv')

# Forming sample.csv file

In [371]:
import pandas as pd

In [372]:
# loading faculty.csv
faculty = pd.read_csv('fast_faculty.csv')

In [383]:
faculty.drop(faculty.columns[0], axis=1, inplace=True)

In [392]:
faculty.sample(5)

Unnamed: 0,Id,Name,Designation,HEC Approved PhD Supervisor,Highest Education,Email,Department,Extension,ImageURL
291,6689,Atif Khurshid,Lecturer,False,MS,atif.khurshid@isb.nu.edu.pk\r\n,CS,0,/Images/Profile/CS/6689.jpg
292,6696,Mahnoor Tariq,Lecturer,False,MS,mahnoor.tariq@isb.nu.edu.pk\r\n,CS,0,/Images/Profile/CS/6696.jpg
290,6688,Muhammad Farrukh Bashir,Lecturer,False,MS,farrukh.bashir@isb.nu.edu.pk,CS,338,/Images/Profile/CS/6688.jpg
381,3813,Sadia Nadeem,Professor,True,PhD,sadia.nadeem@nu.edu.pk,MG,182,/Images/Profile/FSM/3813.jpg
448,5913,Sara Rehmat,Lecturer,False,MS (CS),sara.rehmat@nu.edu.pk,cs,147,http://pwr.nu.edu.pk/images/faculty/thumb5913.jpg


In [399]:
my_Rollno = 5257
my_Rollno_last_digit = 7

In [400]:
value = my_Rollno_last_digit / 10

In [402]:
sample_ = faculty.sample(frac= value)

In [404]:
sample_.to_csv('sample_.csv')