In [1]:
import time
import requests
import pandas as pd
from datetime import date
from bs4 import BeautifulSoup
from collections import defaultdict



In [2]:
class PhdBot:
    def __init__(self, keyword=None, sleep: int = 2) -> None:
        #self.url = '{}&PG={}' if keyword is None else '{}Keywords={}&PG={}'
        self.url = 'https://www.findaphd.com/phds/?PG={}'
        self.opportunities = defaultdict(list)
        self.keyword = None
        self.sleep = sleep
    
    def generate_opportunities(self):
        page = 1
        while True:
            if self.keyword is None:
                url = self.url.format(str(page))
            else:
                url = self.url.format(self.keyword, str(page))
            mydivs = self._parseURL(url)
            if len(mydivs) <= 4:
                time.sleep(60)
                mydivs = self._parseURL(url)
                if len(mydivs) <= 4:
                    break
            for divs in mydivs:
                if divs is not None:
                    title = self._getTitle(divs)
                    if title is None:
                        continue
                    self.opportunities['Title'].append(title)
                    self.opportunities['Description'].append(self._getDescription(divs))
                    self.opportunities['URL'].append(self._getread_more(divs))
                    self.opportunities['Deadline'].append(self._getDeadline(divs))
                    self.opportunities['SearchDate'].append(date.today())
                    usedKeys = self._addAttributes(divs)
                    for missing in self._validKeys - usedKeys:
                        self.opportunities[key].append(None)
            page += 1
            time.sleep(self.sleep)
        print(f'Checked {page} pages.')
        self.opportunities = pd.DataFrame(self.opportunities)
        print(f'Found {len(self.opportunities)} results.')
        return self.opportunities.drop_duplicates()
 

    def _addAttributes(self, divs):
        usedKeys = set()
        prefix = 'DataLayerManager.dynamic'
        for line in str(divs).split('\n'):
            if prefix in line.strip():
                key, val = line.strip().removeprefix(prefix).split('=')
                val = val.strip('; "')
                key = key.strip()
                if ',' in val:
                    val = tuple(sorted(val.split(',')))
                self.opportunities[key].append(val)
                usedKeys.add(key)
        return usedKeys

    #This is the informtion we are trying to retrive from the HTML website
    @property
    def _validKeys(self):
        return set([
            'DisciplineIds', 'DisciplineNames', 'SubjectIds',
            'SubjectNames', 'LocationCountryName', 'LocationCityName',
            'ProgrammeTypes', 'IId', 'InstitutionName', 'DId',
            'DepartmentName', 'FundingTypes'])
    
    
    def _parseURL(self, url):
        headers = {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'GET',
            'Access-Control-Allow-Headers': 'Content-Type',
            'Access-Control-Max-Age': '3600',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        req = requests.get(url, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        result_class = 'resultsRow'
        return soup.find_all("div", class_=result_class)

    
    def _getTitle(self, divs):
        out = divs.find('h3') 
        if out is not None:
            return out.get_text().strip()
        
        
    def _getread_more(self,divs):
        out = divs.find('a', class_='phd-result__description--read-more') 
        if out is not None:
            return 'https://www.findaphd.com' + out.get('href')
    
    
    def _getDescription(self, divs):
        out = divs.find('div', class_='descFrag')
        if out is not None:
            return out.get_text().strip()
        
        
    def _getDeadline(self, divs):
        class_ = ('hoverTitle subButton badge text-wrap badge-light '
                  'card-badge p-2 m-1 font-weight-light')
        out = divs.find('a', class_=class_)
        if out is not None:
            deadline = out.get_text().strip()
            if deadline == 'Year round applications':
                return None
            else:
                return deadline

In [3]:
a = PhdBot()

In [4]:
opportunities = a.generate_opportunities()
opportunities.to_pickle(f'../Data/{date.today()}.pkl')

Checked 453 pages.
Found 6779 results.


In [25]:
path = '../Data/PhD_opportunities_24_10_2022.tsv'
opportunities.to_csv(path, sep='\t', header=True)

In [29]:
opportunities = pd.read_pickle('../Data/2022-10-24.pkl')

In [30]:
index = 'URL'
assert opportunities[index].value_counts().max() == 1

In [33]:
discipline = opportunities[[index, 'DisciplineNames']].explode('DisciplineNames')
discipline.to_csv('../Data/PhD_opportunities_24_10_2022_discipline.tsv',sep='\t',header = True)

In [34]:
subject_name = opportunities[[index, 'SubjectNames']].explode('SubjectNames')
subject_name.to_csv('../Data/PhD_opportunities_24_10_2022_subject_name.tsv',sep='\t',header = True)

In [6]:
opportunities['LocationCityName'].explode().value_counts().sort_values()

Prague           1
Bordeaux         1
Los Angeles      1
San Diego        1
Coleraine        1
              ... 
Leeds          189
Glasgow        207
Sheffield      487
London         507
Manchester     517
Name: LocationCityName, Length: 193, dtype: int64

In [7]:
opportunities['DepartmentName'].explode().value_counts().sort_values() 

School of Languages and Cultures (SLC)                                          1
EPSRC Centre for Doctoral Training in Compound Semiconductor Manufacturing      1
 Engineering and Computing                                                      1
Institute of Chemical Biology                                                   1
International Max Planck Research School for Molecular Organ Biology            1
                                                                             ... 
Department of Chemistry                                                       120
Department of Mechanical Engineering                                          202
Faculty of Biology                                                            297
 Medicine and Health                                                          297
                                                                              409
Name: DepartmentName, Length: 755, dtype: int64

In [8]:
opportunities.groupby('LocationCountryName')['DisciplineNames'].apply(
    lambda x: x.explode().value_counts().sort_values().tail(1))

LocationCountryName                        
Australia             Engineering                42
Austria               Biological Sciences         4
Belgium               Mathematics                 2
Canada                Engineering                 6
China                 Engineering                35
Czechia               Biological Sciences        14
Denmark               Engineering                 3
Estonia               Economics                   1
Finland               Economics                   1
France                Business & Management       4
Germany               Biological Sciences        36
Hong Kong             Engineering                 4
Hungary               Biological Sciences         2
Ireland               Engineering                39
Israel                Biological Sciences         8
Italy                 Engineering                 7
Japan                 Physics                     1
Lithuania             Biological Sciences         2
Macau               

In [9]:
opportunities.groupby('LocationCountryName')['FundingTypes'].apply(
    lambda x: x.explode().value_counts().sort_values().tail(1))

LocationCountryName     
Australia             UK     101
Austria               UK       4
Belgium               SF       2
Canada                UK       9
China                 UK      70
Czechia               SF      20
Denmark               UK       7
Estonia               UK       1
Finland               UK       2
France                UK       6
Germany               UK      63
Hong Kong             UK       9
Hungary               SF       2
Ireland               SF      73
Israel                UK       9
Italy                 UK      15
Japan                 UK       1
Lithuania             UK       2
Macau                 UK       1
Malaysia              UK       2
Netherlands           SF       5
New Zealand           SF      64
Norway                UK       5
Poland                UK      17
Singapore             UK       2
Slovenia              UK       3
South Korea           UK       2
Spain                 UK       2
Switzerland           UK       9
Taiwan            

In [213]:
opportunities.groupby('SubjectNames')['FundingTypes'].apply(
    lambda x: x.explode().value_counts().sort_values())

SubjectNames             
Accounting             SF      1
Aerospace Engineering  SF      1
                       UK      1
American Studies       EU      1
                       NE      1
                            ... 
Volcanology            UK      2
                       NE    114
                       EU    134
                       UK    150
                       SF    154
Name: FundingTypes, Length: 9149, dtype: int64

In [52]:
keys = []
for line in str(opportunities).split('\n'):
    if 'DataLayerManager' in line.strip():
        key, val = line.strip().removeprefix('DataLayerManager.dynamic').split('=')
        val = val.strip('; "')
        if ',' in val:
            val = tuple(sorted(val.split(',')))
        keys.append(key)

In [140]:
g = opportunities.find('a', class_='hoverTitle subButton badge text-wrap badge-light card-badge p-2 m-1 font-weight-light').get_text().strip()

AttributeError: 'DataFrame' object has no attribute 'find'

In [None]:
g

In [122]:
g.find(class_='fas fa-calendar fa-icon')

<i class="fas fa-calendar fa-icon"></i>

In [204]:
opportunities.to_csv('../Data/PhD_opportunities_17_10_2022.csv',sep=',',header = True)

In [216]:
for opportunity in opportunities:
   print(opportunity.title)



School of Management


The Roles of Transposable Elements as Oncogenic Regulators in Acute Myeloid Leukaemia

Novel methods for the rational development of molecular glues

The generation of new Penicillin variants

Fungal project: Biosynthetic pathway discovery and characterization

Exploring bacterial enzymes as new targets for treating infectious disease

Modelling of Resonant Acoustic Mixing Parameters

Probiotic Formulation for Dairy Application

Developing Methods for Big Data Capture in Support of the Digital Twin for Investment Casting Shelling Operations

3D Printing of Complex Dressings for the Regeneration of Diabetic Foot Ulcers

EASTBIO Linking pathogen diversity and dynamics to spatiotemporal metacommunity dynamics in a naturally fragmented landscape


Wellcome Trust 4-year PhD in Dynamic Molecular Cell Biology in Bristol


Clinically-motivated and physics-informed deep learning for contouring tumours using multimodal cancer imaging

PhD in Drosophila Behavioral Circuit

(BBSRC DTP) How does twitchy link to ciliary function in sperm cells

Development of amine-dehydrogenase and lyase biocatalysts for the sustainable manufacturing of unnatural chiral amino acids and amino alcohols
None
None
None
None

Mining the chemodiversity of the plant genus Myrica to reveal bioactive molecules for their medicinal uses

(Clinical) Development and validation of a radiosensitivity gene expression signature in prostate cancer

(BBSRC DTP) Epigenetic mechanisms linking maternal immune activation-induced transcriptional changes to adult behavioural impairment in a neurodevelopmental rat model

Understanding the immunomodulatory activity of pharmaceuticals in zebrafish using high content imaging and phenotypic profiling

(Non-Clinical) Development of PROTACs against the histone acetyl transferase MOZ/KAT6a for AMLs and other malignancies

Impact of pregnancy-specific glycoproteins on development and ageing

Nanoscale visualisation of bacteriocins in action on bacterial me


(MRC DTP) PROTEIN TRANSDUCTION TO CONTROL INFLAMMATION DURING TISSUE REPAIR AND REGENERATION


BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023



BBSRC EASTBIO Doctoral Training Partnership - Call for applications for 2023

None
None
None
None

(MRC 

In [214]:
df = pd.DataFrame(opportunities)
df.to_csv('../Data/PhD_opportunities_17_10_2022.tsv',sep='\t',header = True)

In [215]:
df

Unnamed: 0,Title,Deadline,SearchDate,DisciplineIds,DisciplineNames,SubjectIds,SubjectNames,LocationCountryName,LocationCityName,ProgrammeTypes,IId,InstitutionName,DId,DepartmentName,FundingTypes
0,Understanding of bubble-particle interactions ...,NaT,2022-10-19,12,Engineering,"(236, 246)","(Chemical Engineering, Fluid Mechanics)",United Kingdom,Birmingham,PhD Research Project,282,University of Birmingham,126,School of Chemical Engineering,"(EU, NE, SF, UK)"
1,QUT Digital Child Scholarship (PhD),NaT,2022-10-19,"(31, 7)","(Communication & Media Studies, Psychology)","(179, 454)","(Child Psychology, Digital Media)",Australia,Brisbane,PhD Research Project,636,Queensland University of Technology,7122,Australian Research Council Centre of Excellen...,"(EU, NE, SF, UK)"
2,Mechanochemistry with mechanical bonds – 3 stu...,2022-11-13,2022-10-19,6,Chemistry,"(170, 173, 177)","(Organic Chemistry, Other, Physical Chemistry)",United Kingdom,Manchester,PhD Research Project,332,The University of Manchester,1021,Department of Chemistry,"(EU, SF, UK)"
3,Meeting future net-zero carbon energy needs: I...,2022-11-21,2022-10-19,"(12, 13, 18, 24)","(Engineering, Environmental Sciences, Geology,...","(237, 245, 266, 303, 304, 305, 374)","(Civil Engineering, Environmental Engineering,...",United Kingdom,Glasgow,PhD Research Project,353,University of Strathclyde,3,Department of Civil & Environmental Engineering,"(EU, NE, SF, UK)"
4,Causal AI using EHR data for clinical decision...,NaT,2022-10-19,"(27, 8)","(Computer Science, Nursing & Health)","(186, 194, 199, 417)","(Artificial Intelligence, Health Informatics, ...",United Kingdom,London,PhD Research Project,194,Queen Mary University of London,1881,School of Physical and Chemical Sciences,"(EU, SF, UK)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5301,Institute for Design Innovation Research,NaT,2022-10-19,"(32, 7, 9)","(Communication & Media Studies, Creative Arts ...","(185, 216, 472, 473)","(Other, Other, Other, Social Work)",United Kingdom,London,Arts Research Programme,2122,Loughborough University London,,,"(EU, NE, SF, UK)"
5302,Institute for Media and Creative Industries Re...,NaT,2022-10-19,"(30, 7, 9)","(Communication & Media Studies, Creative Arts ...","(185, 216, 453)","(Other, Other, Other)",United Kingdom,London,Arts Research Programme,2122,Loughborough University London,,,"(EU, NE, SF, UK)"
5303,Institute for Sport Business Research,NaT,2022-10-19,"(10, 32, 33, 5)","(Business & Management, Economics, Sociology, ...","(163, 222, 470, 472, 473, 479)","(Gender Studies, Other, Other, Other, Other, S...",United Kingdom,London,Business Research Programme,2122,Loughborough University London,,,"(EU, NE, SF, UK)"
5304,The Institute for Sport and Physical Activity ...,NaT,2022-10-19,"(26, 27, 33)","(Medicine, Nursing & Health, Sport & Exercise ...","(395, 404, 417, 479)","(Epidemiology, Health Informatics, Other, Phys...",United Kingdom,Bedford,Social Sciences Research Programme,281,University of Bedfordshire,,,SF


In [261]:
new_list = []
for opp in opportunities:
  new_list.append([opp.title,opp.university,opp.description,opp.fund])
df = pd.DataFrame(new_list)

In [3]:
opp.university

NameError: name 'opp' is not defined

In [263]:
df = df.rename(columns={0:'PhD_Title',1:'Úniversity_Name',2:'Description',3:'funding_information'})

In [2]:
a = (1,2)
print(type(a))

<class 'tuple'>


In [264]:
df

Unnamed: 0,PhD_Title,Úniversity_Name,Description,funding_information
0,\n\nSchool of Management\n,"(University of Bath\n\nSchool of Management,)",The School of Management is one of the UK's le...,non-eu
1,\nThe Roles of Transposable Elements as Oncoge...,(Imperial College London\n\nDepartment of Haem...,We are looking for a talented and enthusiastic...,non-eu
2,\nNovel methods for the rational development o...,(University of Cambridge\n\nDepartment of Phar...,Applications are invited for 3.5-year PhD stud...,non-eu
3,\nThe generation of new Penicillin variants,(Victoria University of Wellington\n\nFerrier ...,A PhD scholarship is available to synthesize a...,non-eu
4,\nFungal project: Biosynthetic pathway discove...,(Victoria University of Wellington\n\nFerrier ...,The Ferrier Institute has established a world-...,non-eu
...,...,...,...,...
8704,\n\nPhDs at the University of Hradec Kralove\n,"(University of Hradec Kralove,)",A PUBLIC UNIVERSITY SINCE 1959!. UHK provides ...,uk
8705,,"(None,)",,uk
8706,,"(None,)",,uk
8707,,"(None,)",,uk


In [265]:
df.to_csv('../Data/PhD_opportunities_17_10_2022.csv')

In [266]:
df.describe()

Unnamed: 0,PhD_Title,Úniversity_Name,Description,funding_information
count,6869,8709,6869,8709
unique,2524,876,2400,3
top,\n\nBBSRC EASTBIO Doctoral Training Partnershi...,"(None,)","Talented and motivated students, passionate ab...",uk
freq,35,1840,110,3470


In [268]:
type(df)

pandas.core.frame.DataFrame