In [1]:
import time
import requests
import pandas as pd
from datetime import date
from bs4 import BeautifulSoup
from collections import defaultdict

In [2]:
class PhdBot:
    """Scrape PhD listings from findaphd.com into a pandas DataFrame.

    Result pages are fetched one after another. Every ``div.resultsRow`` that
    contains an ``<h3>`` title becomes one row holding the title, description,
    "read more" URL, deadline, the date of the scrape, and every
    ``DataLayerManager.dynamic<Key> = "<value>";`` assignment found in the
    row's markup.
    """

    # Seconds to wait for the server before a single request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, keyword=None, sleep: int = 2, retry_sleep: int = 60) -> None:
        """Configure the scraper.

        Args:
            keyword: Optional search term. ``None`` browses the unfiltered
                listing.
            sleep: Seconds to pause between consecutive result pages.
            retry_sleep: Seconds to wait before re-fetching a page that came
                back with too few result rows.
        """
        self.url = 'https://www.findaphd.com/phds/?PG={}'
        self.keyword_url = 'https://www.findaphd.com/phds/?Keywords={}&PG={}'
        self.opportunities = defaultdict(list)
        self.keyword = keyword
        self.sleep = sleep
        self.retry_sleep = retry_sleep

    def generate_opportunities(self):
        """Walk the result pages and return the collected listings.

        Paging stops when a page yields four or fewer ``resultsRow`` divs
        twice in a row (once before and once after waiting ``retry_sleep``
        seconds). NOTE(review): the threshold of 4 is inherited; presumably a
        page without listings still carries a few such divs - confirm.

        Returns:
            pandas.DataFrame: one row per listing with exact duplicate rows
            removed. The un-deduplicated frame is kept on
            ``self.opportunities``.
        """
        # Start from a clean store so the method can be called repeatedly
        # (the previous run leaves a DataFrame in ``self.opportunities``).
        self.opportunities = defaultdict(list)
        page = 1
        while True:
            url = self._buildURL(page)
            mydivs = self._parseURL(url)
            if len(mydivs) <= 4:
                time.sleep(self.retry_sleep)
                mydivs = self._parseURL(url)
                if len(mydivs) <= 4:
                    break
            for divs in mydivs:
                if divs is None:
                    continue
                title = self._getTitle(divs)
                if title is None:
                    continue
                self.opportunities['Title'].append(title)
                self.opportunities['Description'].append(self._getDescription(divs))
                self.opportunities['URL'].append(self._getread_more(divs))
                self.opportunities['Deadline'].append(self._getDeadline(divs))
                self.opportunities['SearchDate'].append(date.today())
                self._addAttributes(divs)
                # Attributes this listing did not provide become None so
                # every column keeps one entry per row.
                self._padColumns()
            page += 1
            time.sleep(self.sleep)
        print(f'Checked {page} pages.')
        self.opportunities = pd.DataFrame(self.opportunities)
        print(f'Found {len(self.opportunities)} results.')
        return self.opportunities.drop_duplicates()

    def _buildURL(self, page):
        """Return the URL of result page ``page``, including the keyword if set."""
        if self.keyword is None:
            return self.url.format(page)
        from urllib.parse import quote_plus
        return self.keyword_url.format(quote_plus(str(self.keyword)), page)

    def _padColumns(self):
        """Right-pad every known column with None up to the current row count."""
        n_rows = len(self.opportunities['Title'])
        # sorted() keeps the order of newly created columns reproducible.
        for key in sorted(self._validKeys.union(self.opportunities)):
            column = self.opportunities[key]
            column.extend([None] * (n_rows - len(column)))

    def _addAttributes(self, divs):
        """Store the ``DataLayerManager.dynamic*`` values found in ``divs``.

        Each line of ``str(divs)`` containing
        ``DataLayerManager.dynamic<Key> = "<value>";`` appends ``<value>`` to
        ``self.opportunities[<Key>]``. Values containing commas are stored as
        a sorted tuple of the comma-separated parts.

        Returns:
            set: the keys that were stored for this listing.
        """
        usedKeys = set()
        prefix = 'DataLayerManager.dynamic'
        # Index of the row being filled (its Title has already been appended).
        row = len(self.opportunities.get('Title', ())) - 1
        for line in str(divs).split('\n'):
            # Take what follows the marker, wherever it sits on the line.
            _, found, assignment = line.strip().partition(prefix)
            if not found or '=' not in assignment:
                continue
            # Split on the first '=' only: the value itself may contain '='.
            key, val = assignment.split('=', 1)
            key = key.strip()
            if key in usedKeys:
                # A repeated key would push the column out of step; keep the first.
                continue
            val = val.strip('; "')
            if ',' in val:
                val = tuple(sorted(val.split(',')))
            column = self.opportunities[key]
            # A key seen for the first time on a later row needs leading Nones.
            column.extend([None] * (row - len(column)))
            column.append(val)
            usedKeys.add(key)
        return usedKeys

    # This is the information we are trying to retrieve from the HTML website
    @property
    def _validKeys(self):
        """Attribute names that always get a column in the result."""
        return set([
            'DisciplineIds', 'DisciplineNames', 'SubjectIds',
            'SubjectNames', 'LocationCountryName', 'LocationCityName',
            'ProgrammeTypes', 'IId', 'InstitutionName', 'DId',
            'DepartmentName', 'FundingTypes'])

    def _parseURL(self, url):
        """Fetch ``url`` and return all ``<div class="resultsRow">`` elements."""
        headers = {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'GET',
            'Access-Control-Allow-Headers': 'Content-Type',
            'Access-Control-Max-Age': '3600',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        # headers must be passed by keyword: the second positional argument of
        # requests.get is ``params`` (query string), not the request headers.
        req = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(req.content, 'html.parser')
        result_class = 'resultsRow'
        return soup.find_all("div", class_=result_class)

    def _getTitle(self, divs):
        """Return the stripped text of the first ``<h3>``, or None if absent."""
        out = divs.find('h3')
        if out is not None:
            return out.get_text().strip()
        return None

    def _getread_more(self, divs):
        """Return the absolute "read more" link of the listing, or None."""
        out = divs.find('a', class_='phd-result__description--read-more')
        if out is None:
            return None
        href = out.get('href')
        if href is None:
            return None
        return 'https://www.findaphd.com' + href

    def _getDescription(self, divs):
        """Return the stripped text of ``div.descFrag``, or None if absent."""
        out = divs.find('div', class_='descFrag')
        if out is not None:
            return out.get_text().strip()
        return None

    def _getDeadline(self, divs):
        """Return the deadline badge text, or None.

        None is returned both when the badge is missing and when it reads
        'Year round applications'.
        """
        class_ = ('hoverTitle subButton badge text-wrap badge-light '
                  'card-badge p-2 m-1 font-weight-light')
        out = divs.find('a', class_=class_)
        if out is None:
            return None
        deadline = out.get_text().strip()
        if deadline == 'Year round applications':
            return None
        return deadline

In [3]:
# Scraper with the constructor defaults (keyword=None, sleep=2).
a = PhdBot()

In [8]:
pwd

'/Users/user/Library/CloudStorage/OneDrive-Nexus365/PhD/Internship/NLP_projects'

In [9]:
# Run the scrape, then snapshot the resulting DataFrame to a pickle named
# after today's date. The path is relative to the current working directory.
opportunities = a.generate_opportunities()
opportunities.to_pickle(f'../Data/{date.today()}.pkl')

Checked 1 pages.
Found 0 results.


In [None]:
# Export the table as tab-separated text with a header row.
# NOTE(review): the date in the file name is hard-coded, unlike the pickle
# snapshot which uses date.today().
path = '../Data/PhD_opportunities_24_10_2022.tsv'
opportunities.to_csv(path, sep='\t', header=True)

In [None]:
# Replace the in-memory frame with the stored '2022-10-24' snapshot.
# Unpickling can execute arbitrary code - only load files you trust.
opportunities = pd.read_pickle('../Data/2022-10-24.pkl')

In [None]:
# 'URL' is the identifier column used by the cells below. Check that no URL
# value occurs more than once (value_counts skips missing URLs by default).
index = 'URL'
assert opportunities[index].value_counts().max() == 1

In [None]:
# Long-format table of URL vs. discipline: list-like DisciplineNames entries
# are expanded into one row each, then written as a TSV.
discipline = opportunities[[index, 'DisciplineNames']].explode('DisciplineNames')
discipline.to_csv('../Data/PhD_opportunities_24_10_2022_discipline.tsv',sep='\t',header = True)

In [None]:
# Long-format table of URL vs. subject: list-like SubjectNames entries are
# expanded into one row each, then written as a TSV.
subject_name = opportunities[[index, 'SubjectNames']].explode('SubjectNames')
subject_name.to_csv('../Data/PhD_opportunities_24_10_2022_subject_name.tsv',sep='\t',header = True)

In [None]:
# Number of listings per city, sorted ascending by count.
opportunities['LocationCityName'].explode().value_counts().sort_values()

In [None]:
# Number of listings per department, sorted ascending by count.
opportunities['DepartmentName'].explode().value_counts().sort_values() 

In [None]:
# Most frequent discipline per country: counts are sorted ascending, so
# tail(1) keeps the entry with the highest count in each group.
opportunities.groupby('LocationCountryName')['DisciplineNames'].apply(
    lambda x: x.explode().value_counts().sort_values().tail(1))

In [None]:
# Most frequent funding type per country: counts are sorted ascending, so
# tail(1) keeps the entry with the highest count in each group.
opportunities.groupby('LocationCountryName')['FundingTypes'].apply(
    lambda x: x.explode().value_counts().sort_values().tail(1))

In [None]:
# Funding-type counts within each SubjectNames group, sorted ascending.
# NOTE(review): SubjectNames is not exploded before grouping, so each group is
# keyed on the whole cell value (possibly a tuple of several subjects) rather
# than on a single subject - confirm this is intended.
opportunities.groupby('SubjectNames')['FundingTypes'].apply(
    lambda x: x.explode().value_counts().sort_values())

In [None]:
# Scratch cell: scan the text rendering of `opportunities` line by line for
# 'DataLayerManager.dynamic<Key> = <value>' assignments and collect the keys.
# NOTE(review): this repeats the parsing done in PhdBot._addAttributes. `val`
# is computed but never stored, and the collected keys are not stripped of
# surrounding whitespace. Presumably left over from exploring raw HTML -
# confirm whether the cell is still needed.
keys = []
for line in str(opportunities).split('\n'):
    if 'DataLayerManager' in line.strip():
        key, val = line.strip().removeprefix('DataLayerManager.dynamic').split('=')
        val = val.strip('; "')
        # Comma-separated values become a sorted tuple of their parts.
        if ',' in val:
            val = tuple(sorted(val.split(',')))
        keys.append(key)

In [None]:
# NOTE(review): `.find('a', class_=...)` is the BeautifulSoup tag API. If
# `opportunities` is the DataFrame built in the cells above, this raises
# AttributeError. Presumably left over from inspecting a single result div -
# confirm or remove.
g = opportunities.find('a', class_='hoverTitle subButton badge text-wrap badge-light card-badge p-2 m-1 font-weight-light').get_text().strip()

In [None]:
# Show the text assigned to `g`.
g

In [None]:
# NOTE(review): `g` is assigned from `.get_text().strip()`; if that is a plain
# str, `str.find` accepts no `class_` keyword and this call raises TypeError.
g.find(class_='fas fa-calendar fa-icon')

In [None]:
# Export the table as comma-separated text with a header row.
opportunities.to_csv('../Data/PhD_opportunities_17_10_2022.csv',sep=',',header = True)

In [None]:
# Print the title of every listing. Iterating the DataFrame itself yields its
# column labels (strings), whose `.title` is the str.title method - so read
# the 'Title' column instead.
for title in opportunities['Title']:
    print(title)

In [None]:
# Wrap `opportunities` in a DataFrame and export it as tab-separated text.
df = pd.DataFrame(opportunities)
df.to_csv('../Data/PhD_opportunities_17_10_2022.tsv',sep='\t',header = True)

In [None]:
# Display the frame.
df

In [None]:
# Build a four-column table (title, institution, description, funding) with
# positional column labels 0-3.
# Iterating the DataFrame directly yields column labels, not rows, so walk
# the rows with itertuples() and read the scraped columns by name.
# NOTE(review): 'InstitutionName' and 'FundingTypes' are assumed to be the
# columns meant by the former `.university` / `.fund` attributes - confirm.
new_list = []
for opp in opportunities.itertuples(index=False):
    new_list.append([opp.Title, opp.InstitutionName, opp.Description, opp.FundingTypes])
df = pd.DataFrame(new_list)

In [None]:
# NOTE(review): `opp` is the loop variable of the previous cell. When
# `opportunities` is a DataFrame, that loop iterates column labels (strings),
# which have no `university` attribute - confirm or remove this cell.
opp.university

In [None]:
# Give the positional columns 0-3 descriptive names.
# 'University_Name' previously began with an accented 'Ú' (typo), which would
# end up verbatim in the exported CSV header.
df = df.rename(columns={
    0: 'PhD_Title',
    1: 'University_Name',
    2: 'Description',
    3: 'funding_information',
})

In [None]:
# Quick check that a parenthesised pair is a tuple.
# Uses its own name: rebinding `a` here would replace the PhdBot instance
# that the scraping cell calls, breaking that cell when it is run again.
pair = (1, 2)
print(type(pair))

In [None]:
# Display the renamed frame.
df

In [None]:
# Export `df` as CSV.
# NOTE(review): this is the same path an earlier cell writes `opportunities`
# to, so that file is overwritten here - confirm this is intended.
df.to_csv('../Data/PhD_opportunities_17_10_2022.csv')

In [None]:
# Summary statistics of the columns of `df`.
df.describe()

In [None]:
# Confirm the Python type of `df`.
type(df)