In [3]:
import requests
from bs4 import BeautifulSoup

# 要爬取的网址
url = 'https://ccr.cancer.gov/staff-directory?f%5B0%5D=position_type%3APrincipal%20Investigator&page=5'

# 发送 GET 请求获取网页内容
response = requests.get(url)

# 检查响应状态码
if response.status_code == 200:
    # 解析 HTML 内容
    html_content = response.text
    
    # 现在使用 BeautifulSoup 对 html_content 进行解析和提取所需信息
    soup = BeautifulSoup(html_content, 'html.parser')
    # 找到所有的<li>元素
    staff_list = soup.find_all('li', class_='grid-col-12 tablet:grid-col-6 margin-y-1')
    
else:
    print('Failed to retrieve HTML content. Status code:', response.status_code)


In [17]:
import pandas as pd
# 创建一个空列表来保存搜索结果
search_results = []



# 遍历每个<li>元素
for staff_item in staff_list:
    # 在<li>元素内找到<article>元素
    article = staff_item.find('article', class_='profile-teaser__wrapper')
    
    # 在<article>元素内找到姓名、职位和电子邮件信息
    name = article.find('span', class_='field--name-title').text.strip()
    position = article.find('div', class_='profile-teaser__position').text.strip()
    email = article.find('div', class_='field--name-field-staff-profile-email').text.strip()
    
    if email:
        email = email
    else:
        email = None
    
    
    # 在<article>元素内找到<a>标签，提取链接信息
    link = article.find('a')['href']
    # 将信息添加到搜索结果列表中
    search_results.append({'Name': name, 'Position': position, 'Email': email, 'Link': link})

# 将搜索结果转换为 DataFrame
df = pd.DataFrame(search_results)


In [18]:
###NOTICE


nciPI = df#只第一次网页的时候使用

In [None]:
###NOTICE



#将后面的网页内容合并上去，后几次用
nciPI = pd.concat([nciPI, df], ignore_index=True)

In [19]:
# 保存 DataFrame 到表格文件中，最后结果保存

nciPI.to_csv('nci_PI_list.csv', index=False)

# 整理名单

In [20]:
import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('nci_PI_list.csv')

# 删除重复行，保留第一个出现的重复行
df.drop_duplicates(keep='first', inplace=True)

# 将结果写入新的 CSV 文件
df.to_csv('nci_PI_list.csv', index=False)


In [21]:
#将链接补全，注意别重复运行
prefix = "https://ccr.cancer.gov"
df["Link"] = df["Link"].apply(lambda x: prefix + x)
# 将结果写入新的 CSV 文件
df.to_csv('nci_PI_list.csv', index=False)

In [23]:
#遍历名单中的每个链接，然后访问链接对应的页面并提取"Areas of Expertise"的内容
import pandas as pd
import requests
from bs4 import BeautifulSoup

# name_list是一个DataFrame
name_list = pd.read_csv('nci_PI_list.csv')

# 定义一个函数来获取"Areas of Expertise"的内容
def get_areas_of_expertise(Link):
    try:
        response = requests.get(Link)
        if response.status_code == 200:
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            areas_of_expertise_elem = soup.find('div', class_='field--name-field-areas-of-expertise')
            if areas_of_expertise_elem:
                areas_of_expertise = [item.text.strip() for item in areas_of_expertise_elem.find_all('div', class_='field__item')]
                return ', '.join(areas_of_expertise)
    except:
        pass
    return 'None'

# 在DataFrame中应用函数来获取"Areas of Expertise"的内容
name_list['areas_of_expertise'] = name_list['Link'].apply(get_areas_of_expertise)

# 打印更新后的DataFrame
print(name_list.head())


                          Name                        Position  \
0         Urbain Weyemi, Ph.D.           Stadtman Investigator   
1           Sue Wickner, Ph.D.  NIH Distinguished Investigator   
2   Brigitte C. Widemann, M.D.                           Chief   
3  Wyndham Wilson, M.D., Ph.D.             Senior Investigator   
4     David A. Wink Jr., Ph.D.                    Deputy Chief   

                   Email                                               Link  \
0  urbain.weyemi@nih.gov  https://ccr.cancer.gov/staff-directory/urbain-...   
1  wickners@mail.nih.gov  https://ccr.cancer.gov/staff-directory/sue-wic...   
2          bw42y@nih.gov  https://ccr.cancer.gov/staff-directory/brigitt...   
3   wilsonw@mail.nih.gov  https://ccr.cancer.gov/staff-directory/wyndham...   
4      wink@mail.nih.gov  https://ccr.cancer.gov/staff-directory/david-a...   

                                  areas_of_expertise  
0  Cancer Metabolism, Cancer Metabolism, Cancer a...  
1  Molecular Mecha

In [30]:
#添加一列building，一列location，一列phone
from bs4 import BeautifulSoup
import requests

# 从HTML页面中提取信息并返回一个字典
def extract_info(Link):
    response = requests.get(Link)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    building_tag = soup.find('li', class_='icon-building')
    location_tag = soup.find('li', class_='icon-location')
    phone_tag = soup.find('li', class_='icon-phone')
    
    building = building_tag.text.strip() if building_tag else 'None'
    location = location_tag.text.strip() if location_tag else 'None'
    phone = phone_tag.text.strip() if phone_tag else 'None'
    
    return {'Building': building, 'Location': location, 'Phone': phone}

# 对name list中的每个链接提取信息，并将结果添加到新的列中
name_list['Building'] = name_list['Link'].apply(lambda link: extract_info(link).get('Building', 'None'))
name_list['Location'] = name_list['Link'].apply(lambda link: extract_info(link).get('Location', 'None'))
name_list['Phone'] = name_list['Link'].apply(lambda link: extract_info(link).get('Phone', 'None'))

# 打印更新后的DataFrame
print(name_list.head())

                          Name                        Position  \
0         Urbain Weyemi, Ph.D.           Stadtman Investigator   
1           Sue Wickner, Ph.D.  NIH Distinguished Investigator   
2   Brigitte C. Widemann, M.D.                           Chief   
3  Wyndham Wilson, M.D., Ph.D.             Senior Investigator   
4     David A. Wink Jr., Ph.D.                    Deputy Chief   

                   Email                                               Link  \
0  urbain.weyemi@nih.gov  https://ccr.cancer.gov/staff-directory/urbain-...   
1  wickners@mail.nih.gov  https://ccr.cancer.gov/staff-directory/sue-wic...   
2          bw42y@nih.gov  https://ccr.cancer.gov/staff-directory/brigitt...   
3   wilsonw@mail.nih.gov  https://ccr.cancer.gov/staff-directory/wyndham...   
4      wink@mail.nih.gov  https://ccr.cancer.gov/staff-directory/david-a...   

                                  areas_of_expertise  \
0  Cancer Metabolism, Cancer Metabolism, Cancer a...   
1  Molecular Mec

In [31]:
#将research summary的文字内容添加到name list里面
from bs4 import BeautifulSoup
import requests

# 从HTML页面中提取文本内容
def extract_text(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    research_summary = soup.find('h2', text='RESEARCH SUMMARY')
    if research_summary:
        summary_text = ''
        for sibling in research_summary.find_next_siblings():
            if sibling.name == 'h2':
                break
            if sibling.name == 'p':
                summary_text += sibling.get_text() + '\n'
        return summary_text.strip()
    else:
        return 'None'

# 对name list中的每个链接提取文本内容，并将结果添加到新的列中
name_list['Research Summary'] = name_list['Link'].apply(lambda link: extract_text(link))

# 打印更新后的DataFrame
print(name_list.head())

                          Name                        Position  \
0         Urbain Weyemi, Ph.D.           Stadtman Investigator   
1           Sue Wickner, Ph.D.  NIH Distinguished Investigator   
2   Brigitte C. Widemann, M.D.                           Chief   
3  Wyndham Wilson, M.D., Ph.D.             Senior Investigator   
4     David A. Wink Jr., Ph.D.                    Deputy Chief   

                   Email                                               Link  \
0  urbain.weyemi@nih.gov  https://ccr.cancer.gov/staff-directory/urbain-...   
1  wickners@mail.nih.gov  https://ccr.cancer.gov/staff-directory/sue-wic...   
2          bw42y@nih.gov  https://ccr.cancer.gov/staff-directory/brigitt...   
3   wilsonw@mail.nih.gov  https://ccr.cancer.gov/staff-directory/wyndham...   
4      wink@mail.nih.gov  https://ccr.cancer.gov/staff-directory/david-a...   

                                  areas_of_expertise  \
0  Cancer Metabolism, Cancer Metabolism, Cancer a...   
1  Molecular Mec

In [32]:
# 保存 DataFrame 到表格文件中
name_list.to_csv('nci_PI_list.csv', index=False)