In [1]:
import requests
from bs4 import BeautifulSoup
import unicodedata

## Find Basic Info of a Photographer

In [None]:
def get_wiki_infobox(photographer):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": photographer,
        "format": "json",
        "prop": "text",
        "section": 0  # 获取首页内容，通常包含 infobox
    }
    response = requests.get(url, params=params)
    data = response.json()

    if "parse" not in data:
        print("未找到页面内容")
        return None

    html_content = data["parse"]["text"]["*"]
    soup = BeautifulSoup(html_content, "html.parser")
    infobox = soup.find("table", class_="infobox")
    return infobox

def extract_basic_info(infobox):
    info = {}
    if not infobox:
        return info

    # 遍历所有<tr>标签，查找“Born”和“Nationality”等行
    rows = infobox.find_all("tr")
    for row in rows:
        header = row.find("th")
        if header:
            key = header.get_text(strip=True)
            # 获取对应的<td>标签内容
            data_cell = row.find("td")
            if data_cell:
                value = data_cell.get_text(separator=" ", strip=True)
                # 根据常见的关键词匹配需要的信息
                if "Born" in key:
                    info["Born"] = value
                elif "Nationality" in key:
                    info["Nationality"] = value
                elif 'Website' in key:
                    info['Website'] = value
                elif "Name" in key and "Cindy Sherman" not in info:
                    info["Name"] = value
    return info


# Test case
# if __name__ == "__main__":
#     photographer = "Hiroshi Sugimoto"
#     infobox = get_wiki_infobox(photographer)
#     basic_info = extract_basic_info(infobox)
    
#     if basic_info:
#         print(f"Basic Info for {photographer}:")
#         for key, value in basic_info.items():
#             print(f"{key}: {value}")
#     else:
#         print("Didn't find any related info")

Basic Info for Hiroshi Sugimoto:
Born: ( 1948-02-23 ) February 23, 1948 (age 76) Tokyo, Japan
Website: www .sugimotohiroshi .com


## Find Publication and Exhibition Info of a Photographer

In [None]:
import requests
from bs4 import BeautifulSoup

def normalize_text(text):
    # 使用 NFKC 规范化字符，修正组合字符问题
    return unicodedata.normalize('NFKC', text)

def get_wiki_page_html(photographer):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": photographer,
        "format": "json",
        "prop": "text"
    }
    response = requests.get(url, params=params)
    # 确保使用 utf-8 编码
    response.encoding = "utf-8"
    data = response.json()
    if "parse" not in data:
        print(f"Didn't find page for {photographer}")
        return None
    html_content = data["parse"]["text"]["*"]
    # 标准化 HTML 文本
    return normalize_text(html_content)

def extract_publications_list(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    # find id = 'Publication' <h2>
    h2_publications = soup.find("h2", id="Publications")
    if not h2_publications:
        print("Didn't find <h2 id='Publications'>")
        return None
    # find next <ul> which will contain publication lists
    ul = h2_publications.find_next("ul")
    if not ul:
        print("did not find <h2 id='Publications'> <ul>")
        return None
    # extract all <li> info lists
    publications = [normalize_text(li.get_text(separator=" ", strip=True)) for li in ul.find_all("li")]
    return publications

# def extract_exhibition_list(html_content):
#     soup = BeautifulSoup(html_content, "html.parser")
#     # find id = 'Publication' <h2>
#     h2_exhibition = soup.find("h2", id="Exhibitions")
#     if not h2_exhibition:
#         print("Didn't find <h2 id='exhibition'>")
#         return None
#     # find next <ul> which will contain publication lists
#     ul = h2_exhibition.find_next("ul")
#     if not ul:
#         print("did not find <h2 id='exhibition'> <ul>")
#         return None
#     # extract all <li> info lists
#     exhibition = [normalize_text(li.get_text(separator=" ", strip=True)) for li in ul.find_all("li")]
#     return exhibition

# # Test case
# if __name__ == "__main__":
#     photographer = "Daidō Moriyama"
#     html_content = get_wiki_page_html(photographer)
#     if html_content:
#         pubs = extract_publications_list(html_content)
#         if pubs:
#             print("Extracted Publications List:")
#             for pub in pubs:
#                 print(pub)
#         else:
#             print("did not find publishment list, it may not shown in publication section in Wiki")
#         exhibitions = extract_exhibition_list(html_content)
#         if exhibitions:
#             print("Extracted exhibition List:")
#             for exhibition in exhibitions:
#                 print(exhibition)
#         else:
#             print("did not find exhibitions list, it may not shown in publication section in Wiki")
#     else:
#         print("did not get the HTML。")





Extracted Publications List:
Nippon Gekijo Shashincho (= Japan: A Photo Theater ). Muromachi Shob, 1968. 216 pages, with text in two places by Shūji Terayama (Japanese). Revised edition: Shinchosha ; Photo Musée, 1995. ISBN 978-4-10-602418-4 (Japanese). New edition: Bookshop M/Getsuyosha 2018. Signed edition of 700 copies (Japanese/English).
Revised edition: Shinchosha ; Photo Musée, 1995. ISBN 978-4-10-602418-4 (Japanese).
New edition: Bookshop M/Getsuyosha 2018. Signed edition of 700 copies (Japanese/English).
Karyudo = A Hunter. For Jack Kerouac. Tokyo: Chuo-koron-sha, 1972. Text by Yokoo Tadanori (Japanese/English). Reprint: Tokyo: Taka Ishii Gallery, 1997.
Reprint: Tokyo: Taka Ishii Gallery, 1997.
Sashin yo Sayonara (= Bye Bye Photography ). Tokyo: Shashin hyoron-sha, 1972. Reprint included in: The Japanese Box. Facsimiles of six publications of the Provoke era, edited by Christoph Schifferli. Concept and design by Gerhard Steidl and Karl Lagerfeld . Paris: Edition 7L, und Götting

## Build Basic Dataset of 200 Photographers

In [35]:
import pandas as pd
import numpy as np
import csv
import time
import requests
from bs4 import BeautifulSoup

In [28]:
photographer_list = [
        "Richard Avedon",
        "William Eugene Smith",
        "Helmut Newton",
        "Irving Penn",
        "Guy Bourdin",
        "Henri Cartier-Bresson",
        "Diane Arbus",
        "Elliott Erwitt",
        "Walker Evans",
        "Martin Parr",
        "Juergen Teller",
        "Nick Knight",
        "David Bailey",
        "Cindy Sherman",
        "Andreas Gursky",
        "Edward Weston",
        "Garry Winogrand",
        "Bruce Weber",
        "Man Ray",
        "Paolo Roversi",
        "Herb Ritts",
        "Annie Leibovitz",
        "Ansel Adams",
        "David LaChapelle",
        "William Klein",
        "Bill Brandt",
        "Ralph Gibson",
        "Stephen Shore",
        "Robert Frank",
        "André Kertész",
        "Chuck Close",
        "Robert Mapplethorpe",
        "Steven Meisel",
        "Peter Lindbergh",
        "August Sander",
        "Nancy Goldin",
        "Weegee",
        "Don McCullin",
        "Slim Aarons",
        "William Eggleston",
        "Joel Peter Witkin",
        "Anton Corbijn",
        "Brassaï",
        "Erwin Blumenfeld",
        "Duane Michals",
        "Mario Testino",
        "Mary Ellen Mark",
        "Larry Clark",
        "Marcus Piggott",
        "Corinne Day",
        "Cecil Beaton",
        "Eric Boman",
        "Patrick Demarchelier",
        "Bert Hardy",
        "Tim Walker",
        "Terry Richardson",
        "Norman Parkinson",
        "Lord Snowdon",
        "Horst P. Horst",
        "Philip Jones Griffiths",
        "Jeanloup Sieff",
        "Bob Carlos Clarke",
        "Mick Rock",
        "Sebastião Salgado",
        "David Loftus",
        "Brian Duffy",
        "Simon Norfolk",
        "Nobuyoshi Araki",
        "Ellen Von Unwerth",
        "Leni Riefenstahl",
        "Edward Steichen",
        "Alfred Stieglitz",
        "Roger Fenton",
        "Sarah Moon",
        "Frank Horvat",
        "Alexander Rodchenko",
        "Julia Margaret Cameron",
        "Angus McBean",
        "Deborah Turbeville",
        "Tim Page",
        "Harri Peccinotti",
        "Eve Arnold",
        "Jane Bown",
        "Michael Thompson",
        "Oliviero Toscani",
        "Pierre Commoy",
        "Robert Doisneau",
        "Joel Sternfeld",
        "Richard Billingham",
        "Paul Strand",
        "Chris Killip",
        "Tony Ray-Jones",
        "Helen Levitt",
        "Robert Capa",
        "George Hurrell",
        "Jacques Henri Lartigue",
        "Bert Stern",
        "Peter Beard",
        "John Rankin Waddell",
        "Joel Meyerowitz",
        "Saul Leiter",
        "Harry Gruyaert",
        "Edward Burtynsky",
        "Vivian Maier",
        "Alex Webb",
        "Raghubir Singh",
        "Daidō Moriyama",
        "Lee Friedlander",
        "Paul Graham",
        "Masahisa Fukase",
        "Hiroshi Sugimoto",
        "Bernd and Hilla Becher",
        "Thomas Struth",
        "Bernard Faucon",
        "Nicholas Nixon",
        "Thomas Ruff",
        "Candida Hofer",
        "David Hockney",
        "Catherine Opie",
        "Gregory Crewdson",
        "Jeff Wall",
        "Larry Sultan",
        "Louise Lawler",
        "Rineke Dijkstra",
        "Sally Mann",
        "Sandy Skoglund",
        "Shirin Neshat",
        "Sophie Calle",
        "Wolfgang Tillmans",
        "Thomas Demand",
        "Richard Misrach",
        "Lucinda Devlin",
        "Todd Hido",
        "Jan Groover",
        "Barbara Kasten",
        "Lyle Ashton Harris",
        "John Divola",
        "Gillian Wearing",
        "Lisa Oppenheim",
        "Nancy Burson",
        "Loretta Lux",
        "Mishaka Henner",
        "Imogen Cunningham",
        "Berenice Abbott",
        "Lillian Bassman",
        "Gordon Parks",
        "Arnold Newman",
        "O. Winston Link",
        "Bruce Davidson",
        "Lee Miller",
        "László Moholy-Nagy",
        "Eugène Atget",
        "Margaret Bourke-White",
        "Clyde Butcher",
        "Francesca Woodman",
        "Camille Silvy",
        "Nadar",
        "Matthew Brady",
        "Gjon Mili",
        "Morris Engel",
        "Bill Cunningham",
        "Martha Cooper",
        "Rinko Kawauchi",
        "Iwan Baan",
        "Alex Prager",
        "Trent Parke",
        "Robert Polidori",
        "Laura Levine",
        "Lynsey Addario",
        "James Nachtwey",
        "Eugene Richards",
        "David Sims",
        "Platon",
        "Gustave Le Gray",
        "Lewis Hine",
        "Alec Soth",
        "Tyler Mitchell",
        "Zanele Muholi",
        "Charles Sheeler",
        "Robert Adams",
        "Roger Ballen",
        "Viviane Sassen",
        "Kirsty Mitchell",
        "Doug Rickard",
        "Jill Freedman",
        "Larry Towell",
        "Andreas Feininger",
        "Ruth Orkin",
        "Philip-Lorca diCorcia",
        "Ryan McGinley",
        "Martin Schoeller",
        "Jamel Shabazz",
        "Mitch Epstein",
        "Rankin",
        "John Sexton",
        "Richard Mosse",
        "Ari Marcopoulos",
        "Camille Seaman",
        "Boris Mikhailov",
        "Tim Hetherington",
        "Omar Victor Diop"
    ]
print(f'Total Photographer number is {len(photographer_list)}')

Total Photographer number is 201


In [45]:
# Del Replicates
photographer_list = list(set(photographer_list))
photographer_list.sort()

output_file = "200_photographers_info1.csv"
fieldnames = ["PhotographerID", "Name", "Born", "Website", "Publications"]

with open(output_file, mode="w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for idx, photographer in enumerate(photographer_list, start=1):
            print(f"Processing: {photographer}")
            info = {}
            info["PhotographerID"] = idx
            info["Name"] = photographer
            
            # 获取基本信息
            infobox = get_wiki_infobox(photographer)
            basic_info = extract_basic_info(infobox)
            info["Born"] = basic_info.get("Born", "N/A")
            info["Website"] = basic_info.get("Website", "N/A")

            # 获取出版物信息
            html_content = get_wiki_page_html(photographer)
            pubs = extract_publications_list(html_content) if html_content else None
            if pubs:
                info["Publications"] = " | ".join(pubs)
            else:
                info["Publications"] = ""

            writer.writerow(info)

        
print(f"Data extraction complete. CSV saved as {output_file}")

        


Processing: Alec Soth
Processing: Alex Prager
Processing: Alex Webb
Didn't find <h2 id='Publications'>
Processing: Alexander Rodchenko
Didn't find <h2 id='Publications'>
Processing: Alfred Stieglitz
Didn't find <h2 id='Publications'>
Processing: Andreas Feininger
Didn't find <h2 id='Publications'>
Processing: Andreas Gursky
Processing: André Kertész
Processing: Angus McBean
Didn't find <h2 id='Publications'>
Processing: Annie Leibovitz
Didn't find <h2 id='Publications'>
Processing: Ansel Adams
Didn't find <h2 id='Publications'>
Processing: Anton Corbijn
Didn't find <h2 id='Publications'>
Processing: Ari Marcopoulos
Didn't find <h2 id='Publications'>
Processing: Arnold Newman
Didn't find <h2 id='Publications'>
Processing: August Sander
Processing: Barbara Kasten
Didn't find <h2 id='Publications'>
Processing: Berenice Abbott
Didn't find <h2 id='Publications'>
Processing: Bernard Faucon
Didn't find <h2 id='Publications'>
Processing: Bernd and Hilla Becher
Didn't find <h2 id='Publications'

In [53]:
# Testing Cases
df = pd.read_csv('200_photographers_info1.csv')
print(df.iloc[7])

print(df.shape)

PhotographerID                                                    8
Name                                                  André Kertész
Born              Andor Kertész 2 July 1894 ( 1894-07-02 ) Budap...
Website                                https://www.andrekertesz.org
Publications      1933: Enfants published in Paris by Éditions d...
Name: 7, dtype: object
(201, 5)


## Using Hugging Face Transformer to Add a New Column Called Biograpy

In [66]:
!pip install transformers
!pip install torch




[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [67]:
from bs4 import BeautifulSoup
from transformers import pipeline

In [None]:
def get_wiki_full_content(photographer):
    """
    使用 Wikipedia API 获取指定摄影师页面的完整纯文本内容
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "extracts",
        "titles": photographer,
        "explaintext": True,   # 以纯文本形式返回
        "format": "json",
        "redirects": 1         # 自动处理重定向
    }
    response = requests.get(url, params=params)
    data = response.json()

    pages = data.get("query", {}).get("pages", {})
    if not pages:
        print("未能获取页面内容")
        return None

    # pages 是一个字典，取第一个页面
    page = list(pages.values())[0]
    extract = page.get("extract", "")
    return extract

def summarize_text(text, max_length=130, min_length=100):
    """
    使用Hugging Face的summarization pipeline对文本进行摘要。
    max_length和min_length可以根据实际情况调整，目标摘要大约100词左右。
    注意：这里的长度单位是token数，不完全等同于单词数。
    """
    # We choose bart large cnn here
    summarizer = pipeline(task='summarization',model='facebook/bart-large-cnn')
    # 为避免输入文本过长，可限制摘要的输入长度（如截取前1000个字符）
    short_text = text[:1000]
    summary = summarizer(short_text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

# Testing cases 'Cindy Sherman'
# if __name__ == "__main__":
#     photographer = "Cindy Sherman"  
#     page_text = get_wiki_full_content(photographer)
#     print(page_text)
#     if page_text:
#         biography = summarize_text(page_text, max_length=130, min_length=100)
#         print("Biography:")
#         print(biography)
#     else:
#         print("未能提取到页面文本。")

Cynthia Morris Sherman (born January 19, 1954) is an American artist whose work consists primarily of photographic self-portraits, depicting herself in many different contexts and as various imagined characters.
Her breakthrough work is often considered to be the collection Untitled Film Stills, a series of 70 black-and-white photographs of herself evoking typical female roles in performance media (especially arthouse films and popular B-movies).


== Early life and education ==
Sherman was born in 1954, in Glen Ridge, New Jersey, the youngest of the five children of Dorothy and Charles Sherman. Shortly after her birth, her family moved to the township of Huntington, Long Island. Her father worked as an engineer for Grumman Aircraft. Her mother taught reading to children with learning difficulties. Sherman has described her mother as good to a fault, and her father as strict and cruel. She was raised Episcopalian.
In 1972, Sherman enrolled in the visual arts department at Buffalo State

In [75]:
# Applying Biography Summmary to All photographer, and then put them in a new column called Biography
df = pd.read_csv('200_photographers_info1.csv')
biographies = []

# 遍历 DataFrame 中的每一行，处理每个摄影师
for index, row in df.iterrows():
    photographer = row["Name"]
    print(f"Processing: {photographer}")
    page_text = get_wiki_full_content(photographer)
    if page_text:
        bio = summarize_text(page_text, max_length=130, min_length=100)
        biographies.append(bio)
        print(bio)
    else:
        biographies.append("")

# 添加新的 Biography 列
df["Biography"] = biographies
# 保存结果到新的 CSV 文件（推荐使用 utf-8-sig 编码以防 Excel 乱码）
output_file = "photographers_with_biography.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"数据提取完成，保存至 {output_file}")

Processing: Alec Soth
Alec Soth is an American photographer based in Minneapolis. His work tends to focus on the "off-beat, hauntingly banal images of modern America" He has had various books of his work published by major publishers as well as self-published through his own Little Brown Mushroom. He has received fellowships from the McKnight and Jerome Foundations, was the recipient of the 2003 Santa Fe Prize for Photography, and in 2021 received an Honorary Fellowship of the Royal Photographic Society.
Processing: Alex Prager
Alex Prager is an American artist, director, and screenwriter based in Los Angeles. She is known for her uncanny and highly staged images and films that blur the line between artifice and reality. At age 14, she dropped out of school and traveled to Switzerland on her own, where she worked at a knife store in Lucerne. Prager avoided formal art education and instead purchased a Nikon N90s camera and printed photographs in a home darkroom. She was inspired to purs

Your max_length is set to 130, but your input_length is only 58. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)


Alex Webb (musician) (born 1961), British songwriter, musician and former journalist. Alex Webb (photographer) ( born 1952), American photojournalist and photographer. "Webb" may also mean "weaker" or "welder" and "weirder" and may be used to refer to a group or a person of the same name, such as The Smiths or The Rolling Stones. It is also used as the name of a series of books, including "The Adventures of Alex Webb"
Processing: Alexander Rodchenko
Aleksander Mikhailovich Rodchenko was a Russian and Soviet artist, sculptor, photographer, and graphic designer. He was one of the founders of constructivism and Russian design; he was married to the artist Varvara Stepanova. His photography was socially engaged, formally innovative, and opposed to a painterly aesthetic. He often shot his subjects from odd angles to shock the viewer and to postpone recognition. He wrote: "One has to take several different shots of a subject, from different points of view and in different situations"
Process

Your max_length is set to 130, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Boris Mikhailov is a former Soviet international ice hockey player. He was also a member of the Comintern and a photographer. He is the father of Bulgarian footballer Borislav Mihaylov. Boris Mikhailov may also refer to: Boris Mikhaylov, Boris Mikhalov, Boris Mikhailova, and Boris Mihalov-Mihalova. It is also known as Boris Mkhalov or Boris Makhalov and Boris Mikhailo.
Processing: Brassaï
Gyula Halász was born on 9 September 1899 in Budapest, Hungary. He died on 8 July 1984 in Paris, France. He was one of the most prolific painters of the 20th century. His work was published around the world, including in the U.S. and Europe. He is survived by his wife, two children and a step-son. He also leaves behind a wife and a son. He had a son, who is also a painter, and a daughter, who are also artists.
Processing: Brian Duffy


Your max_length is set to 130, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


Brian Duffy is a Scottish actor, writer, and artist. He was also an astronaut, photographer, and weightlifter. He is the drummer for The Stranglers. Brian Duffy may also refer to: Brian Duffy, Brian John Duffy, and Brian Duffy of Jet Black, among others. It is also used as a nickname for Brian Duffy and his wife, Anne Duffy, who is also an actor and writer. For more information, visit Brian Duffy's official website at: www.brianduffy.com.
Processing: Bruce Davidson


Your max_length is set to 130, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


Bruce Davidson may refer to:Bruce Davidson (footballer) (born 1950), Australian footballer for Footscray. Bruce Davidson (politician) ( born 1951), Australian politician. Bruce Davison (born 1946), American actor (born 1933) Bruce Davidson may also be the name of a number of other people, including: Bruce Davidson, an American equestrian, and Bruce Davidson, an Australian footballer and politician. The surname Davidson is also used as a nickname for several other people.
Processing: Bruce Weber


Your max_length is set to 130, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


Bruce Weber was an Australian sports administrator and basketball coach. He was also a photographer and film director. Bruce Weber may also refer to: Bruce Weber, Bruce Weber and Bruce Weber's son,  Bruce Weber, and Bruce Weber's brother, Paul Weber, all of whom played basketball for the Australian National Basketball Team. He is also known as Bruce Weber or Bruce Weber as a baseball player and a basketball coach for the New York Mets and New York Knicks. He also played for the University of Wisconsin-Madison.
Processing: Camille Seaman
Camille Seaman is an American photographer. Her work mainly concerns the polar regions, where she captures the effects of climate change. Seaman reached wider attention with the production of her 2003 series of photographs of the Arctic Ocean island of Svalbard. She is of Native American and African-American descent through her father and mother respectively. She studied photography with Jan Groover at the State University of New York at Purchase, gradu

Your max_length is set to 130, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Lucinda Devlin is an American photographer. Devlin lives and works in Greensboro, North Carolina. Her work is included in the collection of the Museum of Fine Arts Houston, the Whitney Museum of American Art, and the San Francisco Museum of Modern Art. Her mid-2000s project Field Culture documented American crop farming. In her series The Omega Suites, she documented execution chambers across the U.S.  Her work is part of the collections of the museums of Houston, San Francisco, and New York.
Processing: Lyle Ashton Harris
Lyle Ashton Harris is an American artist. Harris uses his works to comment on societal constructs of sexuality and race, while exploring his own identity as a queer, black man. Harris was mostly raised by his chemistry professor mother Rudean after she divorced Harris's father, between New York City and Dar Es Salaam, Tanzania. Harris has expressed the impact of the absence of his father as a large impact on his personal and emotional development, which would later b

Your max_length is set to 130, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)


Paul Graham may refer to: American college football player and coach, basketball player, novelist, bodybuilder, and photographer. Paul Graham may also be the name of a person or a group of people. It is also used as the name for a series of books by the same name by Paul Graham, published in the 1960s and 1970s. The book series was called "Paul Graham: A Portrait of a Novelist" and was published by Simon & Schuster, Inc. in 1973.
Processing: Paul Strand
Paul Strand was an American photographer and filmmaker. He helped establish photography as an art form in the 20th century. In 1936, he helped found the Photo League, a cooperative of photographers. His diverse body of work, spanning six decades, covers numerous genres and subjects throughout the Americas, Europe, and Africa. He died on March 31, 1976, in New York, New York. He is survived by his wife, the former Matilda Stransky (née Arnstein), and his son, Nathaniel Paul Stransky.
Processing: Peter Beard
Peter Hill Beard was an Americ

Your max_length is set to 130, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)


Tim Page is a British-Australian photojournalist and actor. He is also a music critic, biographer, professor and memoirist. Tim Page may also refer to: Tim Page, Timothy Page, Tim Page and Tim Page Jr. and Timothy Page and his son Timothy Page Jr., all of whom were born in the UK, Australia, New Zealand and the U.S. It also means "Tim Page" or "Timothy Page's" and may refer to a number of other people.
Processing: Tim Walker
Timothy Walker HonFRPS (born 1970) is a British fashion photographer who regularly works for Vogue, W and Love magazines. Walker was awarded a third prize as The Independent Young Photographer Of The Year. He has also shot notable covers for W Magazine, i-D, Vanity Fair, Another Man, and Bet. He is based in London and has worked for Richard Avedon and Condé Nast since 1994. He was awarded an HBC in Photography at Exeter College of Art and Design.
Processing: Todd Hido
Todd Hido (born August 25, 1968) is an American photographer. He has produced 17 books, had his wo

Your max_length is set to 130, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Tyler Mitchell is a musician and photographer. Tyler Mitchell may also refer to: Tyler-Jane Mitchel, Taylor Mitchell, Tyler Mitchell, and Tyler Mitchell. It is also the name of a U.S. photographer and a Canadian country folk singer. The term is also used to refer to a New Zealand actress and a New York City photographer. For more information, visit Tyler Mitchell's official website at: http://www.tyler Mitchell's website.com/Tyler-Mitchell.
Processing: Vivian Maier
Vivian Dorothy Maier was an American street photographer. She took more than 150,000 photographs during her lifetime. Many of her negatives were never developed. Her photographs were first published on the Internet in July 2008, by Ron Slattery, but the work received little response. In October 2009, Maloof linked his blog to a selection of Maier's photographs on the image-sharing website Flickr, and the results went viral, with thousands of  photos being posted on the site.
Processing: Viviane Sassen
Viviane Sassen is a Dut

#### Adding Biography this column to df

In [None]:
# For each photographer in column Name, call summarize_text and get_wiki_full_content

In [73]:
# Extract Award
qa_pipeline = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad")

text = page_text

question = "What awards and recognitions has this photographer received?"

result = qa_pipeline(question=question, context=text)

print("Extracted Awards:", result["answer"])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted Awards: 1981: Artist-in-residence, Light Work, Syracuse, New York


### Photographer's Profile (Image)

In [None]:
# import requests

# def get_wiki_image_and_license(photographer):
#     url = "https://en.wikipedia.org/w/api.php"
#     params = {
#         "action": "query",
#         "titles": photographer,
#         "prop": "pageimages|imageinfo",
#         "iiprop": "url|extmetadata",
#         "format": "json",
#         "pithumbsize": 500,
#         "redirects": 1
#     }
#     response = requests.get(url, params=params)
#     data = response.json()
#     pages = data.get("query", {}).get("pages", {})
#     for page_id in pages:
#         page = pages[page_id]
#         if "thumbnail" in page:
#             image_url = page["thumbnail"]["source"]
#             # extmetadata 中可能存储许可信息，尝试获取
#             imageinfo = page.get("imageinfo", [{}])
#             if imageinfo:
#                 extmetadata = imageinfo[0].get("extmetadata", {})
#                 license_info = extmetadata.get("License", {}).get("value", "Unknown")
#                 return image_url, license_info
#     return None, None

# photographer_name = "Cindy Sherman"
# image_url, license_info = get_wiki_image_and_license(photographer_name)
# print(f"Image URL: {image_url}")
# print(f"License: {license_info}")

Image URL: https://upload.wikimedia.org/wikipedia/commons/5/5e/Cindy_Sherman_%28cropped%29.jpg
License: Unknown


### Data preprocessing (Split Born to Born Year and Born Place)


In [78]:
print(df.head())    


   PhotographerID                 Name  \
0               1            Alec Soth   
1               2          Alex Prager   
2               3            Alex Webb   
3               4  Alexander Rodchenko   
4               5     Alfred Stieglitz   

                                                Born               Website  \
0  1969 (age 55–56) Minneapolis, Minnesota , Unit...         alecsoth .com   
1   1979 (age 45–46) Los Angeles , California , U.S.  www .alexprager .com   
2                                                NaN                   NaN   
3  Aleksander Mikhailovich Rodchenko ( 1891-12-05...                   NaN   
4  ( 1864-01-01 ) January 1, 1864 Hoboken, New Je...                   NaN   

                                        Publications  \
0  Sleeping by the Mississippi. With essays by Pa...   
1  Polyester , Alex Prager Studio. 2007. ASIN B00...   
2                                                NaN   
3                                                NaN  

In [79]:
import re

In [84]:
def parse_born(born_str):
    """
    extract:
    (1) birth_year
    (2) location
    """
    if not isinstance(born_str, str):
        return "",""
    
    year_match = re.search(r"(\d{4})", born_str)
    birth_year = year_match.group(1) if year_match else ""

    no_parentheses = re.sub(r"\(.*?\)", "", born_str)

    tokens = no_parentheses.split()
    if len(tokens) > 1:
        location_tokens = tokens[-1:]
    else:
        location_tokens = tokens

    location = " ".join(location_tokens)

    return birth_year, location


# if __name__ == "__main__":
#     sample_test = [
#         "1969 (age 55–56) Minneapolis, Minnesota , United States",
#         "( 1902-02-20 ) February 20, 1902 San Francisco , California , U.S.",
#         "Cynthia Morris Sherman ( 1954-01-19 ) January 19, 1954 (age 71) Glen Ridge, New Jersey , U.S.",
#         "Charles Thomas Close ( 1940-07-05 ) July 5, 1940 Monroe, Washington , U.S.",
#         "Hiromichi Moriyama ( 1938-10-10 ) October 10, 1938 (age 86) Ikeda , Osaka , Japan"
#     ]


#     for s in sample_test:
#         year, loc = parse_born(s)
#         print(f"Raw: {s}")
#         print(f" -> birth_year={year}, last_6_words_location={loc}")
#         print("-"*50)


In [86]:
# Using parse_born to all df
df['BirthYear'], df["BirthLocation"] = zip(*df["Born"].apply(parse_born))

print(df[['BirthYear', "BirthLocation"]].tail())
df.to_csv('Photographers_V1.0.csv')

    BirthYear BirthLocation
196      1939          U.S.
197                        
198                        
199      1968       Germany
200      1972        Africa
