In [43]:
# !pip install openai

In [173]:
import pandas as pd
import json
import geopandas as gpd
import os
from dotenv import load_dotenv
import requests
import openai
import re

pd.set_option('display.max_colwidth', None)

%matplotlib inline

In [None]:
#________________________________________________________________________________________________________________
#This notebook includes parsing the text and categories from chat GPT API
#________________________________________________________________________________________________________________


In [105]:
# Load the places layer from its GeoJSON file
places_wgs84 = gpd.read_file("data/places_wgs84.geojson")

In [106]:
# # Creating columns to store ai outputs
# places_wgs84["ai_short_description"] = None
# places_wgs84["ai_headlines"] = None

In [107]:
# Preview the first two rows of the loaded layer
places_wgs84.head(2)

Unnamed: 0,GAZETTEER_ENTRY.NAME1,GAZETTEER_ENTRY.MOST_DETAIL_VIEW_RES,GAZETTEER_ENTRY.LEAST_DETAIL_VIEW_RES,GAZETTEER_ENTRY.SAME_AS_DBPEDIA,geometry
0,Beckton,20000,25000,http://dbpedia.org/resource/Beckton,POINT (0.05882 51.51583)
1,Plaistow,18000,25000,"http://dbpedia.org/resource/Plaistow,_Newham",POINT (0.02419 51.52561)


In [46]:
# #________________________________________________________________________________________________________________
# # This API requires paid tokens, so I append the text I generated on the website while testing the requests' text
# #________________________________________________________________________________________________________________
# # Update ai_headlines for Stratford
# places_wgs84.loc[places_wgs84["GAZETTEER_ENTRY.NAME1"] == "Stratford", "ai_headlines"] = (
#     "Railway Junction Legacy, Industrial Revolution Hub, Olympic Regeneration Catalyst"
# )
# # Update ai_headlines for North Woolwich
# places_wgs84.loc[places_wgs84["GAZETTEER_ENTRY.NAME1"] == "North Woolwich", "ai_headlines"] = (
#     "Maritime Gateway, Dockyard Hub, Post-Industrial Revival"
# )
# # Update ai_short_description for Stratford
# places_wgs84.loc[places_wgs84["GAZETTEER_ENTRY.NAME1"] == "Stratford", "ai_short_description"] = (
#     "Stratford is a place of industrial history and bold regeneration, where its transformation from a railway hub and manufacturing center to an Olympic legacy site has sparked vibrant cultural and economic growth."
# )
# # Update ai_short_description for Harrow
# places_wgs84.loc[places_wgs84["GAZETTEER_ENTRY.NAME1"] == "Harrow", "ai_short_description"] = (
#     "Harrow is a place of academic distinction, where its prestigious schools and historic village charm "
#     "have evolved alongside suburban development, creating a unique blend of scholarly heritage and modern "
#     "family-friendly living."
# )
# # Update ai_short_description for North Woolwich
# places_wgs84.loc[places_wgs84["GAZETTEER_ENTRY.NAME1"] == "North Woolwich", "ai_short_description"] = (
#     "North Woolwich is a place of maritime industry and post-industrial reinvention, where its historic docklands and warehouses now give way to new residential and creative spaces, reflecting the area's ongoing transformation."
# )

In [180]:
#________________________________________________________________________________________________________________
# Creating json for batch request for headlines
#________________________________________________________________________________________________________________

def build_headline_request(place_name, model="gpt-4o-mini"):
    """Build one Batch API request asking for three short phrases about a place.

    Parameters
    ----------
    place_name : str
        Name of the place. It is inserted into the prompt and also used as the
        request's ``custom_id``, which is how the output is matched back to the
        places layer when the batch result is read.
    model : str, optional
        Chat model to query.

    Returns
    -------
    dict
        A JSON-serialisable request addressed to ``/v1/chat/completions``.
    """
    prompt = (
        f"Provide three distinct short phrases that capture the history and character of {place_name} in London. "
        "Focus on what makes this place specific—its history, roots, famous events and places in it, "
        "historic industries, parks, landmarks, or defining characteristics, and use simple, natural language. "
        "Avoid generic or overly grand phrases, don't mention community, make them feel like something a local might say. "
        "Keep each phrase under 5 words. Format them as comma-separated"
    )
    return {
        "custom_id": place_name,  # Use place name as custom_id
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
        },
    }


# custom_id has to be unique within a batch, so each place name is requested once.
seen_place_names = set()

with open("batch_requests.jsonl", "w", encoding="utf-8") as f:
    for place_name in places_wgs84["GAZETTEER_ENTRY.NAME1"]:
        # A missing name would serialise as NaN (invalid JSON) and produce a
        # meaningless prompt; a repeated name would duplicate the custom_id.
        if pd.isna(place_name) or place_name in seen_place_names:
            continue
        seen_place_names.add(place_name)

        # One JSON object per line (.jsonl)
        f.write(json.dumps(build_headline_request(place_name)) + "\n")


In [110]:
# #________________________________________________________________________________________________________________
# # Reading the batch for headlines
# #________________________________________________________________________________________________________________

# headlines_batch_file_path = "batch_67c7ae1e35f4819082ac91d0e62e8c74_output.jsonl"  # Change this to the actual file path

# # List to store each parsed JSON object
# responses = []

# # Open and read the .jsonl file
# with open(headlines_batch_file_path, "r") as f:
#     for line in f:
#         # Parse the JSON object from each line
#         response = json.loads(line.strip())  # Use .strip() to remove any unwanted whitespace

#         # Extract relevant details for the DataFrame
#         custom_id = response.get("custom_id")
        
#         # Safely extract the result by checking if 'choices' is non-empty
#         choices = response.get("body", {}).get("choices", [])
#         result = None  # Default to None if no result is found
#         if choices:  # Check if 'choices' is not empty
#             result = choices[0].get("message", {}).get("content", None)
        
#         # Append the extracted data as a dictionary
#         responses.append({
#             "custom_id": custom_id,
#             "result": result
#         })

# # Create a DataFrame from the list of responses
# df = pd.DataFrame(responses)

# # Display the DataFrame
# print(df)


In [186]:
# Columns of the parsed headlines table; also used when the output file is empty
# so that the resulting DataFrame still has a "place_name" column to join on.
HEADLINE_COLUMNS = ["place_name", "full_response", "line_1", "line_2", "line_3"]


def split_headline_phrases(response_content):
    """Split a comma-separated model reply into cleaned phrases.

    Surrounding whitespace and quotation marks are stripped from every phrase,
    and fragments that end up empty (e.g. from a trailing comma) are dropped.
    Returns an empty list when there is no reply text.
    """
    if not response_content:
        return []
    cleaned = (phrase.strip().strip('"').strip("'") for phrase in response_content.split(","))
    return [phrase for phrase in cleaned if phrase]


def parse_headline_result(raw_line):
    """Turn one line of the batch output file into a flat record.

    Parameters
    ----------
    raw_line : str
        A single JSON object from the batch output ``.jsonl`` file.

    Returns
    -------
    dict
        Keys ``place_name`` (the request's ``custom_id``), ``full_response``
        (the raw reply text, or None) and ``line_1``..``line_3`` (the first three
        phrases, None where the reply had fewer).
    """
    parsed_json = json.loads(raw_line)

    # "response" / "body" may be present but null (e.g. a failed request),
    # so fall back to empty containers instead of calling .get on None.
    response = parsed_json.get("response") or {}
    body = response.get("body") or {}
    choices = body.get("choices") or []
    response_content = choices[0]["message"]["content"] if choices else None

    phrases_list = split_headline_phrases(response_content)
    # Pad so that exactly three slots can be read; extra phrases are ignored.
    line_1, line_2, line_3 = (phrases_list + [None, None, None])[:3]

    return {
        "place_name": parsed_json.get("custom_id"),
        "full_response": response_content,
        "line_1": line_1,
        "line_2": line_2,
        "line_3": line_3,
    }


headlines_batch_file_path = "batch_67ca41acfc048190b0aba3f92084765d_output.jsonl"  # Change this to the actual file path
headlines_batch = []

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(headlines_batch_file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not JSON; skip them
            continue
        headlines_batch.append(parse_headline_result(line))

headlines_batch_df = pd.DataFrame(headlines_batch, columns=HEADLINE_COLUMNS)

In [188]:
# headlines_batch

In [189]:
# Left-join the parsed headlines onto the places layer by place name,
# keeping every place even when it has no matching response.
places_wgs84_with_response = places_wgs84.merge(
    headlines_batch_df,
    how="left",
    left_on="GAZETTEER_ENTRY.NAME1",
    right_on="place_name",
)

In [190]:
# Inspect the first ten rows of the joined layer
places_wgs84_with_response.head(10)

Unnamed: 0,GAZETTEER_ENTRY.NAME1,GAZETTEER_ENTRY.MOST_DETAIL_VIEW_RES,GAZETTEER_ENTRY.LEAST_DETAIL_VIEW_RES,GAZETTEER_ENTRY.SAME_AS_DBPEDIA,geometry,place_name,full_response,line_1,line_2,line_3
0,Beckton,20000,25000,http://dbpedia.org/resource/Beckton,POINT (0.05882 51.51583),Beckton,"Industrial heart, Thames riverside, Beckton's parks thrive",Industrial heart,Thames riverside,Beckton's parks thrive
1,Plaistow,18000,25000,"http://dbpedia.org/resource/Plaistow,_Newham",POINT (0.02419 51.52561),Plaistow,"Old railways and industry, Plaistow Park's green heart, Rich in Victorian architecture",Old railways and industry,Plaistow Park's green heart,Rich in Victorian architecture
2,Leyton Marshes,18000,25000,,POINT (-0.04541 51.56873),Leyton Marshes,"Green lungs of East London, wildflower meadows and old rivers, the site of historic cricket matches",Green lungs of East London,wildflower meadows and old rivers,the site of historic cricket matches
3,Stoke Newington,17000,60000,http://dbpedia.org/resource/Stoke_Newington,POINT (-0.08352 51.56118),Stoke Newington,"Historic church bells ring, vibrant markets hum, ancient woodlands flourish",Historic church bells ring,vibrant markets hum,ancient woodlands flourish
4,Finsbury Park,17000,25000,"http://dbpedia.org/resource/Finsbury_Park,_London",POINT (-0.10889 51.56452),Finsbury Park,"Victorian park with lively markets, Historic concert venue at The Dome, Diverse food scene along Stroud Green",Victorian park with lively markets,Historic concert venue at The Dome,Diverse food scene along Stroud Green
5,Highgate,17000,25000,http://dbpedia.org/resource/Highgate,POINT (-0.15285 51.5778),Highgate,"Historic village charm, Famous for its cemetery, Scenic views from Parliament Hill",Historic village charm,Famous for its cemetery,Scenic views from Parliament Hill
6,East Finchley,25000,26000,http://dbpedia.org/resource/East_Finchley,POINT (-0.16633 51.585),East Finchley,"Newly built homes, old village charm, Victorian architecture, BBC Radiophonic Workshop birthplace, leafy parks and green spaces",Newly built homes,old village charm,Victorian architecture
7,Hampstead Garden Suburb,16000,25000,http://dbpedia.org/resource/Hampstead_Garden_Suburb,POINT (-0.18115 51.58118),Hampstead Garden Suburb,"Quaint streets and tree-lined avenues, Arts and Crafts architecture, Exploring Hampstead Heath trails",Quaint streets and tree-lined avenues,Arts and Crafts architecture,Exploring Hampstead Heath trails
8,Golders Green,17000,25000,http://dbpedia.org/resource/Golders_Green,POINT (-0.19712 51.58098),Golders Green,"Humble roots, Jewish heritage, lively street markets",Humble roots,Jewish heritage,lively street markets
9,Hendon,29000,60000,http://dbpedia.org/resource/Hendon,POINT (-0.22859 51.58262),Hendon,"Historic railway town charm, Famous for Hendon Aerodrome, Green parks and quiet streets",Historic railway town charm,Famous for Hendon Aerodrome,Green parks and quiet streets


In [191]:
# Write the joined layer (places + headlines) out as GeoJSON
places_wgs84_with_response.to_file(
    "data/places_wgs84_with_response.geojson",
    driver="GeoJSON",
)

In [70]:
# # Load environment variables from the .env file
# load_dotenv()
# # Access the API key
# chatgpt_api_key = os.getenv('CHATGPT_API_KEY_1')
# client = openai.OpenAI(api_key=chatgpt_api_key)

In [112]:
# for index, row in places_wgs84.iterrows():
#     place_name = row["GAZETTEER_ENTRY.NAME1"]
    
#     # Defining messages for ChatGPT
#     message_headlines = f"Give 3 distinct epithets that capture the historic narrative of {place_name}, each one unique and specific."

#     # Ensure ai_headlines is empty before making an API call
#     if row["ai_headlines"] is None or pd.isna(row["ai_headlines"]):  
#         completion = client.chat.completions.create(
#             model="gpt-4o-mini",
#             store=False,
#             messages=[
#                 {"role": "user", "content": message_headlines}
#             ]
#         )
#         # Correctly updating the DataFrame
#         places_wgs84.loc[index, "ai_headlines"] = completion.choices[0].message.content

In [113]:
# !pip install --upgrade openai

In [None]:
# client = OpenAI(api_key=chatgpt_api_key)

# completion = client.chat.completions.create(
#   model="gpt-4o-mini",
#   store=False,
#   messages=[
#     {"role": "user", "content": "write a haiku about ai"}
#   ]
# )

# print(completion.choices[0].message);

In [467]:
# wiki_gdf_dedupped['wiki_headline'] = None
# wiki_gdf_dedupped['wiki_categories'] = None

In [468]:
# places_gdf['place_headline'] = "lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"
# places_gdf['place_categories'] = "royal, riverside, history"

In [473]:
# #Reprojecting gdf for mapbox
# places_gdf_wgs84 = places_gdf.to_crs(4258)
# print(places_gdf_wgs84.crs)
# print(places_gdf_wgs84.iloc[2]["geometry"]) #ensuring the coordinates have transformed
# places_gdf_wgs84.to_file("data/places_wgs84.geojson", driver="GeoJSON") 

EPSG:4258
POINT (0.0772798084689046 51.50684263876633)


In [460]:
# places_gdf_wgs84