In [153]:
!pip install s3fs



In [160]:
import boto3
from botocore.exceptions import ClientError
import configparser
import json
import pandas as pd

# Load the AWS credentials from a local config file so they never appear in the notebook.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it did parse, so check that list and fail with a clear message instead of
# an opaque KeyError on the lookups below.
if not config.read('aws.cfg'):
    raise FileNotFoundError(
        "Could not read 'aws.cfg'; expected an [AWS] section with "
        "aws_access_key_id and aws_secret_access_key"
    )

aws_access_key = config['AWS']['aws_access_key_id']
aws_secret_key = config['AWS']['aws_secret_access_key']


In [2]:
# Create a Bedrock Runtime client in the AWS Region you want to use.
client = boto3.client(
    "bedrock-runtime",
    region_name="us-east-1",
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

# Set the model ID.
# Kept identical to the ID that bedrock() passes to client.converse(), so that any
# message built from model_id names the model that was really invoked.
model_id = "amazon.titan-text-express-v1"

In [155]:
# Read external landmark data from S3
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key
)

landmark_bucket = 'capstone-techcatalyst-conformed'
landmark_key = 'group1/landmark_data/part-00000-tid-8270335314995015129-7c07741e-3c51-4b23-9a75-95c6c641450b-65-1-c000.csv'

# Store the object under its own file name in the current working directory and
# read it back from that same path, so the cell does not depend on one machine's
# absolute directory layout.
landmark_local_path = landmark_key.rsplit('/', 1)[-1]

# download_file() returns None, so there is nothing useful to assign from it.
s3.download_file(landmark_bucket, landmark_key, landmark_local_path)
landmark_df = pd.read_csv(landmark_local_path)

In [158]:
# Pair every landmark name with its description; this mapping is the payload sent to Bedrock.
landmark_list = landmark_df['Landmark_Name'].tolist()
description_list = landmark_df['USE_ORIG'].tolist()
build_type = landmark_df['BuildType'].tolist()

# Both lists come from the same dataframe, so they line up position by position.
landmark_dict = dict(zip(landmark_list, description_list))


In [21]:
# Collects all the landmark names categorized.
# Re-running this cell used to wipe every result gathered so far (hence the old
# "DO NOT RERUN" warning). Reuse the existing dictionary when there is one so that
# a re-run is harmless; run `del categories` first to deliberately start over.
categories = globals().get('categories', {})


In [10]:
# Function that calls Bedrock client by taking in query, running it through the model, and returns the output string

def bedrock(query):
    """Send a single user message to the Bedrock model and return its text reply.

    Args:
        query: Prompt text sent as the only user message of the conversation.

    Returns:
        The text of the first content item in the model's reply. The reply is
        also printed.

    Raises:
        Exception: Whatever the request or the response parsing raised (for
            example botocore's ClientError). The error is reported on stdout
            and then re-raised so the caller sees the real failure.
    """
    # Single source of truth for the model being called, so the error message
    # below always names the model that was actually invoked.
    invoked_model_id = "amazon.titan-text-express-v1"

    conversation = [
        {
            "role": "user",
            "content": [{"text": query}],
        }
    ]

    try:
        # Send the message to the model, using a basic inference configuration.
        response = client.converse(
            modelId=invoked_model_id,
            messages=conversation,
            inferenceConfig={"maxTokens":4096,"stopSequences":["User:"],"temperature":0,"topP":1},
            additionalModelRequestFields={}
        )

        # Extract and print the response text.
        response_text = response["output"]["message"]["content"][0]["text"]
        print(response_text)

    except Exception as e:
        print(f"ERROR: Can't invoke '{invoked_model_id}'. Reason: {e}")
        # Re-raise instead of calling exit(1): exiting from inside a notebook
        # tears down the session rather than surfacing the error, and if the
        # exit call returns, the function would continue to the return
        # statement with response_text never assigned.
        raise

    return response_text

In [80]:
# Function that provides the query to the Bedrock function and parses the returned string in JSON format to extract the category of the landmark

def _extract_landmark_records(response_text):
    """Return the list of {"Name": ..., "Category": ...} records found in a model reply."""
    found = [pos for pos in (response_text.find('{'), response_text.find('[')) if pos != -1]
    if not found:
        raise ValueError(f"No JSON found in model response: {response_text!r}")

    # Start at whichever bracket comes first: a bare list reply also contains '{'
    # characters, but its outermost value is the list.
    # raw_decode() parses exactly one JSON value from that index and ignores any
    # text after it, so a closing code fence (or its absence) does not matter.
    payload, _ = json.JSONDecoder().raw_decode(response_text, min(found))

    if isinstance(payload, dict):
        # Either the {"rows": [...]} wrapper or a single bare record.
        return payload['rows'] if 'rows' in payload else [payload]
    return payload


def categorize(start, end, categories):
    """Ask Bedrock to categorize one slice of landmarks and record the answers.

    Args:
        start: Index of the first landmark (in landmark_dict order) to send.
        end: Index one past the last landmark to send.
        categories: Dictionary updated in place with the name -> category
            pairs returned for this slice.

    Returns:
        A dictionary with the name -> category pairs of this slice only.

    Raises:
        ValueError: If the reply contains no JSON object or array
            (json.JSONDecodeError, a ValueError subclass, if it is malformed).
        KeyError: If a returned record lacks "Name" or "Category".
    """
    print(f'rows {start} - {end}')
    query = f""" {dict(list(landmark_dict.items())[start:end])}

    Assign one of each one of these names to one of these categories using the description, choosing from: Architecture, Religion, Transportation, Education, Civic, Entertainment, Financial, Commercial, Industrial, Historic. Leave empty if none of the categories match. Return output in JSON format with keys "Name" and "Category". Return only one category for every name with nothing in the output except the JSON object.

    Use the following descriptions for the categories:
    Architecture: infrastructure, structure
    Religion: churches, religious
    Transportation: airports, railroads, bridges
    Education: school, university
    Civic: government, military
    Entertainment: arts, sports, theaters, museum, recreation, amusement
    Financial: banks
    Commercial: businesses, shopping, hotel
    Industrial: factories
    Historic: cemetery, relic


"""
    response_text = bedrock(query)

    records = _extract_landmark_records(response_text)
    categorized = {record['Name']: record['Category'] for record in records}

    categories.update(categorized)
    return categorized


In [91]:
# Loops through the landmarks 10 at a time.
# Iterating over batch *start* offsets (instead of end offsets beginning at 10)
# makes sure the final batch is sent too, whether it is full or partial.
batch_size = 10
landmark_count = len(landmark_dict)

for batch_start in range(0, landmark_count, batch_size):
    categorize(batch_start, min(batch_start + batch_size, landmark_count), categories)


372
rows 372 - 373
```tabular-data-json
{
    "rows": [
        {
            "Name": "167-171 John Street Building",
            "Category": "Commercial"
        }
    ]
}
```
[{'Name': '167-171 John Street Building', 'Category': 'Commercial'}]
373
rows 373 - 374
```tabular-data-json
{
    "rows": [
        {
            "Name": "Saint John's Church",
            "Category": "Religion"
        }
    ]
}
```
[{'Name': "Saint John's Church", 'Category': 'Religion'}]
374
rows 374 - 375


KeyboardInterrupt: 

In [93]:
# Validation that every landmark in the original landmark dictionary has been categorized.
# `diff` holds the names with no entry in `categories`; `remaining` maps each of those
# names to its position in landmark_dict.
diff = set(landmark_dict) - set(categories)

remaining = {
    landmark: position
    for position, landmark in enumerate(landmark_dict)
    if landmark in diff
}

remaining

In [95]:
# Display the accumulated landmark name -> category mapping.
categories

{'Brooklyn Bridge': 'Architecture',
 'Public School 15 (Daniel D. Tompkins School)': 'Education',
 'Samuel Gompers Industrial High School': 'Education',
 'American Tract Society Building': 'Commercial',
 'Fire Engine Company No. 31': 'Civic',
 'Steinway Hall': 'Commercial',
 'New York Public Library, Aguilar Branch': 'Civic',
 'Guardian Life Insurance Company of America Annex': 'Commercial',
 'Hearst Magazine Building': 'Commercial',
 'Oliver Street Baptist Church': 'Religion',
 'Brooklyn Trust Company Building': 'Commercial, bank',
 'Fire Engine Company No. 33': 'Architecture',
 'Staten Island Savings Bank Building': 'Financial',
 'Moore-Jackson Cemetery': 'Historic',
 'Cary Building': 'Commercial',
 'City Hall': 'Civic',
 'Cooper Union': 'Education',
 'Williamsburg Branch, Public National Bank of New York Building': 'Financial',
 'Kings County Savings Bank': 'Financial',
 'Daily News Building': 'Commercial',
 'Barrymore Theater': 'Entertainment',
 'New York County National Bank': 'Fi

In [109]:
# Turn the name -> category mapping into a two-column dataframe, one row per landmark.
categories_df = pd.DataFrame(
    {
        'Name': list(categories),
        'Category': list(categories.values()),
    }
)

In [112]:
# Initial cleaning
name = categories_df['Name'].drop_duplicates()

# The model occasionally returns several comma-separated labels for one landmark
# (e.g. 'Commercial, bank'); keep only the first one and trim surrounding whitespace.
# The .str accessor passes missing values through as NaN, whereas calling
# x.split(',') on a missing category would raise.
categories_df['Category'] = categories_df['Category'].str.split(',').str[0].str.strip()

In [127]:
# Count landmarks per category to see which labels the model actually returned.
categories_df['Category'].value_counts()

Category
Commercial         200
Civic              113
Religion            84
Institutional       49
Religious           32
Architecture        31
Financial           28
Entertainment       25
Education           24
Industrial          21
Historic            13
Cemetery            10
Transportation       8
Infrastructure       6
Hotel                2
Recreational         1
Mausoleum            1
Office Building      1
Movie Theater        1
Bank                 1
Mixed-use            1
Amusement            1
Educational          1
Name: count, dtype: int64

In [134]:
# Consolidation of categories: fold the stray labels into the main category set.
# NOTE(review): 'Education' is merged into 'Civic', 'Infrastructure' into 'Industrial',
# and 'Institutional' is left untouched -- confirm these groupings are intended.
category_aliases = {
    'Mixed-use': 'Commercial',
    'Office Building': 'Commercial',
    'Hotel': 'Commercial',
    'Educational': 'Civic',
    'Education': 'Civic',
    'Religious': 'Religion',
    'Bank': 'Financial',
    'Movie Theater': 'Entertainment',
    'Amusement': 'Entertainment',
    'Recreational': 'Entertainment',
    'Mausoleum': 'Historic',
    'Cemetery': 'Historic',
    'Infrastructure': 'Industrial',
}

# No alias target is itself an alias, so one pass over the mapping is enough.
categories_df['Category'] = categories_df['Category'].replace(category_aliases)



In [135]:
# Re-count landmarks per category to check the result of the consolidation.
categories_df['Category'].value_counts()

Category
Commercial        204
Civic             138
Religion          116
Institutional      49
Architecture       31
Financial          29
Entertainment      28
Industrial         27
Historic           24
Transportation      8
Name: count, dtype: int64

In [136]:
# Display the dataframe of landmark names and their categories.
categories_df

Unnamed: 0,Name,Category
0,Brooklyn Bridge,Architecture
1,Public School 15 (Daniel D. Tompkins School),Civic
2,Samuel Gompers Industrial High School,Civic
3,American Tract Society Building,Commercial
4,Fire Engine Company No. 31,Civic
...,...,...
649,"Gould Memorial Library, New York University",Institutional
650,Fort Washington Presbyterian Church,Religion
651,Richard Cornell Graveyard,Historic
652,New York School of Applied Design for Women,Civic


In [157]:
# Write dataframe to S3

categorized_path = 's3://capstone-techcatalyst-conformed/group1/categorized/categorized_landmarks.csv'

# header expects a bool (or a list of column aliases); pass the boolean True
# rather than the string 'True', which is not a documented value.
# NOTE(review): the dataframe index is written as an unnamed first column;
# pass index=False if downstream readers do not need it.
categories_df.to_csv(
    categorized_path,
    header=True,
    storage_options={
        'key': aws_access_key,
        'secret': aws_secret_key,
    },
)