# AirBnB Image Analysis - using AWS Rekognition

In [1]:
import requests
import json
import boto3
import botocore
import pandas as pd
from io import BytesIO

In [8]:
image_urls = pd.read_csv('train.csv')
df.head(2)

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0


## Amazon S3

In [3]:
s3_client = boto3.client('s3')
s3 = boto3.resource('s3')

In [4]:
# view existing buckets
for bucket in s3.buckets.all():
    print(bucket.name)

2018-01-22-sagemaker-ajms
acloudgurutesting-a
airbnb-competition


In [5]:
BUCKET_NAME = 'airbnb-competition' # replace with your bucket name
bucket = s3.Bucket(BUCKET_NAME)

In [None]:
image_urls = df['']

In [9]:
for ind, row in image_urls.iterrows():
    img_id = row['id']
    img_url = row['thumbnail_url'].replace('small','large')
    print(img_url)

https://a0.muscache.com/im/pictures/6d7cbbf7-c034-459c-bc82-6522c957627c.jpg?aki_policy=large
https://a0.muscache.com/im/pictures/348a55fe-4b65-452a-b48a-bfecb3b58a66.jpg?aki_policy=large
https://a0.muscache.com/im/pictures/6fae5362-9e3a-4fa9-aa54-bbd5ea26538d.jpg?aki_policy=large
https://a0.muscache.com/im/pictures/72208dad-9c86-41ea-a735-43d933111063.jpg?aki_policy=large


AttributeError: 'float' object has no attribute 'replace'

In [10]:
# put images into s3 bucket

image_urls = df[['id','thumbnail_url']][0:1]

for ind, row in image_urls.iterrows():
    img_id = row['id']
    img_url = row['thumbnail_url'].replace('small','large')

    # download image
    img_data = requests.get(img_url).content
    
    # create a key name for the image (used in s3 bucket)
    img_key_name = 'raw_data/images/listing_{}.jpg'.format(img_id)
    print(img_key_name)
    
    # load image to s3 bucket
    s3_client.put_object(Body=img_data, Bucket=BUCKET_NAME, Key=img_key_name)

raw_data/images/listing_6901257.jpg


## AWS Rekognition

In [23]:
rekognition_client = boto3.client('rekognition')

In [24]:
for obj in bucket.objects.all():
    key = obj.key
    filename = key.replace('.jpg','.txt').replace('raw_data','in_progress')
    print(filename)

in_progress/images/listing_6901257.txt
in_progress/
in_progress/images/listing_6901257.txt


In [39]:
# get images labels (rekognition) and put them in in_progress s3 bucket

for obj in bucket.objects.all():
    # collect raw images in s3
    key = obj.key
    filename = key.replace('.jpg','.txt').replace('raw_data','in_progress')
    
    if '.jpg' in key:
        # use rekognition to get labels for images
        response = rekognition_client.detect_labels(
            Image={'S3Object':{'Bucket':BUCKET_NAME
                               ,'Name':key}}
        )    
        img_labels = str({key:response['Labels']})
        
        # load image labels to s3 bucket
        s3_client.put_object(Body=img_labels, Bucket=BUCKET_NAME, Key=filename)

In [26]:
img_labels

"{'raw_data/images/listing_6901257.jpg': [{'Name': 'Flora', 'Confidence': 99.30709838867188}, {'Name': 'Jar', 'Confidence': 99.30709838867188}, {'Name': 'Plant', 'Confidence': 99.30709838867188}, {'Name': 'Potted Plant', 'Confidence': 99.30709838867188}, {'Name': 'Pottery', 'Confidence': 99.30709838867188}, {'Name': 'Vase', 'Confidence': 99.30709838867188}, {'Name': 'Hearth', 'Confidence': 88.10081481933594}, {'Name': 'Indoors', 'Confidence': 88.08829498291016}, {'Name': 'Room', 'Confidence': 88.08829498291016}, {'Name': 'Fireplace', 'Confidence': 76.58438873291016}, {'Name': 'Office', 'Confidence': 57.91788101196289}, {'Name': 'Interior Design', 'Confidence': 57.2894172668457}, {'Name': 'Kitchen', 'Confidence': 57.2894172668457}, {'Name': 'Furniture', 'Confidence': 57.1688117980957}, {'Name': 'Sideboard', 'Confidence': 54.756832122802734}, {'Name': 'Electronics', 'Confidence': 54.4898796081543}, {'Name': 'Entertainment Center', 'Confidence': 54.4898796081543}, {'Name': 'Box', 'Confide

In [190]:
# example image result
s = """{"Records":[{"eventVersion":"2.0","eventSource":"aws:s3","awsRegion":"us-east-1","eventTime":"2018-02-05T02:07:54.383Z","eventName":"ObjectCreated:Put","userIdentity":{"principalId":"AWS:AIDAJMHK7DP5NVOLFPNPS"},"requestParameters":{"sourceIPAddress":"24.126.7.133"},"responseElements":{"x-amz-request-id":"23F8CC5606EAF66A","x-amz-id-2":"iXcwd4HsFmps4wSh2C1k7lh8qW+6YMDRrZYPApTH137Ph2NdTykdTBNHq9TLyZdCX7DVFaQZe/c="},"s3":{"s3SchemaVersion":"1.0","configurationId":"add_image","bucket":{"name":"airbnb-competition","ownerIdentity":{"principalId":"A2IHE3SC8NO8AN"},"arn":"arn:aws:s3:::airbnb-competition"},"object":{"key":"raw_data/images/listing_6976.jpg","size":27403,"eTag":"3ec001fb09e8b9963e671fc1fb34b5c1","sequencer":"005A77BC7A2F5A9A4E"}}}]}"""
json.loads(s)['Records'][0]['s3']['object']['key']

'raw_data/images/listing_6976.jpg'

In [61]:
s = dict({'EventSource': 'aws:sns', 'EventVersion': '1.0', 'EventSubscriptionArn': 'arn:aws:sns:us-east-1:923344594760:airbnb_image_analysis:86de8758-065e-4baf-a11c-67da23d168d5', 'Sns': {'Type': 'Notification', 'MessageId': 'c4e8b123-fe4a-53cf-b0b4-e3df6a7dc719', 'TopicArn': 'arn:aws:sns:us-east-1:923344594760:airbnb_image_analysis', 'Subject': 'Amazon S3 Notification', 'Message': '{"Records":[{"eventVersion":"2.0","eventSource":"aws:s3","awsRegion":"us-east-1","eventTime":"2018-02-13T07:21:27.767Z","eventName":"ObjectCreated:Put","userIdentity":{"principalId":"AWS:AIDAJMHK7DP5NVOLFPNPS"},"requestParameters":{"sourceIPAddress":"24.126.7.133"},"responseElements":{"x-amz-request-id":"1ED527E322245FF9","x-amz-id-2":"P8uf+EbvwA4NgPCzkqzokL5OhVt+f7z+FBmxdnDj1m9UNwsNUjsfUlW6LJFu9FF9vsQr97ogjSs="},"s3":{"s3SchemaVersion":"1.0","configurationId":"add_image","bucket":{"name":"airbnb-competition","ownerIdentity":{"principalId":"A2IHE3SC8NO8AN"},"arn":"arn:aws:s3:::airbnb-competition"},"object":{"key":"raw_data/images/listing_6901257.jpg","size":32799,"eTag":"a8bec03177a7390fbe82fb54e97d7f90","sequencer":"005A8291F750613258"}}}]}', 'Timestamp': '2018-02-13T07:21:27.823Z', 'SignatureVersion': '1', 'Signature': 'GxNuBe74Kpf3Uncd0BexPnAjkjqRmogKtpPJP4pX6cW1ZLsdoTUXQQ53hwdOhhGDAMhAaqJMLuRJGMtj4iLHQJcbfbPoHVrNQ9YYg3Ug7576WJKa1ptN/Sj1lbqnEiLXKfyviRwF/YhVvhlmMVVxUvkeuCii3YZ78PnxAZjZg9p0cYrJXRjxdb/kZiKWZgf8jBDfSWBsbM5nVgfHf/0Qvg5vM/B8KQBfR9nDg43TeR0KSrCzerqNVMRUVOKMPVideCqmakg/9tRR8qidaE9CfAawYPBW4xzbiGoksUDG02qgLYbXsSVPWLlHCTkFpnhfF1jgz42qyqNxY4lynck01A==', 'SigningCertUrl': 'https://sns.us-east-1.amazonaws.com/SimpleNotificationService-433026a4050d206028891664da859041.pem', 'UnsubscribeUrl': 'https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:923344594760:airbnb_image_analysis:86de8758-065e-4baf-a11c-67da23d168d5', 'MessageAttributes': {}}})
result = s['Sns']['Message']
json.loads(result)['Records'][0]['s3']['object']['key']
#s['s3']['object']['key']

'raw_data/images/listing_6901257.jpg'

## AWS Comprehend

In [None]:
summary = df[['name','summary','space','description','neighborhood_overview']]
summary = summary.iloc[0,:].sum()

client = boto3.client('comprehend')

response = client.detect_entities(
    Text=summary,
    LanguageCode='en'
)
response

phrase = client.detect_key_phrases(
    Text=summary,
    LanguageCode='en'
)
phrase

import json
text = json.dumps(phrase)
with open('listing1.txt', 'w') as f:
    f.write(text)

with open('listing1.txt', 'rb') as data:
    bucket.upload_fileobj(data, 'in_progress/listing1')