In [1]:
import os
import time

import pandas as pd
import requests

In [2]:
# Locations of the two CSV splits, relative to the notebook's working directory.
train_file = 'train(1).csv'
test_file = 'test2.csv'

# Read both splits into DataFrames (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# First look at the data: row count of each split, then the leading rows of the
# training frame rendered with the notebook's rich display.
print(f"Total Train Data Samples: {len(train_df)}")
print(f"Total Test Data Samples:  {len(test_df)}")
display(train_df.head())

Total Train Data Samples: 16209
Total Test Data Samples:  5404


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,9117000170,20150505T000000,268643,4,2.25,1810,9240,2.0,0,0,...,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240
1,6700390210,20140708T000000,245000,3,2.5,1600,2788,2.0,0,0,...,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605
2,7212660540,20150115T000000,200000,4,2.5,1720,8638,2.0,0,0,...,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455
3,8562780200,20150427T000000,352499,2,2.25,1240,705,2.0,0,0,...,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750
4,7760400350,20141205T000000,232000,3,2.0,1280,13356,1.0,0,0,...,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071


In [3]:
# Count the rows whose `id` has already appeared earlier in the same split
# (the first occurrence of each id is not counted as a duplicate).
train_d = train_df.duplicated(subset='id').sum()
test_d = test_df.duplicated(subset='id').sum()

print(f"train duplicates: {train_d}")
print(f"test duplicates: {test_d}")

# Images are stored as "<id>.jpg", so one image per distinct id is enough:
# total rows minus the repeated ids.
train_req = len(train_df) - train_d
test_req = len(test_df) - test_d

print(f"Required train images : {train_req}")
print(f"Required test images : {test_req}")

train duplicates: 99
test duplicates: 8
Required train images : 16110
Required test images : 5396


In [4]:
# Create one output folder per split for the satellite images.
# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for split_dir in ('images/train', 'images/test'):
    os.makedirs(split_dir, exist_ok=True)

In [5]:
# Mapbox access token used when constructing the image-fetching URLs.
# It is read from the MAPBOX_ACCESS_TOKEN environment variable so that the
# credential is never stored in (or shared with) the notebook itself.
# Set it before starting Jupyter, e.g. `export MAPBOX_ACCESS_TOKEN=pk.xxxxx`.
api = os.environ.get("MAPBOX_ACCESS_TOKEN", "")
if not api:
    # Warn instead of raising so the rest of the notebook can still be imported/run.
    print("WARNING: MAPBOX_ACCESS_TOKEN is not set; Mapbox will reject the image requests")

def download_mapbox_image(lat, lon, property_id, folder, api):
    """Download one 400x400 Mapbox satellite image and store it as ``<property_id>.jpg``.

    Parameters
    ----------
    lat, lon : latitude and longitude placed in the request URL (as ``lon,lat``).
    property_id : identifier used as the file name inside ``folder``.
    folder : existing directory the image is saved into.
    api : Mapbox access token appended to the URL as ``access_token``.

    Returns
    -------
    str
        ``"Exists"``  - the target file is already on disk; no request is made.
        ``"Success"`` - the server answered 200 and the image was saved.
        ``"Error <status>"`` - the server answered with a non-200 status code.
        ``"Failed: <message>"`` - the request or the file write raised an exception.
    """
    save_path = os.path.join(folder, f"{property_id}.jpg")

    # Already downloaded: skip the network round-trip entirely.
    if os.path.exists(save_path):
        return "Exists"

    url = f"https://api.mapbox.com/styles/v1/mapbox/satellite-v9/static/{lon},{lat},18,0/400x400?access_token={api}"

    # The image is first written next to its final location and only renamed
    # once complete. Writing straight to save_path would leave an empty or
    # truncated .jpg behind if the write fails or the cell is interrupted, and
    # the existence check above would then report it as "Exists" forever.
    partial_path = f"{save_path}.part"

    try:
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            return f"Error {response.status_code}"

        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, save_path)
        return "Success"
    except Exception as e:
        # Best-effort by design: report the problem to the caller as a string.
        return f"Failed: {str(e)}"
    finally:
        # Runs on every exit path (including KeyboardInterrupt); after a
        # successful rename the partial file no longer exists.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass  # cleanup is best-effort; a later attempt overwrites the .part file

In [6]:
# Download the satellite images for both splits.
# `api` (the Mapbox access token) is defined in the previous cell and is
# deliberately not repeated here.

def _download_with_backoff(lat, lon, property_id, folder, token, max_retries=5, base_delay=1.0):
    """Download one image, pausing and retrying while the API answers HTTP 429.

    Wraps download_mapbox_image: whenever it returns exactly "Error 429" the
    call sleeps (base_delay, then doubling each time) and tries again, up to
    max_retries extra attempts. Returns the last result string.
    """
    result = download_mapbox_image(lat, lon, property_id, folder, token)
    attempt = 0
    # Exact match on purpose: a "Failed: ..." message can contain "429" by
    # accident (e.g. a longitude such as -122.429 inside the URL).
    while result == "Error 429" and attempt < max_retries:
        print("Approaching too fast, slowing down")
        time.sleep(base_delay * 2 ** attempt)
        attempt += 1
        result = download_mapbox_image(lat, lon, property_id, folder, token)
    return result


#downloading train images
print(f"Downloading {train_req} training images)")

train_count = 0
train_failures = []  # (id, result) for every image that could not be saved

# One image per id: dropping repeated ids makes the number of iterations equal
# to train_req. itertuples keeps each column's own dtype, so integer ids are
# not turned into floats (which would change the "<id>.jpg" file name).
train_unique = train_df.drop_duplicates(subset='id')

for lat, lon, property_id in train_unique[['lat', 'long', 'id']].itertuples(index=False, name=None):
    result = _download_with_backoff(lat, lon, property_id, 'images/train', api)
    if result not in ("Success", "Exists"):
        train_failures.append((property_id, result))

    train_count += 1
    if train_count % 500 == 0:
        print(f" {train_count} / {train_req} images have been downloaded")

print("\nTraining data has been downloaded")
if train_failures:
    print(f"{len(train_failures)} training images could not be downloaded (see train_failures)")

# downloading test images
print(f"Downloading {test_req} test images)")

test_count = 0
test_failures = []  # (id, result) for every image that could not be saved

test_unique = test_df.drop_duplicates(subset='id')

for lat, lon, property_id in test_unique[['lat', 'long', 'id']].itertuples(index=False, name=None):
    result = _download_with_backoff(lat, lon, property_id, 'images/test', api)
    if result not in ("Success", "Exists"):
        test_failures.append((property_id, result))

    test_count += 1
    if test_count % 500 == 0:
        print(f"{test_count} / {test_req} images have been downloaded")

print("\nTesting data has been downloaded")
if test_failures:
    print(f"{len(test_failures)} test images could not be downloaded (see test_failures)")

print(f"Total images in train folder: {len(os.listdir('images/train'))}")
print(f"Total images in test folder: {len(os.listdir('images/test'))}")

Downloading 16110 training images)
 500 / 16110 images have been downloaded
 1000 / 16110 images have been downloaded
 1500 / 16110 images have been downloaded
 2000 / 16110 images have been downloaded
 2500 / 16110 images have been downloaded
 3000 / 16110 images have been downloaded
 3500 / 16110 images have been downloaded
 4000 / 16110 images have been downloaded
 4500 / 16110 images have been downloaded
 5000 / 16110 images have been downloaded
 5500 / 16110 images have been downloaded
 6000 / 16110 images have been downloaded
 6500 / 16110 images have been downloaded
 7000 / 16110 images have been downloaded
 7500 / 16110 images have been downloaded
 8000 / 16110 images have been downloaded
 8500 / 16110 images have been downloaded
 9000 / 16110 images have been downloaded
 9500 / 16110 images have been downloaded
 10000 / 16110 images have been downloaded
 10500 / 16110 images have been downloaded
 11000 / 16110 images have been downloaded
 11500 / 16110 images have been downloa