In [45]:
import pandas as pd

# Directory holding the dataset CSV exports; change this one value to point
# the notebook at another copy of the data.
DATA_DIR = "/Users/benjaminluff/code/benluff303/SwipeSense"

photos = pd.read_csv(f"{DATA_DIR}/photos.csv")
keywords = pd.read_csv(f"{DATA_DIR}/keywords.csv")
conversions = pd.read_csv(f"{DATA_DIR}/conversions.csv")   # optional
collections = pd.read_csv(f"{DATA_DIR}/collections.csv")   # optional
colors = pd.read_csv(f"{DATA_DIR}/colors.csv")             # optional

In [46]:
# Wrap the loaded photos table under the `_df` name used by later cells.
photos_df = pd.DataFrame(data=photos)

In [47]:
# Wrap the loaded keywords table under the `_df` name used by later cells.
keywords_df = pd.DataFrame(data=keywords)

In [48]:
# Wrap the loaded collections table under the `_df` name.
collections_df = pd.DataFrame(data=collections)

In [49]:
# Wrap the loaded colors table under the `_df` name.
colors_df = pd.DataFrame(data=colors)

In [50]:
# Left-join keywords onto photos by photo_id: a photo with several keyword
# rows appears once per keyword in the result.
merged_df = pd.merge(photos_df, keywords_df, how="left", on="photo_id")

print(merged_df.head())

      photo_id                                photo_url  \
0  XMyPniM9LF0  https://unsplash.com/photos/XMyPniM9LF0   
1  XMyPniM9LF0  https://unsplash.com/photos/XMyPniM9LF0   
2  XMyPniM9LF0  https://unsplash.com/photos/XMyPniM9LF0   
3  XMyPniM9LF0  https://unsplash.com/photos/XMyPniM9LF0   
4  XMyPniM9LF0  https://unsplash.com/photos/XMyPniM9LF0   

                                     photo_image_url  \
0  https://images.unsplash.com/uploads/1411949294...   
1  https://images.unsplash.com/uploads/1411949294...   
2  https://images.unsplash.com/uploads/1411949294...   
3  https://images.unsplash.com/uploads/1411949294...   
4  https://images.unsplash.com/uploads/1411949294...   

           photo_submitted_at photo_featured  photo_width  photo_height  \
0  2014-09-29 00:08:38.594364              t         4272          2848   
1  2014-09-29 00:08:38.594364              t         4272          2848   
2  2014-09-29 00:08:38.594364              t         4272          2848   
3  2014-

In [51]:
# Collapse the keyword table to one row per photo, holding that photo's
# keywords as a Python list.
keywords_grouped = (
    keywords_df
    .groupby("photo_id")["keyword"]
    .apply(list)
    .reset_index()
)

# Attach the keyword lists to the photo table (left join keeps every photo).
photos_plus_keywords = pd.merge(photos_df, keywords_grouped, how="left", on="photo_id")

print(photos_plus_keywords.head())

      photo_id                                photo_url  \
0  XMyPniM9LF0  https://unsplash.com/photos/XMyPniM9LF0   
1  rDLBArZUl1c  https://unsplash.com/photos/rDLBArZUl1c   
2  cNDGZ2sQ3Bo  https://unsplash.com/photos/cNDGZ2sQ3Bo   
3  iuZ_D1eoq9k  https://unsplash.com/photos/iuZ_D1eoq9k   
4  BeD3vjQ8SI0  https://unsplash.com/photos/BeD3vjQ8SI0   

                                     photo_image_url  \
0  https://images.unsplash.com/uploads/1411949294...   
1  https://images.unsplash.com/photo-141633941111...   
2  https://images.unsplash.com/photo-142014251503...   
3  https://images.unsplash.com/photo-141487280988...   
4  https://images.unsplash.com/photo-141700759404...   

           photo_submitted_at photo_featured  photo_width  photo_height  \
0  2014-09-29 00:08:38.594364              t         4272          2848   
1   2014-11-18 19:36:57.08945              t         3000          4000   
2  2015-01-01 20:02:02.097036              t         2564          1710   
3  2014-

In [52]:
# Read the header-less travel keyword file, lower-case its first column and
# keep the distinct values as a set.
travel_keywords = set(
    pd.read_csv(
        "/Users/benjaminluff/code/benluff303/SwipeSense/Travel_keywords_26_Aug.csv",
        header=None,
    )[0]
    .str.lower()
    .tolist()
)

print(len(travel_keywords))

1751


In [53]:
# One-column frame (column label 0) built from the travel keyword set.
travel_keywords_df = pd.DataFrame(data=travel_keywords)

In [54]:
# Normalised keyword text: missing values are dropped before stripping and
# lower-casing, so those rows end up without a value in "norm".
travel_keywords_df["norm"] = (
    travel_keywords_df[0]
    .dropna()
    .str.strip()
    .str.lower()
)

In [55]:
# Every row whose photo_id occurs more than once (all occurrences kept).
dupes = merged_df.loc[merged_df.duplicated(subset="photo_id", keep=False)]

In [56]:
# Rows that repeat a photo_id already seen earlier in merged_df
# (the first occurrence of each id is not counted).
num_dupes = merged_df.duplicated(subset="photo_id").sum()
print("Number of duplicate rows:", num_dupes)

# Distinct photo_ids that appear on more than one row.
num_ids_with_dupes = merged_df["photo_id"].value_counts().gt(1).sum()
print("Photo IDs with duplicates:", num_ids_with_dupes)

Number of duplicate rows: 2609889
Photo IDs with duplicates: 25000


In [57]:
# Normalise the keyword column in place: cast every entry to str, trim
# surrounding whitespace, then lower-case.
merged_df["keyword"] = merged_df["keyword"].astype(str).str.strip().str.lower()

In [58]:
# Sanity check: (rows, columns) of the photo-keyword merge.
print(merged_df.shape)

(2634889, 35)


In [59]:
# Distinct normalised travel keywords, used for membership tests below.
filter_set = set(travel_keywords_df["norm"].tolist())
print("Filter set size:", len(filter_set))

Filter set size: 1751


In [60]:
# Keep only the photo-keyword rows whose keyword is in the travel filter set.
filtered_df = merged_df.loc[merged_df["keyword"].isin(filter_set)]
print(filtered_df.shape)

(165301, 35)


In [61]:
# Rows that repeat a photo_id already seen earlier in filtered_df
# (the first occurrence of each id is not counted).
num_dupes = filtered_df.duplicated(subset="photo_id").sum()
print("Number of duplicate rows:", num_dupes)

# Distinct photo_ids that appear on more than one row.
num_ids_with_dupes = filtered_df["photo_id"].value_counts().gt(1).sum()
print("Photo IDs with duplicates:", num_ids_with_dupes)

Number of duplicate rows: 144399
Photo IDs with duplicates: 18613


In [62]:
# The ten photo_ids with the most matching keyword rows.
filtered_df["photo_id"].value_counts().iloc[:10]

photo_id
9oXw9OCGBFY    38
51dAtAlho8c    38
MtRVfdKWbPI    35
ZBUesmAQapY    34
lDIFWfKzqAg    33
BkmdKnuAZtw    33
8JzoJyt3hyM    32
tk2ROSG9Lyo    32
WB-tCcWcLYs    32
oJmVzkVStoo    31
Name: count, dtype: int64

In [63]:
# 1) IDs that survived the keyword filter (one entry per distinct photo).
keep_ids = filtered_df["photo_id"].unique()

# 2) Aggregates built ONLY from the filtered rows.
# Sorted, de-duplicated list of the travel keywords matched by each photo.
kw_per_photo = (
    filtered_df.groupby("photo_id")["keyword"]
    .apply(lambda s: sorted(set(s.str.strip().str.lower())))
    .rename("matched_keywords")
)

# Derive the count from the de-duplicated list so kw_count always equals
# len(matched_keywords); counting raw rows would over-count a photo whose
# keyword rows repeat the same normalised keyword.
kw_count = kw_per_photo.map(len).rename("kw_count")

# 3) Start from the kept IDs and attach photo metadata (inner join).
photos_subset = (
    photos_df[photos_df["photo_id"].isin(keep_ids)]
    .drop_duplicates("photo_id")
    .set_index("photo_id")
)

per_photo = (
    photos_subset
    .join([kw_per_photo, kw_count], how="inner")  # inner keeps only IDs present in aggregates
    .reset_index()
)

per_photo.shape


(20902, 33)

In [64]:
# Number of distinct photos (not displayed: only the last expression is shown).
per_photo["photo_id"].nunique()

# Photos that ended up without a matched-keyword list; I want 0 here.
per_photo["matched_keywords"].isna().sum()

0

In [65]:
# Row filter for the curated set. It starts all-True, so with every optional
# condition below disabled no row is dropped.
mask = pd.Series(data=True, index=per_photo.index)

# Optional conditions I could try (all currently disabled):

# Location metadata
#mask &= per_photo["photo_location_country"].notna() | per_photo["photo_location_city"].notna()

# Require at least 2 matched travel keywords
#mask &= per_photo["kw_count"].fillna(0) >= 2

# Popularity
#if "stats_downloads" in per_photo.columns:
#    mask &= per_photo["stats_downloads"].fillna(0) >= 5

# Independent copy of the selected rows.
curated = per_photo.loc[mask].copy()
curated.shape

(20902, 33)

In [None]:
# Preview the first five curated rows.
curated.head(n=5)

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,matched_keywords,kw_count
0,XMyPniM9LF0,https://unsplash.com/photos/XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,2014-09-29 00:08:38.594364,t,4272,2848,1.5,Woman exploring a forest,michellespencer77,...,2375421,6967,woman walking in the middle of forest,,,,,L56bVcRRIWMh.gVunlS4SMbsRRxr,"[adventure, building, forest, journey, jungle,...",7
1,cNDGZ2sQ3Bo,https://unsplash.com/photos/cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,2015-01-01 20:02:02.097036,t,2564,1710,1.5,Rural winter mountainside,johnprice,...,1302461,3428,rocky mountain under gray sky at daytime,,,,,LhMj%NxvM{t7_4t7aeoM%2M{ozj[,"[glacier, mountain, mountain range, plateau]",4
2,BeD3vjQ8SI0,https://unsplash.com/photos/BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,2014-11-26 13:13:50.134383,t,4896,3264,1.5,Silhouette near dark trees,jonaseriksson,...,8704860,49662,trees during night time,,,,,L25|_:V@0hxtI=W;odae0ht6=^NG,"[camping, city, forest, jungle, park]",5
3,dO0KS_QGnzY,https://unsplash.com/photos/dO0KS_QGnzY,https://images.unsplash.com/uploads/1411476843...,2014-09-23 12:56:00.965116,t,2816,2112,1.33,Riding lessons in the mountains,raychsy,...,2040630,8503,riding people on horses during daytime,Mount Bromo,-7.93775,112.952476,49.84942,LqBOm;WCocWWu6WBoea#IBaef8fk,"[beach, coast, desert, glacier, journey, mount...",11
4,ocwmWiNAWGs,https://unsplash.com/photos/ocwmWiNAWGs,https://images.unsplash.com/reserve/m6rT4MYFQ7...,2014-06-03 16:22:34,t,2310,1534,1.51,Grass and morning sun,jakegivens,...,25112307,306708,scenery of grass and trees,,,,,LmIhKix[4:M|~qt7IUWBtRRjofof,[park],1


In [74]:
# Full (untruncated) image URL of the first curated row.
curated["photo_image_url"].iloc[0]

'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3'

In [72]:
# Look up one curated photo by its exact photo_url and show its matched
# keywords and description.
url = "https://unsplash.com/photos/GBjvTQTou2D0KTHomMoA_IMG_1568-4531b5a7"

row = curated.loc[curated["photo_url"].eq(url), ["matched_keywords", "photo_description"]]

if not row.empty:
    print("Keywords:", row["matched_keywords"].iloc[0])
    print("Description:", row["photo_description"].iloc[0])
else:
    print("No match found — check the URL string carefully")

No match found — check the URL string carefully


In [67]:
# Any curated rows sharing a photo_id, grouped together by sorting on the id.
curated.loc[curated.duplicated(subset="photo_id", keep=False)].sort_values(by="photo_id")

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,matched_keywords,kw_count


In [68]:
from google.cloud import storage

# Authenticate with an explicit service-account key file rather than
# ambient credentials.
service_account_key = "/Users/benjaminluff/code/benluff303/swipe-sense/keys/swipe-space-470211-b6cdb6bfb18b.json"
client = storage.Client.from_service_account_json(service_account_key)

# Handle to the destination bucket.
bucket = client.bucket("swipe-bucket")


In [69]:
import os
import time
import requests
from tqdm import tqdm
from urllib.parse import urlparse
from google.cloud import storage

# --- CONFIG ---
DF = curated                      # DataFrame whose image URLs are uploaded
URL_COL = "photo_image_url"       # column of DF holding the image URLs
BUCKET_NAME = "swipe-bucket"      # destination GCS bucket
REQUEST_TIMEOUT = 20              # passed as `timeout` to requests.get
RETRY_TIMES = 3                   # attempts per URL
SLEEP_BETWEEN = 0.2               # seconds slept after each URL in the main loop
HEADERS = {"User-Agent": "curated-downloader/1.0"}
# The main loop in this cell calls .head(ROW_LIMIT); leaving this commented
# out raises NameError on a fresh kernel. 1000 matches the recorded run.
ROW_LIMIT = 1000
# ---------------

# Initialise GCS client with an explicit service-account key file
storage_client = storage.Client.from_service_account_json(
    "/Users/benjaminluff/code/benluff303/swipe-sense/keys/swipe-space-470211-b6cdb6bfb18b.json"
)

bucket = storage_client.bucket(BUCKET_NAME)

def make_filename(url: str) -> str:
    """Build a blob-safe file name from the last path segment of `url`.

    The stem keeps only alphanumerics and "-", "_", "."; the original
    extension is kept, or ".jpg" is used when there is none. "image" is the
    stem whenever the URL has no last segment or sanitising removes every
    character, so the result is never a bare extension such as ".jpg".
    """
    # urlparse already separates the query string, so the path needs no "?" handling.
    base = os.path.basename(urlparse(url).path) or "image"
    stem, ext = os.path.splitext(base)
    safe_stem = "".join(c for c in stem if c.isalnum() or c in "-_.")
    return (safe_stem or "image") + (ext or ".jpg")

def upload_to_gcs(blob_name: str, content: bytes) -> None:
    """Write `content` to the module-level `bucket` as `blob_name`, with content type image/jpeg."""
    bucket.blob(blob_name).upload_from_string(content, content_type="image/jpeg")

def download_and_upload(url: str, blob_name: str) -> bool:
    """Download `url` and upload the response body to GCS as `blob_name`.

    Makes up to RETRY_TIMES attempts. HTTP 429/503 and raised exceptions are
    retried with exponential backoff; statuses that indicate a permanent
    client error (400, 401, 403, 404, 410) stop the attempts immediately;
    any other non-200 status is retried without delay.

    Returns:
        True once a 200 response has been uploaded; False otherwise, after
        printing a FAILED line that includes the last error seen.
    """
    last_exc = None
    for attempt in range(1, RETRY_TIMES + 1):
        delay = 0
        try:
            with requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, stream=True) as r:
                status = r.status_code
                if status == 200:
                    upload_to_gcs(blob_name, r.content)
                    return True
            # Record every non-200 status (including 429/503) so the failure
            # message never reports "(None)".
            last_exc = RuntimeError(f"HTTP {status}")
            if status in (429, 503):
                delay = min(60, 2 ** attempt)  # backoff
            elif status in (400, 401, 403, 404, 410):
                break  # repeating the same request is not expected to help
        except Exception as e:
            last_exc = e
            delay = 2 ** attempt * 0.5
        # No point sleeping after the final attempt.
        if delay and attempt < RETRY_TIMES:
            time.sleep(delay)
    print(f"FAILED: {url} -> gs://{BUCKET_NAME}/{blob_name} ({last_exc})")
    return False

# --- MAIN LOOP ---
# Process the first ROW_LIMIT image URLs one at a time, pausing between them.
urls_to_upload = DF[URL_COL].head(ROW_LIMIT)
for url in tqdm(urls_to_upload, desc="Uploading to GCS"):
    download_and_upload(url, make_filename(url))
    time.sleep(SLEEP_BETWEEN)


Uploading to GCS:   4%|▍         | 42/1000 [02:05<47:49,  3.00s/it] 


KeyboardInterrupt: 

In [None]:
from google.cloud import storage

# Smoke test: authenticate with the key file and write one small text blob.
service_account_key = "/Users/benjaminluff/code/benluff303/swipe-sense/keys/swipe-space-470211-b6cdb6bfb18b.json"
client = storage.Client.from_service_account_json(service_account_key)

bucket = client.bucket("swipe-bucket")
blob = bucket.blob("hello.txt")
blob.upload_from_string("Hello from Ben!")

print("✅ Upload worked:", blob.public_url)

✅ Upload worked: https://storage.googleapis.com/swipe-bucket/hello.txt


In [None]:
# Number of curated rows, i.e. how many uploads a full run attempts.
curated.shape[0]

20902

In [None]:
import os
import time
import requests
import pandas as pd
from tqdm import tqdm
from urllib.parse import urlparse
from google.cloud import storage

# --- CONFIG ---
DF = curated                      # DataFrame to upload (~20,902 rows)
URL_COL = "photo_image_url"       # column of DF holding the image URLs
BUCKET_NAME = "swipe-bucket"      # destination GCS bucket
REQUEST_TIMEOUT = 20              # passed as `timeout` to requests.get
RETRY_TIMES = 3                   # attempts per URL
SLEEP_BETWEEN = 0.2               # seconds slept after each URL
HEADERS = {"User-Agent": "curated-downloader/1.0"}
FAILED_LOG = "failed_uploads.csv" # CSV that receives the failure records
# ---------------

# GCS client authenticated with an explicit service-account key file
service_account_key = "/Users/benjaminluff/code/benluff303/swipe-sense/keys/swipe-space-470211-b6cdb6bfb18b.json"
storage_client = storage.Client.from_service_account_json(service_account_key)
bucket = storage_client.bucket(BUCKET_NAME)

# Failure records ({"url", "blob", "error"} dicts) collected during the run
failed = []

def make_filename(url: str) -> str:
    """Create a safe filename from the image URL.

    Uses the last path segment of `url`: the stem keeps only alphanumerics
    and "-", "_", "."; the original extension is kept, or ".jpg" is used
    when there is none. "image" is the stem whenever the URL has no last
    segment or sanitising removes every character, so the result is never a
    bare extension such as ".jpg".
    """
    # urlparse already separates the query string, so the path needs no "?" handling.
    base = os.path.basename(urlparse(url).path) or "image"
    stem, ext = os.path.splitext(base)
    safe_stem = "".join(c for c in stem if c.isalnum() or c in "-_.")
    return (safe_stem or "image") + (ext or ".jpg")

def upload_to_gcs(blob_name: str, content: bytes) -> None:
    """Write `content` to the module-level `bucket` as `blob_name`, with content type image/jpeg."""
    target = bucket.blob(blob_name)
    target.upload_from_string(content, content_type="image/jpeg")

def download_and_upload(url: str, blob_name: str) -> bool:
    """Download an image from URL and upload to GCS (resumable).

    A blob that already exists in the bucket is skipped, which lets an
    interrupted run be restarted. Otherwise makes up to RETRY_TIMES attempts:
    HTTP 429/503 and raised exceptions are retried with exponential backoff;
    statuses that indicate a permanent client error (400, 401, 403, 404, 410)
    stop the attempts immediately; any other non-200 status is retried
    without delay.

    Returns:
        True only when this call uploaded the image. False both when the
        blob already existed (skipped, nothing logged) and when every
        attempt failed (a FAILED line is printed and a record is appended
        to `failed`).
    """
    blob = bucket.blob(blob_name)
    if blob.exists():   # 🔑 Skip if already uploaded
        return False

    last_exc = None
    for attempt in range(1, RETRY_TIMES + 1):
        delay = 0
        try:
            with requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, stream=True) as r:
                status = r.status_code
                if status == 200:
                    upload_to_gcs(blob_name, r.content)
                    return True
            # Record every non-200 status (including 429/503) so the failure
            # record never reports "None" as the error.
            last_exc = RuntimeError(f"HTTP {status}")
            if status in (429, 503):
                delay = min(60, 2 ** attempt)  # backoff
            elif status in (400, 401, 403, 404, 410):
                break  # repeating the same request is not expected to help
        except Exception as e:
            last_exc = e
            delay = 2 ** attempt * 0.5
        # No point sleeping after the final attempt.
        if delay and attempt < RETRY_TIMES:
            time.sleep(delay)
    # Record failure
    print(f"FAILED: {url} -> gs://{BUCKET_NAME}/{blob_name} ({last_exc})")
    failed.append({"url": url, "blob": blob_name, "error": str(last_exc)})
    return False

# --- MAIN LOOP ---
# The failure log is written in `finally` so it is saved even when the run is
# stopped early (e.g. by interrupting the kernel) instead of being lost.
run_completed = False
try:
    for url in tqdm(DF[URL_COL], desc="Uploading to GCS", total=len(DF)):
        blob_name = make_filename(url)
        download_and_upload(url, blob_name)
        time.sleep(SLEEP_BETWEEN)
    run_completed = True
finally:
    # --- SAVE FAILURES ---
    if failed:
        pd.DataFrame(failed).to_csv(FAILED_LOG, index=False)
        print(f"\n⚠️ {len(failed)} uploads failed. See {FAILED_LOG} for details.")
    elif run_completed:
        print("\n✅ All uploads completed successfully!")
    else:
        # Stopped before the end with nothing to log; do not claim success.
        print("\nRun stopped before completion; no failures recorded so far.")


Uploading to GCS:   0%|          | 0/20902 [00:00<?, ?it/s]

Uploading to GCS:   5%|▌         | 1147/20902 [07:35<2:09:26,  2.54it/s]

FAILED: https://images.unsplash.com/photo-1589449633813-6b4d11dcb853 -> gs://swipe-bucket/photo-1589449633813-6b4d11dcb853.jpg (HTTP 404)


Uploading to GCS:   6%|▌         | 1282/20902 [08:32<2:10:42,  2.50it/s]


KeyboardInterrupt: 