In [1]:
import pandas as pd
from geopy import distance
from transformers import pipeline
from geopy.geocoders import Nominatim
from functools import cache
from geopy.extra.rate_limiter import RateLimiter

# Load the data from languages.csv
df_languages = pd.read_csv('languages.csv')

# Load the data from text.csv
df_text = pd.read_csv('text.csv')

# Display the first few rows of both DataFrames
print(df_languages.head())
print(df_text.head())


  from .autonotebook import tqdm as notebook_tqdm


  language_code        Country
0            ar          Egypt
1            bg       Bulgaria
2            de        Germany
3            el         Greece
4            en  United States
                               image_id        lat         lon  \
0  e42b78d1-d17e-4f2c-a8c2-b987234e3211 -25.363889  131.044922   
1  4b9a0f1c-2d4e-4f12-b919-12d34f56d7ab  48.856614    2.352222   
2  21c34e2a-b8ab-441e-a987-e12c34b56a99  35.689500  139.691700   
3  78df2310-a4ab-4e23-b890-c34d567890ab  52.370216    4.895167   
4  f12b34e5-1c2d-4d34-a098-b78901c2d345 -34.858076  -56.179683   

                                               text  
0  This is a beautiful sunset in Uluru, Australia.   
1                Bonjour! J'adore la Tour Eiffel. (  
2                              東京スカイツリーからの眺めは最高です!   
3             Ik fiets graag door de tulpenvelden.   
4         Disfrutando de las cataratas del Iguazú.   


In [3]:

im2gps = pd.read_json("~/notebooks/im2gps_small.jsonl", lines=True)
im2gps = im2gps.set_index('key')
im2gps3k = pd.read_json("~/notebooks/im2gps3ktest.jsonl", lines=True)
im2gps3k = im2gps3k.set_index('key')
yfcc4k = pd.read_json("~/notebooks/yfcc4k.jsonl", lines=True)
yfcc4k = yfcc4k.set_index('key')

im2gps.drop(columns=['full'], inplace=True)
im2gps3k.drop(columns=['full'], inplace=True)
yfcc4k.drop(columns=['full'], inplace=True)


im2gps['num_lines'] = im2gps['lines'].apply(len)
im2gps3k['num_lines'] = im2gps3k['lines'].apply(len)
yfcc4k['num_lines'] = yfcc4k['lines'].apply(len)

im2gps_filtered   = im2gps[im2gps['num_lines'] > 0]
im2gps3k_filtered = im2gps3k[im2gps3k['num_lines'] > 0]
yfcc4k_filtered   = yfcc4k[yfcc4k['num_lines'] > 0]


def join_with_newline(lines):
  return "\n".join(lines)

im2gps_filtered['text'] = im2gps_filtered['lines'].apply(join_with_newline)
im2gps3k_filtered['text'] = im2gps3k_filtered['lines'].apply(join_with_newline)
yfcc4k_filtered['text'] = yfcc4k_filtered['lines'].apply(join_with_newline)

im2gps_filtered = im2gps_filtered[["text", "lat", "lon"]]
im2gps3k_filtered = im2gps3k_filtered[["text", "lat", "lon"]]
yfcc4k_filtered = yfcc4k_filtered[["text", "lat", "lon"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  im2gps_filtered['text'] = im2gps_filtered['lines'].apply(join_with_newline)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  im2gps3k_filtered['text'] = im2gps3k_filtered['lines'].apply(join_with_newline)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yfcc4k_filtered['text'] = yfcc4k_filtered['lines'

In [4]:

# Load the language detection model
model_name = "papluca/xlm-roberta-base-language-detection"
classifier = pipeline("text-classification", model=model_name)

im2gps_filtered['language_code'] = im2gps_filtered['text'].apply(lambda x: classifier(x)[0]['label'])
im2gps3k_filtered['language_code'] = im2gps3k_filtered['text'].apply(lambda x: classifier(x)[0]['label'])
yfcc4k_filtered['language_code'] = yfcc4k_filtered['text'].apply(lambda x: classifier(x)[0]['label'])


im2gps_filtered = pd.merge(im2gps_filtered, df_languages, on='language_code', how='left')
im2gps3k_filtered = pd.merge(im2gps3k_filtered, df_languages, on='language_code', how='left')
yfcc4k_filtered = pd.merge(yfcc4k_filtered, df_languages, on='language_code', how='left')






In [6]:

geolocator = Nominatim(user_agent="Imperial RCS")
location = geolocator.geocode("United States")

In [13]:

# class CachedNominatim(Nominatim):
#   @cache
#   def geocode(self, query, **kwargs):
#     return super().geocode(query, **kwargs)

# geolocator = CachedNominatim(user_agent="Imperial RCS")
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
class CachedNominatim():
    
    def __init__(self, *args, **kwargs):
        self.cache = {}  # Create a custom cache dictionary
        self.geolocator = Nominatim(user_agent="Imperial RCS")
        self.geocode_limited = RateLimiter(self.geolocator.geocode, min_delay_seconds=1)  

    @cache
    def geocode(self, query):
        # Check cache miss before applying rate limit
        cache_key = query
        if cache_key not in self.cache:
            # Use the pre-created rate limiter instance
            print("cache miss: ", query)
            result = self.geocode_limited( query)
            self.cache[cache_key] = result
            return result
        else:
            # Cache hit, return cached result directly
            return self.cache[cache_key]


geolocator = CachedNominatim()
geocode = geolocator.geocode

In [14]:
im2gps_filtered['location'] = im2gps_filtered['Country'].apply(geocode)
im2gps3k_filtered['location'] = im2gps3k_filtered['Country'].apply(geocode)
yfcc4k_filtered['location'] = yfcc4k_filtered['Country'].apply(geocode)


cache miss:  Italy
cache miss:  Pakistan
cache miss:  Tanzania
cache miss:  India
cache miss:  China
cache miss:  Turkey
cache miss:  United States
cache miss:  Brazil
cache miss:  Greece
cache miss:  Netherlands
cache miss:  Poland
cache miss:  France
cache miss:  Mexico
cache miss:  Germany
cache miss:  Bulgaria
cache miss:  Thailand
cache miss:  Vietnam
cache miss:  Russia


In [15]:
def pred_lat(location):
    return location.latitude
def pred_lon(location):
    return location.longitude

def calculate_distance(row):
    coords_1 = (row['lat'], row['lon'])
    coords_2 = (row['pred_lat'], row['pred_lon'])
    return distance.distance(coords_1, coords_2).km


def calculate_stats(df_merged):
    df_merged['pred_lat'] = df_merged['location'].apply(pred_lat)
    df_merged['pred_lon'] = df_merged['location'].apply(pred_lon)
    df_merged['distance'] = df_merged.apply(calculate_distance, axis=1)
    return df_merged

im2gps_filtered =  calculate_stats(im2gps_filtered)
im2gps3k_filtered =  calculate_stats(im2gps3k_filtered)
yfcc4k_filtered =  calculate_stats(yfcc4k_filtered)


In [17]:
def summarize(df_merged):
    # Calculate the median distance.
    median_distance = df_merged['distance'].median()

    # Calculate the mean distance.
    mean_distance = df_merged['distance'].mean()

    # Print the median and mean distances.
    print("Median Distance:", median_distance)
    print("Mean Distance:", mean_distance)
    prop_within_1km = len(df_merged[df_merged['distance'] <= 1]) / len(df_merged)
    prop_within_25km = len(df_merged[df_merged['distance'] <= 25]) / len(df_merged)
    prop_within_200km = len(df_merged[df_merged['distance'] <= 200]) / len(df_merged)
    prop_within_750km = len(df_merged[df_merged['distance'] <= 750]) / len(df_merged)
    prop_within_2500km = len(df_merged[df_merged['distance'] <= 2500]) / len(df_merged)

    print("Proportion within 1km:", prop_within_1km)
    print("Proportion within 25km:", prop_within_25km)
    print("Proportion within 200km:", prop_within_200km)
    print("Proportion within 750km:", prop_within_750km)
    print("Proportion within 2500km:", prop_within_2500km)

print("IM2GPS")
summarize(im2gps_filtered)
print("IM2GPS3k")
summarize(im2gps3k_filtered)
print("yfcc4k")
summarize(yfcc4k_filtered)

IM2GPS
Median Distance: 6514.214320833995
Mean Distance: 6634.36427265605
Proportion within 1km: 0.0
Proportion within 25km: 0.0
Proportion within 200km: 0.0
Proportion within 750km: 0.04854368932038835
Proportion within 2500km: 0.18446601941747573
IM2GPS3k
Median Distance: 7184.524685788174
Mean Distance: 7263.247855827852
Proportion within 1km: 0.0
Proportion within 25km: 0.0
Proportion within 200km: 0.008764940239043825
Proportion within 750km: 0.054183266932270914
Proportion within 2500km: 0.18167330677290836
yfcc4k
Median Distance: 7287.917513986085
Mean Distance: 7229.197897405539
Proportion within 1km: 0.0006016847172081829
Proportion within 25km: 0.0012033694344163659
Proportion within 200km: 0.011432009626955475
Proportion within 750km: 0.05836341756919374
Proportion within 2500km: 0.2214199759326113


In [18]:
# df_merged.to_csv( "locations.csv")

yfcc4k_filtered

Unnamed: 0,text,lat,lon,language_code,Country,location,pred_lat,pred_lon,distance
0,NUGHA\n 7788,43.649708,-79.365637,hi,India,"(India, (22.3511148, 78.6677428))",22.351115,78.667743,12361.351902
1,MCITER\n OCIAR\n MvAe - à - -,-34.885742,138.604736,fr,France,"(France, (46.603354, 1.8883335))",46.603354,1.888334,16194.958596
2,EWART\n 1 Litre\n 900\n 800\n 700\n 600\n V2 ...,52.448235,-1.566066,de,Germany,"(Deutschland, (51.1638175, 10.4478313))",51.163818,10.447831,839.859364
3,SUNSHINE CATHEDRAL FOUNDATION\n Foundation\n ...,26.103065,-80.153190,en,United States,"(United States, (39.7837304, -100.445882))",39.783730,-100.445882,2419.296483
4,I - I\n I - n\n D D\n Minrgasie,50.109166,8.678833,sw,Tanzania,"(Tanzania, (-6.5247123, 35.7878438))",-6.524712,35.787844,6798.492499
...,...,...,...,...,...,...,...,...,...
1657,964\n I6ai,43.655529,-70.261573,sw,Tanzania,"(Tanzania, (-6.5247123, 35.7878438))",-6.524712,35.787844,11799.808882
1658,liele -\n bet\n mas\n dv,56.946819,24.106013,it,Italy,"(Italia, (42.6384261, 12.674297))",42.638426,12.674297,1786.196667
1659,IDGAF!,42.642280,-71.323228,hi,India,"(India, (22.3511148, 78.6677428))",22.351115,78.667743,12180.586059
1660,- FALL BRANCE FALS\n SREST\n GAAE\n TENND,34.786122,-84.304275,it,Italy,"(Italia, (42.6384261, 12.674297))",42.638426,12.674297,7997.745006
