In [1]:
import pandas as pd
import dhlab as dh
import tools_imag as ti
import dhlab.api.dhlab_api as api
import requests
from io import StringIO

In [3]:
def geo_locations_corpus(dhlabids):
    """Fetch geo data for a collection of dhlab ids.

    The ids are sent as a JSON list in a POST request to the
    ``imagination_geo_data_list`` endpoint below ``dh.constants.BASE_URL``.

    Parameters
    ----------
    dhlabids : iterable
        Document ids; materialised with ``list()`` before being sent.

    Returns
    -------
    pandas.DataFrame
        The response body parsed as JSON when the status code is 200.
        For any other status the code is printed and an empty
        DataFrame is returned.
    """
    url = f"{dh.constants.BASE_URL}/imagination_geo_data_list"
    payload = {"dhlabids": list(dhlabids)}
    response = requests.post(url, json=payload)

    # Anything but 200 is reported on stdout and mapped to an empty frame.
    if response.status_code != 200:
        print(response.status_code)
        return pd.DataFrame()

    # read_json wants a file-like object, hence the StringIO wrapper.
    return pd.read_json(StringIO(response.text))

In [2]:
# Load the corpus sheet from Excel, using its first column as the row index.
df = pd.read_excel('imag_korpus.xlsx', index_col = 0)

In [3]:
df.columns

Index(['author', 'category', 'oversatt', 'place', 'publisher', 'title', 'urn',
       'year', 'dhlabid'],
      dtype='object')

In [4]:
df.sample(2).values

array([['Lie, Mons', 'Diktning: Dramatikk', 0, 'Christiania',
        'I Kommission hos Swanström', 'Tragedier om Kjærlighed',
        'URN:NBN:no-nb_digibok_2006111601016', 1897, 100614395],
       ['Strindberg, August', 'Diktning: Dramatikk', 0, 'Kjøbenhavn',
        'Gyldendal', 'Erik XIV : Skuespil i fire Akter',
        'URN:NBN:no-nb_digibok_2009040203010', 1899, 100637671]],
      dtype=object)

In [5]:
# Draw at most 10 random rows from the corpus; the sample is unseeded,
# so a re-run picks different documents.
places_corpus = df.sample(min(10, len(df)))

# Look up place data for the sampled dhlabids via the tools_imag helper
# (presumably one row per document/token match, judging by the displayed columns).
places = ti.geo_locations_corpus(places_corpus.dhlabid)

In [7]:
places.columns

Index(['index', 'dhlabid', 'token', 'frekv', 'rank', 'score', 'geonameid',
       'name', 'alternatename', 'latitude', 'longitude', 'feature_class',
       'feature_code', 'spurious'],
      dtype='object')

In [8]:
places.values[:2]

array([[14242, 100614969, 'Bergen', 1, 1, 0.6659141498000001, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0],
       [14243, 100614969, 'Jeru', 1, 1, 0.6000000000000001, 6773104,
        'Jeru', 'Jeru', -7.9934, 112.7612, 'P', 'PPL', 0]], dtype=object)

In [10]:
# Show every retrieved row whose token is exactly "Bergen".
places[places.token=="Bergen"].values

array([[14242, 100614969, 'Bergen', 1, 1, 0.6659141498000001, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0],
       [69470, 100624342, 'Bergen', 1, 1, 0.8435832388000001, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0],
       [405856, 100618347, 'Bergen', 3, 1, 0.8658194749, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0],
       [251052, 100635886, 'Bergen', 7, 1, 1.0690449850000001, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0],
       [894196, 100632576, 'Bergen', 10, 1, 1.6659141499999999, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0],
       [596702, 100617408, 'Bergen', 13, 1, 0.9742796653, 3161733,
        'Bergen', 'Bergen', 60.392, 5.328, 'A', 'ADM2', 0]], dtype=object)

In [13]:
def calculate_place_stats(places_data):
    """Calculate frequency and dispersion metrics for places.

    Rows are grouped by the ``name`` column. For each place the result holds:

    * ``freq``       - sum of ``frekv`` over the place's rows
    * ``docs``       - set of ``dhlabid`` values the place occurs in
    * ``dispersion`` - ``len(docs)`` divided by the number of distinct
      ``dhlabid`` values in the whole of ``places_data``
    * ``score``      - ``freq * dispersion``
    * ``lat`` / ``lon`` / ``token`` - taken from the first row of the group

    Parameters
    ----------
    places_data : pandas.DataFrame
        Must provide the columns ``name``, ``frekv``, ``dhlabid``,
        ``latitude``, ``longitude`` and ``token`` unless it is empty.

    Returns
    -------
    dict
        Maps place name to the statistics dict described above. An empty
        input (including a frame without any columns) yields ``{}``.
    """
    # An empty frame may not even have a 'name' column to group on;
    # there is nothing to aggregate, so return early instead of raising.
    if places_data.empty:
        return {}

    # The corpus size is identical for every place, so count it once
    # rather than rescanning the whole frame on each group iteration.
    n_docs = len(places_data['dhlabid'].unique())

    place_stats = {}

    for name, group in places_data.groupby('name'):
        total_freq = group['frekv'].sum()
        docs = set(group['dhlabid'])
        dispersion = len(docs) / n_docs

        place_stats[name] = {
            'freq': total_freq,
            'docs': docs,
            'dispersion': dispersion,
            'score': total_freq * dispersion,
            # Coordinates and token come from the group's first row.
            'lat': group['latitude'].iloc[0],
            'lon': group['longitude'].iloc[0],
            'token': group['token'].iloc[0]
        }

    return place_stats
    
def preprocess_places(corpus_df, output_path='preprocessed_places.pkl'):
    """Create a preprocessed places file with all statistics.

    Looks up place data for ``corpus_df.dhlabid`` through
    ``ti.geo_locations_corpus``, keeps only rows whose ``rank`` equals 1,
    aggregates them with ``calculate_place_stats`` and pickles the result.

    Parameters
    ----------
    corpus_df : pandas.DataFrame
        Corpus frame exposing a ``dhlabid`` column.
    output_path : str or path-like, optional
        Destination of the pickle file. Defaults to
        ``'preprocessed_places.pkl'`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        One row per place with the columns ``name``, ``token``, ``freq``,
        ``dispersion``, ``score``, ``lat``, ``lon`` and ``docs`` (a list).
        The columns are present even when no place survives the filter.
    """
    all_places = ti.geo_locations_corpus(corpus_df.dhlabid)
    # NOTE(review): if the lookup returns a frame without a 'rank' column
    # (e.g. an empty frame after a failed request), this filter raises
    # KeyError - confirm how ti.geo_locations_corpus reports failures.
    all_places = all_places[all_places['rank']==1]

    place_stats = calculate_place_stats(all_places)

    # Fixing the column list keeps the schema stable for an empty result.
    columns = ['name', 'token', 'freq', 'dispersion', 'score', 'lat', 'lon', 'docs']

    # Convert to DataFrame for easier storage/loading
    records = [
        {
            'name': name,
            'token': stats['token'],
            'freq': stats['freq'],
            'dispersion': stats['dispersion'],
            'score': stats['score'],
            'lat': stats['lat'],
            'lon': stats['lon'],
            'docs': list(stats['docs'])  # Convert set to list for storage
        }
        for name, stats in place_stats.items()
    ]
    places_df = pd.DataFrame(records, columns=columns)

    places_df.to_pickle(output_path)
    return places_df

NameError: name 'calculate_place_stats' is not defined