# Named entity recognition

In [1]:
# Add the parent directory to the import path; presumably that is where the local
# `ballitoreproject` package lives -- confirm.
import sys
sys.path.append('..')
# NOTE(review): later cells use `get_data`, `get_named_places_for_id`, `tqdm`, `pd`,
# `os` and `PATH_DATA` before (or without) importing them explicitly; presumably
# they are supplied by these wildcard imports -- confirm.
from ballitoreproject import *
from ballitoreproject.geography import *

In [2]:
# Load the "en_core_web_sm" spaCy pipeline once; the resulting `nlp` object is
# reused by the cells that follow.
import spacy
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x2869a5ea0>

In [3]:
# Sample sentence used to try out the pipeline; `doc` holds the processed result.
sent = 'This will be handed thee I suppose by Cd Morgan who intends for Clonmel tomorrow'
doc = nlp(sent)
doc

This will be handed thee I suppose by Cd Morgan who intends for Clonmel tomorrow

In [4]:
# Show every entity detected in the sample sentence together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

Cd Morgan PERSON
Clonmel GPE
tomorrow DATE


In [5]:
def get_places(text):
    """Return the text of every entity in `text` that carries the label 'GPE'.

    The string is run through the notebook-level `nlp` pipeline. Matches are
    returned in document order and repeated mentions are kept.
    """
    place_names = []
    for entity in nlp(text).ents:
        if entity.label_ == 'GPE':
            place_names.append(entity.text)
    return place_names

# Quick check of get_places on the sample sentence.
get_places(sent)

['Clonmel']

In [8]:
# NOTE(review): `get_data` is not defined in this notebook; presumably it comes from
# the `ballitoreproject` wildcard import and returns a DataFrame indexed by text id
# (later cells use `df_smpl.index` and add a column) -- confirm.
df_smpl = get_data()

In [9]:
# Store, for every index entry of df_smpl, whatever get_named_places_for_id returns
# for it in a new `places` column; tqdm reports progress while the index is walked.
df_smpl['places'] = [
    get_named_places_for_id(text_id)
    for text_id in tqdm(df_smpl.index)
]
df_smpl['places']

Reading txt files: 100%|██████████| 4080/4080 [00:00<00:00, 9931.26it/s]
100%|██████████| 4560/4560 [00:04<00:00, 1011.18it/s]


id
mss4-b1-f7-001-176                                          []
mss4-b1-f3-005                                              []
mss4-b1-f7-001-3                                            []
mss4-b1-f3-005-006                                        [us]
mss4-b1-f7-001-15                                           []
                                          ...                 
consensus_text_90345943                         [Margt, Lilia]
consensus_text_90345944                        [Margt, Selina]
consensus_text_90345945    [Selina, Mrs Rigbys, Selina, Margt]
consensus_text_90345946               [Margt, Rotunda, Selina]
consensus_text_90345947                                     []
Name: places, Length: 4560, dtype: object

In [11]:
# Flatten the per-text place lists into a single Series and count how often each
# name occurs (most frequent first).
all_places = pd.Series(
    [name for names in df_smpl['places'] for name in names]
).value_counts()
all_places.head(25)

Dublin        943
London        511
Margt         420
Ireland       390
Cousin        279
England       247
Clonmel       202
Waterford     183
Britain       149
America       131
Belfast       112
SB             86
Scotland       84
Providence     81
Debby          78
Paris          74
Lydia          71
Limerick       71
Molly          70
Selina         66
Liverpool      66
Richmond       64
P.S.           61
Kingstown      57
Edinburgh      56
Name: count, dtype: int64

In [None]:
# !pip install geopy
# Nominatim geocoder client shared by the cells below; the user_agent string
# identifies this project to the geocoding service.
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ballitoreproject")

In [None]:
# Geocode a single known place name as a smoke test of the geocoder.
loc=geolocator.geocode('Clonmel')
loc

In [None]:
# Inspect the raw payload attached to the geocoding result.
loc.raw

In [None]:
# exactly_one=False asks for all candidate matches instead of a single result.
geolocator.geocode('Clonmel', exactly_one=False)

In [None]:
# Same multi-candidate lookup for another frequent place name.
geolocator.geocode('Waterford', exactly_one=False)

In [None]:
# !pip install diskcache

In [None]:
# On-disk cache stored under PATH_DATA/geocache; used to memoize geocoding lookups.
# NOTE(review): `os` and `PATH_DATA` are not imported in this notebook; presumably
# they come from the `ballitoreproject` wildcard import -- confirm.
import diskcache as dc
cache_obj = dc.Cache(os.path.join(PATH_DATA,'geocache'))

from geopy.extra.rate_limiter import RateLimiter

# The public Nominatim service asks clients to stay at or below one request per
# second, so every uncached lookup goes through a throttled wrapper.
# swallow_exceptions=False lets geocoder errors propagate after the retries, so a
# transient network failure is never memoized as "place not found".
_geocode_throttled = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    swallow_exceptions=False,
)

@cache_obj.memoize()
def get_place_data(placename):
    """Geocode `placename` and return the geocoder's raw record as a dict.

    Results are memoized in `cache_obj`, so each distinct name is sent to the
    geocoder at most once; only uncached lookups are rate limited.

    Parameters:
    placename (str): Place name to look up.

    Returns:
    dict: A copy of the geocoder's raw result with 'lat' and 'lon' converted to
    float, or an empty dict when the geocoder returns no match.

    Raises:
    Whatever the geocoder raises once the rate limiter's retries are exhausted.
    """
    loc = _geocode_throttled(placename)
    if not loc:
        return {}
    # Copy so the geocoder's own raw dict is not mutated by the float conversion.
    outd = {**loc.raw}
    outd['lat'] = float(outd['lat'])
    outd['lon'] = float(outd['lon'])
    return outd

# Example lookup for a single place name.
get_place_data('Clonmel')

In [None]:
# Build the table consumed by the map cell: one row per place name the geocoder
# could resolve, carrying the name, its mention count and the geocoder record.
# Each distinct name not yet in get_place_data's on-disk cache costs one geocoder
# request, so the first full run is slow; re-runs are served from the cache.
places_ld = []
for place, count in tqdm(list(all_places.items())):
    place_data = get_place_data(place)
    if not place_data:
        # get_place_data returns {} when the geocoder has no match; skip those names
        # rather than dropping every row that has any missing field afterwards.
        continue
    places_ld.append({'place': place, 'count': count, **place_data})
places_df = pd.DataFrame(places_ld)
places_df

In [None]:
# !pip install folium

In [None]:
import pandas as pd
import folium

def create_map_with_markers(df, min_size=3, max_size=15):
    """
    Creates a folium map with circle markers sized proportionally to the count values in the DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame containing 'place', 'count', 'lat', 'lon' columns.
        Rows with a missing 'count', 'lat' or 'lon' are skipped.
    min_size (float): Marker radius used for the smallest count.
    max_size (float): Marker radius used for the largest count.

    Returns:
    folium.Map: Interactive map with one CircleMarker per usable row. When no row
    is usable, an empty map centered on (0, 0) is returned.
    """
    # Markers cannot be placed or sized without coordinates and a count.
    usable = df.dropna(subset=['count', 'lat', 'lon'])

    if usable.empty:
        # The mean of an empty column is NaN, which is not a valid map center.
        return folium.Map(location=[0.0, 0.0], zoom_start=2)

    # Center the map on the mean latitude and longitude of the plotted rows.
    map_center = [float(usable['lat'].mean()), float(usable['lon'].mean())]
    fmap = folium.Map(location=map_center, zoom_start=2)

    # Count range used for scaling.
    min_count = usable['count'].min()
    max_count = usable['count'].max()
    count_span = max_count - min_count

    def scale_size(count):
        """Linearly map `count` from [min_count, max_count] onto [min_size, max_size]."""
        if count_span == 0:
            # All counts are equal (including the single-row case): the linear
            # formula would compute 0/0, so use the middle of the size range.
            return (min_size + max_size) / 2
        return float(((count - min_count) / count_span) * (max_size - min_size) + min_size)

    # Add one marker per usable row.
    for place, count, lat, lon in zip(
        usable['place'], usable['count'], usable['lat'], usable['lon']
    ):
        folium.CircleMarker(
            location=(lat, lon),
            radius=scale_size(count),
            popup=f"{place}: {count}",
            color='blue',
            fill=True,
            fill_color='blue'
        ).add_to(fmap)

    return fmap

# Render the map of geocoded places. Requires `places_df` (the place/count/lat/lon
# table produced by the geocoding loop cell) to be defined in the kernel.
fmap = create_map_with_markers(places_df)
fmap
