In [None]:
import pandas as pd

# Read the headlines CSV (first column becomes the index) and drop the
# "accented_city" and "pop" columns in one chained expression.
data = (
    pd.read_csv("processed_headlines_clustered.csv", index_col=0)
    .drop(columns=["accented_city", "pop"])
)

# Preview the first rows of the location-related columns.
data.loc[
    :,
    ["headline", "cities", "latitude", "longitude", "countrycode", "cluster"],
].head()

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Partition the headlines by country code. Each half is copied so later
# column assignments do not touch `data`. Rows with a missing countrycode
# compare unequal to "US" and therefore end up in `world_data`.
us_data = data.loc[data["countrycode"].eq("US")].copy()
world_data = data.loc[data["countrycode"].ne("US")].copy()

print(f"There are {len(us_data)} headlines in the US.")
print(f"There are {len(world_data)} headlines outside the US.")

In [None]:
def cluster_location(clusterer, data, lat_string="latitude", lon_string="longitude"):
    """
    Fit a clustering algorithm on the location columns of a frame.

    Parameters
    ----------
    clusterer : object exposing ``fit(X)`` and, after fitting, ``labels_``.
    data : frame holding the coordinate columns.
    lat_string, lon_string : names of the latitude and longitude columns.

    Returns
    -------
    The same ``data`` object that was passed in. Note that it is modified
    in place: its "cluster" column is created or overwritten with
    ``clusterer.labels_``.
    """
    coordinate_columns = [lat_string, lon_string]
    # Fit on a copy of just the two coordinate columns.
    clusterer.fit(data[coordinate_columns].copy())
    # Store the fitted labels on the caller's frame.
    data["cluster"] = clusterer.labels_
    return data

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the US headlines with DBSCAN. `eps` is expressed in the units of
# the latitude/longitude columns (presumably degrees — confirm).
us_clusterer = DBSCAN(min_samples=10, eps=3)
# cluster_location writes the labels into us_data["cluster"] in place and
# returns that same frame.
us_data = cluster_location(clusterer=us_clusterer, data=us_data)
# Headlines per cluster label (DBSCAN uses -1 for noise points).
us_data["cluster"].value_counts()

In [None]:
# Cluster the non-US headlines with DBSCAN, using a larger `eps` and a
# smaller `min_samples` than the US run. `eps` is expressed in the units of
# the latitude/longitude columns (presumably degrees — confirm).
world_clusterer = DBSCAN(min_samples=8, eps=10)
# cluster_location writes the labels into world_data["cluster"] in place and
# returns that same frame.
world_data = cluster_location(clusterer=world_clusterer, data=world_data)
# Headlines per cluster label (DBSCAN uses -1 for noise points).
world_data["cluster"].value_counts()

In [None]:
import math


def find_centroid(data, lat_string="latitude", lon_string="longitude"):
    """
    Calculate the centroid of geographic points.

    Every (latitude, longitude) pair is mapped to a unit vector in 3D
    Cartesian space, the vectors are averaged, and the mean vector is mapped
    back to a latitude/longitude pair. Averaging vectors instead of raw
    degrees keeps the result meaningful for points on either side of the
    +/-180 degree meridian.

    Based on https://stackoverflow.com/a/57346455/5755357

    Parameters
    ----------
    data : DataFrame or mapping of column name -> sequence
        Must provide ``data[lat_string]`` and ``data[lon_string]`` as
        iterables of coordinates in degrees.
    lat_string, lon_string : str
        Names of the latitude and longitude columns.

    Returns
    -------
    dict
        ``{"latitude": ..., "longitude": ...}`` in degrees. If the points
        cancel out exactly (mean vector of zero length) the result is
        latitude 0, longitude 0.

    Raises
    ------
    ValueError
        If ``data`` contains no points.
    """
    x = 0.0
    y = 0.0
    z = 0.0
    # Count the points actually visited rather than relying on len(data),
    # which is the number of keys (not rows) for a plain mapping.
    n = 0

    for longitude, latitude in zip(data[lon_string], data[lat_string]):
        longitude, latitude = math.radians(longitude), math.radians(latitude)
        # Accumulate the Cartesian unit vector of this point
        cos_latitude = math.cos(latitude)
        x += cos_latitude * math.cos(longitude)
        y += cos_latitude * math.sin(longitude)
        z += math.sin(latitude)
        n += 1

    if n == 0:
        raise ValueError("Cannot compute the centroid of zero points.")

    # Mean Cartesian vector
    x /= n
    y /= n
    z /= n

    # Convert the mean vector back to spherical coordinates (radians)
    central_longitude = math.atan2(y, x)
    central_square_root = math.sqrt(x ** 2 + y ** 2)
    central_latitude = math.atan2(z, central_square_root)

    # Convert back to degrees
    centroid = dict(
        latitude=math.degrees(central_latitude),
        longitude=math.degrees(central_longitude),
    )
    return centroid

In [None]:
# Centroid of the non-US headlines labelled as cluster 1.
find_centroid(world_data.loc[world_data["cluster"] == 1])

In [None]:
import numpy as np

def haversine_formula_two_arrays(
    longitude_one, latitude_one, longitude_two, latitude_two, radius=3956
):
    """
    Calculate the great-circle distance between two sets of points
    using the Haversine formula.

    Inputs may be scalars or array-likes (they are combined element-wise
    with the usual numpy broadcasting). Latitude and longitude are in degrees.

    Source: https://stackoverflow.com/a/4913653/5755357

    Parameters
    ----------
    longitude_one, latitude_one : coordinates of the first point(s), degrees.
    longitude_two, latitude_two : coordinates of the second point(s), degrees.
    radius : sphere radius; the result is expressed in the same unit.
        The default, 3956, is the Earth radius in miles used by the
        original implementation.

    Returns
    -------
    Distance(s) along the sphere, in the unit of ``radius`` (miles by default).
    """
    # Convert to radians
    longitude_one, latitude_one, longitude_two, latitude_two = map(
        np.radians, [longitude_one, latitude_one, longitude_two, latitude_two]
    )

    delta_longitude = longitude_two - longitude_one
    delta_latitude = latitude_two - latitude_one

    # Haversine of the central angle
    a = (
        np.sin(delta_latitude / 2) ** 2
        + np.cos(latitude_one) * np.cos(latitude_two) * np.sin(delta_longitude / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))

    # Central angle in radians, then scale by the radius to get a distance
    central_angle = 2 * np.arcsin(np.sqrt(a))
    return central_angle * radius