In [2]:
import pandas as pd

# Load the clustered headlines, using the first CSV column as the index, and
# drop the "accented_city" and "pop" columns.
data = (
    pd.read_csv("processed_headlines_clustered.csv", index_col=0)
    .drop(["accented_city", "pop"], axis="columns")
)

# Show the first rows of the columns this notebook works with.
preview_columns = [
    "headline",
    "cities",
    "latitude",
    "longitude",
    "countrycode",
    "cluster",
]
data.loc[:, preview_columns].head()

Unnamed: 0,headline,cities,latitude,longitude,countrycode,cluster
0,Zika Outbreak Hits Miami,Miami,25.77427,-80.19366,US,0
1,Could Zika Reach New York City?,New York City,40.71427,-74.00597,US,0
2,First Case of Zika in Miami Beach,Miami Beach,25.79065,-80.13005,US,0
3,"Mystery Virus Spreads in Recife, Brazil",Recife,-8.05389,-34.88111,BR,2
4,Dallas man comes down with case of Zika,Dallas,32.78306,-96.80667,US,0


In [3]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603 entries, 0 to 646
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   headline     603 non-null    object 
 1   cities       603 non-null    object 
 2   latitude     603 non-null    float64
 3   longitude    603 non-null    float64
 4   countrycode  603 non-null    object 
 5   cluster      603 non-null    int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 33.0+ KB


In [4]:
# Partition the headlines by country code: rows tagged "US" versus all others.
# Each side is copied so later column assignments do not touch `data`.
is_us = data["countrycode"] == "US"
us_data = data.loc[is_us].copy()
world_data = data.loc[~is_us].copy()

print(f"There are {len(us_data)} headlines in the US.")
print(f"There are {len(world_data)} headlines outside the US.")

There are 303 headlines in the US.
There are 300 headlines outside the US.


In [5]:
def cluster_location(
    clusterer,
    data,
    lat_string="latitude",
    lon_string="longitude",
    cluster_string="cluster",
):
    """
    Fit a clustering algorithm on location data and store the labels.

    Parameters
    ----------
    clusterer : object
        Estimator exposing ``fit(features)`` and, once fitted, a ``labels_``
        attribute holding one label per row.
    data : pandas.DataFrame
        Frame containing the latitude and longitude columns.
    lat_string, lon_string : str
        Names of the latitude and longitude columns used as the two features
        (passed to the clusterer in that order).
    cluster_string : str
        Name of the column that receives the labels. An existing column of
        that name is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in. It is modified in place,
        so pass a copy if the original frame must stay untouched.

    Raises
    ------
    KeyError
        If ``lat_string`` or ``lon_string`` is not a column of ``data``.
    """
    # Selecting with a list of columns already yields a new frame, so the
    # clusterer never sees (or alters) the caller's data directly.
    clusterer.fit(data[[lat_string, lon_string]])
    # Labels are positional: one per row, in row order.
    data[cluster_string] = clusterer.labels_
    return data

In [6]:
from sklearn.cluster import DBSCAN

# Cluster the US headlines on their latitude/longitude columns.
# NOTE(review): eps is presumably measured in degrees, since the features are
# raw latitude/longitude values and no metric is specified — confirm.
us_clusterer = DBSCAN(eps=3, min_samples=10)
us_data = cluster_location(us_clusterer, us_data)
# Headline count per cluster label (DBSCAN uses -1 for noise points).
us_data['cluster'].value_counts()

 1    72
 3    57
-1    51
 4    45
 0    42
 2    21
 5    10
 6     5
Name: cluster, dtype: int64

In [7]:
# Cluster the non-US headlines on their latitude/longitude columns, using a
# larger eps and a smaller min_samples than the US run.
# NOTE(review): eps is presumably measured in degrees, since the features are
# raw latitude/longitude values and no metric is specified — confirm.
world_clusterer = DBSCAN(eps=10, min_samples=8)
world_data = cluster_location(world_clusterer, world_data)
# Headline count per cluster label (DBSCAN uses -1 for noise points).
world_data['cluster'].value_counts()

-1    76
 1    60
 2    59
 3    52
 0    22
 5    11
 4    11
 6     9
Name: cluster, dtype: int64

In [8]:
import math


def find_centroid(data):
    """
    Calculate the centroid of geographic points.

    Each point is mapped to a unit vector in 3D Cartesian space, the vectors
    are averaged, and the mean vector is converted back to latitude and
    longitude. Unlike a plain average of the angles, this behaves correctly
    for points on either side of the antimeridian (+/-180 degrees longitude).

    Based on https://stackoverflow.com/a/57346455/5755357

    Parameters
    ----------
    data : pandas.DataFrame or mapping
        Anything that yields equal-length iterables of degrees under the keys
        ``"latitude"`` and ``"longitude"``.

    Returns
    -------
    dict
        ``{"latitude": ..., "longitude": ...}`` in degrees.

    Raises
    ------
    ValueError
        If ``data`` contains no points.

    Notes
    -----
    If the points cancel each other out (for example two antipodal points),
    the mean vector is (near) zero and the returned position is not
    meaningful.
    """

    x = 0.0
    y = 0.0
    z = 0.0
    n = 0

    for longitude, latitude in zip(data["longitude"], data["latitude"]):
        # Degrees -> radians for the trigonometric functions
        longitude, latitude = math.radians(longitude), math.radians(latitude)
        # Accumulate the point as a unit vector in Cartesian coordinates
        x += math.cos(latitude) * math.cos(longitude)
        y += math.cos(latitude) * math.sin(longitude)
        z += math.sin(latitude)
        # Count the points actually visited, so the mean is right for any
        # container and not only for those whose len() is the row count.
        n += 1

    if n == 0:
        raise ValueError("Cannot compute the centroid of an empty set of points.")

    # Mean Cartesian vector
    x /= n
    y /= n
    z /= n

    # Convert the mean vector back to spherical coordinates (radians)
    central_longitude = math.atan2(y, x)
    central_square_root = math.sqrt(x ** 2 + y ** 2)
    central_latitude = math.atan2(z, central_square_root)

    # Convert back to degrees
    centroid = dict(
        latitude=math.degrees(central_latitude),
        longitude=math.degrees(central_longitude),
    )
    return centroid

In [9]:
# Centroid of the non-US headlines assigned to cluster 1.
find_centroid(world_data[world_data['cluster'] == 1])

{'latitude': 48.67673171200486, 'longitude': 3.3638215785007146}

In [12]:
import numpy as np

def haversine_formula_two_arrays(
    longitude_one, latitude_one, longitude_two, latitude_two, radius=3956
):
    """
    Calculate the Great Circle distance between two points
    using the Haversine Formula. Latitude and Longitude are in degrees.

    Inputs may be scalars or NumPy-compatible arrays; arrays are combined
    element-wise under the usual NumPy broadcasting rules.

    Source: https://stackoverflow.com/a/4913653/5755357

    Parameters
    ----------
    longitude_one, latitude_one : float or array-like
        Coordinates of the first point(s), in degrees.
    longitude_two, latitude_two : float or array-like
        Coordinates of the second point(s), in degrees.
    radius : float, optional
        Radius of the sphere. The result has the same unit as this value.
        The default, 3956, is the Earth's radius in miles; pass 6371 for
        kilometres.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) along the sphere's surface, in the unit of ``radius``
        (miles by default).
    """
    # Convert to radians
    longitude_one, latitude_one, longitude_two, latitude_two = map(
        np.radians, [longitude_one, latitude_one, longitude_two, latitude_two]
    )

    delta_longitude = longitude_two - longitude_one
    delta_latitude = latitude_two - latitude_one

    # Haversine of the central angle between the two points
    a = (
        np.sin(delta_latitude / 2) ** 2
        + np.cos(latitude_one) * np.cos(latitude_two) * np.sin(delta_longitude / 2) ** 2
    )
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make arcsin return NaN. Only the upper bound is capped so that
    # NaNs arising from invalid input still propagate.
    a = np.minimum(a, 1.0)

    # Central angle in radians, scaled to a distance by the sphere radius
    central_angle = 2 * np.arcsin(np.sqrt(a))
    return central_angle * radius