In [None]:
import pandas as pd

# Load the processed headlines; the first CSV column becomes the index.
data = pd.read_csv('processed_headlines_locations.csv', index_col=0)

# Preview only the columns used in the rest of the notebook.
preview_columns = ['headline', 'cities', 'latitude', 'longitude', 'countrycode']
data.loc[:, preview_columns].head()

In [None]:
# Count the missing (NaN/None) entries in every column. The clustering cells
# pass the latitude/longitude columns to the clusterer unchanged, so gaps in
# those two columns are the ones to watch for here.
print('Missing Values:')
data.isna().sum()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
from sklearn.cluster import DBSCAN

# Baseline DBSCAN: no arguments are passed, so scikit-learn's default
# parameters apply. Later cells tune `eps` and `min_samples` on this object.
clusterer = DBSCAN()
# Display the estimator.
clusterer

In [None]:
def cluster_location(clusterer, data, lat_string="latitude", lon_string="longitude"):
    """
    Cluster the rows of ``data`` by their coordinates and record the labels.

    Parameters
    ----------
    clusterer : object
        Any estimator exposing ``fit(X)`` and, once fitted, a ``labels_``
        attribute holding one label per row (e.g. a scikit-learn ``DBSCAN``).
    data : pandas.DataFrame
        Frame containing the two coordinate columns.
    lat_string, lon_string : str
        Names of the latitude and longitude columns. The clusterer receives
        them in that order: latitude first, longitude second.

    Returns
    -------
    pandas.DataFrame
        The *same* ``data`` object that was passed in. It is modified in
        place: a ``"cluster"`` column is added, or overwritten if it already
        exists, with the fitted labels.
    """
    coordinate_columns = [lat_string, lon_string]
    # Hand the clusterer its own copy of the two coordinate columns.
    coordinates = data.loc[:, coordinate_columns].copy()
    clusterer.fit(coordinates)

    # Written straight onto the caller's frame, so repeated calls replace the
    # labels produced by any earlier run.
    data["cluster"] = clusterer.labels_
    return data

In [None]:
# First pass: cluster with the default DBSCAN settings. `cluster_location`
# writes the labels into data["cluster"].
data = cluster_location(clusterer, data)
# Show how many rows received each cluster label.
data['cluster'].value_counts()

In [None]:
# Second pass: widen the DBSCAN neighbourhood radius to eps=9 and refit.
# NOTE(review): eps is measured in the units of the metric; with raw
# latitude/longitude columns that is presumably degrees - confirm.
clusterer.set_params(eps=9)
data = cluster_location(clusterer, data)

# Rows per cluster label after the change.
data['cluster'].value_counts()

In [None]:
# Third pass: change min_samples to 3 on the same estimator (any eps set on
# it earlier is left untouched) and refit.
clusterer.set_params(min_samples=3)
data = cluster_location(clusterer, data)

# Rows per cluster label after the change.
data['cluster'].value_counts()

In [None]:
# Fresh estimator with the settings tried in the previous cells (eps=9,
# min_samples=3) and the Euclidean metric named explicitly.
best_euclidean_clusterer = DBSCAN(
    eps=9, min_samples=3, metric="euclidean",
)
# Refit; this replaces the labels currently stored in data["cluster"].
data = cluster_location(best_euclidean_clusterer, data)

In [None]:
import numpy as np

def great_circle_distance(coord1, coord2, radius=3956):
    """
    Calculate the great-circle (haversine) distance between coordinates.

    Parameters
    ----------
    coord1, coord2 : array-like
        Coordinates in degrees, latitude first and longitude second. Either a
        single ``(lat, lon)`` pair, or an array whose first row holds
        latitudes and whose second row holds longitudes. In the array case
        one distance is computed per column.
    radius : float, default 3956
        Radius of the sphere. The result is expressed in the same unit as
        ``radius``; the default value yields miles.

    Returns
    -------
    float or numpy.ndarray
        A scalar for a single pair, or one distance per column for array
        input. When ``coord1`` and ``coord2`` are equal the scalar ``0.0`` is
        returned regardless of the input shape.
    """
    # Cheap shortcut for identical inputs.
    if np.array_equal(coord1, coord2):
        return 0.0

    # Convert lat/lon to radians
    coord1, coord2 = np.radians(coord1), np.radians(coord2)
    # Row 0 is latitude, row 1 is longitude.
    delta_lat, delta_lon = coord2 - coord1

    # Haversine formula. The cosine/sine terms are multiplied element-wise:
    # reducing them with a product over a stacked list would collapse array
    # input to a single scalar, mixing all points together (and `np.product`
    # no longer exists in NumPy 2.0).
    haversin = (
        np.sin(delta_lat / 2) ** 2
        + np.cos(coord1[0]) * np.cos(coord2[0]) * np.sin(delta_lon / 2) ** 2
    )
    # Rounding can push the value marginally outside [0, 1] (e.g. for nearly
    # antipodal points), which would make arcsin return NaN.
    haversin = np.clip(haversin, 0.0, 1.0)

    # Central angle times radius gives the distance in the unit of `radius`.
    return 2 * radius * np.arcsin(haversin ** 0.5)

In [None]:
# Peek at the first rows; the distance demo that follows uses rows 0, 1 and 2.
data.head()

In [None]:
# Build two 2 x 2 coordinate matrices (row 0 = latitudes, row 1 = longitudes)
# so that each column is one point:
#   coord1 columns -> data rows 0 and 1
#   coord2 columns -> data rows 2 and 0
lat_lon = data[["latitude", "longitude"]]
coord1 = lat_lon.iloc[[0, 1]].to_numpy().T
coord2 = lat_lon.iloc[[2, 0]].to_numpy().T

# Distances for the column-wise pairs (row 0 vs row 2) and (row 1 vs row 0).
great_circle_distance(coord1, coord2)

In [None]:
# DBSCAN driven by the custom great-circle metric. eps is expressed in the
# unit returned by `great_circle_distance`, i.e. miles with its default radius.
great_circle_clusterer = DBSCAN(eps=250, min_samples=4, metric=great_circle_distance)

# Cluster using great circle distance; the column names are spelled out
# explicitly (latitude first, longitude second).
data = cluster_location(
    great_circle_clusterer, data, lat_string="latitude", lon_string="longitude"
)

# Rows per cluster label.
data["cluster"].value_counts()

In [None]:
# Same eps/min_samples as the Euclidean run, but with the Manhattan metric.
manhattan_clusterer = DBSCAN(
    eps=9, min_samples=3, metric="manhattan",
)
# NOTE(review): cluster_location overwrites data["cluster"], so when the
# notebook is run top to bottom every later cell (plot, per-country counts,
# CSV exports) uses these Manhattan labels rather than the great-circle ones.
# Confirm that this is the intended final clustering.
data = cluster_location(manhattan_clusterer, data)

In [None]:
from sklearn.cluster import KMeans

# KMeans with scikit-learn's default parameters, shown for reference.
# NOTE(review): this instance is not used by any later cell visible in this
# notebook; the inertia cell builds its own KMeans objects.
kmeans_clusterer = KMeans()
kmeans_clusterer

In [None]:
# Fit k-means for k = 1..10 on the raw coordinates and collect each model's
# inertia (presumably as input for an elbow plot).
# A fixed random_state makes the centroid initialisation, and therefore the
# inertia values, reproducible across kernel restarts.
coordinates = data[['latitude', 'longitude']]

inertia_values = [
    KMeans(n_clusters=k, random_state=42).fit(coordinates).inertia_
    for k in range(1, 11)
]

In [None]:
# Bar chart of how many rows carry each cluster label.
ax = data['cluster'].value_counts().plot.bar(title='Cluster Distribution')
# Label both axes so the figure can be read on its own; assigning to `_`
# keeps the return value of `set` out of the cell output.
_ = ax.set(xlabel='Cluster label', ylabel='Number of rows')

In [None]:
# Count rows per cluster label within every country code, then keep only the
# 'US' slice (raises KeyError if no row has that country code).
cluster_counts_by_country = data.groupby('countrycode')['cluster'].value_counts()
cluster_counts_by_country.loc['US']

In [None]:
# Persist the full frame, including the current "cluster" column, to disk.
data.to_csv('processed_headlines_clustered.csv')

In [None]:
# Preview the first ten rows of the columns that go into the slim export.
data[['headline', 'cities', 'latitude', 'longitude', 'countrycode', 'cluster']].head(10)

In [None]:
# Write a slim export holding only the headline, location and cluster columns
# (a second file, separate from 'processed_headlines_clustered.csv').
data[['headline', 'cities', 'latitude', 'longitude', 'countrycode', 'cluster']].to_csv('clustered_data.csv')