In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from haversine import haversine
from itertools import izip
import csv

In [2]:
column_names = [
    "lat", 
    "lon", 
    "id", 
    "source_id", 
    "account_id", 
    "title", 
    "created_on", 
    "updated_on", 
    "start_ts", 
    "until_ts",
    "report_type",
    "notes", 
    "layer_id",
    "severity"
]

target_columns = [
    "lat",
    "lon",
    "id",
    "title",
    "start_ts",
    "report_type",
]

In [3]:
data_filepath = "../data/2016/all_2016.txt"

# Load sample dataset
df = pd.read_table(data_filepath, header=None, names=column_names)

# Drop rows with NaNs (this is admittedly ugly and the final model will be much more precise)
df.dropna(axis=0, how="any", inplace=True)

# Drop columns we won't need
df = df[target_columns]

# Convert timestamps from Unix Epoch time to Date Time Groups
df["start_ts"] = pd.to_datetime(df["start_ts"], unit="s", errors="ignore")

# Build severity score columns
# df["severity_score"] = df["severity"].map(severity_score)
# df["severity_quadratic"] = df["severity"].map(severity_score_quadratic)
# df["severity_log"] = df["severity"].map(severity_score_log)
# df["severity_exp"] = df["severity"].map(severity_score_exp)

# Trim reports from outside the specified date range
start = pd.to_datetime("2016-01-01")
end = pd.to_datetime("2017-01-01")
df = df[df["start_ts"] > start]
df = df[df["start_ts"] < end]

print df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1377451 entries, 0 to 1767286
Data columns (total 7 columns):
lat            1377451 non-null float64
lon            1377451 non-null float64
id             1377451 non-null object
title          1377451 non-null object
start_ts       1377451 non-null datetime64[ns]
report_type    1377451 non-null object
severity       1377451 non-null object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 84.1+ MB
None


In [4]:
cities_df = pd.read_csv("../data/cities300000.csv")

print cities_df.info()
print cities_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1375 entries, 0 to 1374
Data columns (total 4 columns):
name            1375 non-null object
latitude        1375 non-null float64
longitude       1375 non-null float64
country_code    1375 non-null object
dtypes: float64(2), object(2)
memory usage: 43.0+ KB
None
             name  latitude  longitude country_code
0           Dubai  25.06570   55.17128           AE
1         Sharjah  25.33737   55.41206           AE
2          Al Ain  24.19167   55.76056           AE
3       Abu Dhabi  24.46667   54.36667           AE
4  Mazār-e Sharīf  36.70904   67.11087           AF


In [5]:
df["lat_long"] = zip(df["lat"], df["lon"])
cities_df["lat_long"] = zip(cities_df["latitude"], cities_df["longitude"])

In [6]:
city_label_indices = []
for report in df["lat_long"]:
    distances = [haversine(report, city) for city in cities_df["lat_long"]]
    city_label_indices.append(np.argmin(distances))
    
city_labels = []
for index in city_label_indices:
    city_labels.append(cities_df.ix[index, "name"])
    
labels_df = pd.DataFrame(city_labels)
labels_df.to_csv("../data/2016/2016_city_labels.csv", header=False, mode="w")

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
