# Initial PySpark Implementation

### Imports

In [47]:
import py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from haversine import haversine
from itertools import izip
import pyspark as ps
from pyspark.sql.types import *
from pyspark.sql.types import StructType



### Spark Session and Initial Variables

In [87]:
# Start (or reuse) a local Spark session for this notebook, using 4 local threads.
# NOTE: pyspark is imported as `ps` in the imports cell, so the bare name
# `pyspark` is not bound; referring to it raises NameError on a fresh kernel.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("Spark_EDA")
    .getOrCreate()
)

# Low-level SparkContext handle backing the session.
sc = spark.sparkContext

In [144]:
# Explicit schema for the reports TSV, listed in file column order.
# Every column is declared nullable.
schema = (
    StructType()
    .add("lat", FloatType(), True)
    .add("lon", FloatType(), True)
    .add("id", StringType(), True)
    .add("source_id", StringType(), True)
    .add("account_id", StringType(), True)
    .add("title", StringType(), True)
    .add("created_on", IntegerType(), True)
    .add("updated_on", StringType(), True)
    .add("start_ts", IntegerType(), True)
    .add("until_ts", StringType(), True)
    .add("report_type", StringType(), True)
    .add("notes", StringType(), True)
    .add("layer_id", StringType(), True)
    .add("severity", StringType(), True)
)

# Subset of schema columns kept for the analysis, in output order.
target_columns = [
    "lat", "lon",
    "id", "title",
    "created_on", "start_ts",
    "report_type", "severity",
]

### Shell command to clean quoted newlines

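Run this in a shell before loading the data. It replaces newlines that occur inside double-quoted fields with spaces, so that each record occupies a single line. It relies on `RT`, which is a GNU awk (gawk) extension.

```bash
awk -v RS='"' 'NR % 2 == 0 { gsub(/\n/, " ") } { printf("%s%s", $0, RT) }' input_file > output_file
```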

In [145]:
# Relative path to the tab-separated reports file that the load cell reads.
data_filepath = "data/reports_12DEC16-26DEC16.tsv"

### Load Data

In [146]:
# Load the raw tab-separated reports using the explicit schema.
# header=None leaves Spark's default in place (first line is treated as data).
reports_raw = spark.read.csv(
    data_filepath,
    sep="\t",
    schema=schema,
    header=None,
    quote='"',
)

# Build the working frame from the frame loaded above. The previous version
# read from `data_df`, which is never defined in this notebook, and its second
# assignment discarded the result of dropna().
reports_df = (
    reports_raw
    # Drop Nulls - will revisit. This drops a row if ANY schema column is null,
    # including columns that are not kept below.
    .dropna()
    # Keep only target columns
    .select(target_columns)
)

# Create Lat/Long columns

# Create Severity Features columns

# Convert Timestamps to date time groups




In [147]:
# Sanity check: print the column names/types, then fetch the first row to eyeball the values.
reports_df.printSchema()
reports_df.take(1)

root
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- created_on: integer (nullable = true)
 |-- start_ts: integer (nullable = true)
 |-- report_type: string (nullable = true)
 |-- severity: string (nullable = true)



[Row(lat=37.98381042480469, lon=23.7275390625, id=u'KFg4bXSq5hGyhPkeGX1vrg', title=u'Security Message for U.S. Citizens: Athens (Greece), Planned Demonstration on November 14', created_on=1481760000, start_ts=1479132770, report_type=u'OSAC', severity=u'unrated')]

## Label Reports with Cities

### Load Cities Data and Build Lat/Long Column

### Calculate Haversine Distances

### Apply City Labels

## Time Series Analysis

Show several cities, then Berlin


## Conclusions