# Clean and Save Cleaned Data

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Reuse the SparkSession that is already active in this kernel, or create one
# if none exists. Unlike SparkSession(sc), this does not need a pre-existing
# `sc` variable, so the cell also runs after a kernel restart.
spark = SparkSession.builder.getOrCreate()
# Expose the session's SparkContext under the conventional name `sc`.
sc = spark.sparkContext

In [3]:
# S3 location of the raw input file that is loaded below
filepath = 's3a://final-kmeans/raw/sample_geo.txt'

In [8]:
# Load the tab-delimited source file into a DataFrame, taking the column names
# from its first line. The frame is then marked for persistence so that later
# operations can reuse it instead of going back to the S3 bucket every time.
df_sg = spark.read.csv(
    filepath,
    header=True,
    sep='\t',
)
df_sg.persist()

DataFrame[Latitude: string, Longitude: string, LocationID: string]

In [9]:
# Preview the first five rows of the freshly loaded data
df_sg.show(n=5)

+-----------+------------+----------+
|   Latitude|   Longitude|LocationID|
+-----------+------------+----------+
|37.77253945|-77.49954987|         1|
|42.09013298|-87.68915558|         2|
|39.56341754|-75.58753204|         3|
|39.45302347|-87.69374084|         4|
| 38.9537989|-77.01656342|         5|
+-----------+------------+----------+
only showing top 5 rows



In [10]:
# Inspect the schema to check the column datatypes. The read above neither
# supplied nor inferred a schema, and the output shows every column is still
# a string at this point.
df_sg.schema

StructType(List(StructField(Latitude,StringType,true),StructField(Longitude,StringType,true),StructField(LocationID,StringType,true)))

In [11]:
# Convert the string columns to numeric types and give them lower-case names;
# LocationID is renamed to location.
df_sg = df_sg.selectExpr(
    'cast(Latitude as float) as latitude',
    'cast(Longitude as float) as longitude',
    'cast(LocationID as integer) as location',
)

In [12]:
# Preview the first five rows after the type conversion
df_sg.show(n=5)

+---------+----------+--------+
| latitude| longitude|location|
+---------+----------+--------+
| 37.77254| -77.49955|       1|
|42.090134|-87.689156|       2|
| 39.56342| -75.58753|       3|
|39.453022| -87.69374|       4|
|  38.9538| -77.01656|       5|
+---------+----------+--------+
only showing top 5 rows



In [14]:
# Keep only rows where neither coordinate is exactly 0.0; points with a zero
# latitude or longitude are treated as likely false locations.
df_sg = df_sg.filter(
    (df_sg['latitude'] != 0.0) & (df_sg['longitude'] != 0.0)
)

In [15]:
# Save the cleaned dataframe to the S3 bucket in CSV format
outpath = 's3a://final-kmeans/clean'
# mode='overwrite' replaces output left behind by a previous run. With the
# default mode the write raises when the target path already exists, so the
# notebook could not be re-run from top to bottom.
# header=False spells out the writer's default: no header row is written, so
# the output holds data rows only, in the order latitude, longitude, location.
df_sg.write.csv(outpath+'/synthetic.csv', mode='overwrite', header=False)