# PreProcessing CSV Files for use with Tensorflow

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType

In [2]:
# initiates a SparkContext which is necessary for accessing data in Spark
sc = SparkContext()
sqlContext = SQLContext(sc)
# change to match your environment
# data_dir = "Data/merge_data"
# just gonna keep it commented out instead of remove
data_dir = '/home/cole/Workspace/School/Capstone/data/first_data_set/TestData/'

In [3]:
# this cell is not your part so don't worry about the issue
# loading all of them does not do a proper merge, compare the columns
df = sqlContext.read.format('com.databricks.spark.csv').option("header", "true").load([data_dir +"/SS_POOLS.csv", data_dir +"/SD_CHUNK_LOCATIONS.csv", data_dir +"/ARCHIVE_OBJECTS.csv"])

# need to load each individually then merge
SDRO = sqlContext.read.format('com.databricks.spark.csv').option("header", "true").load([data_dir + '/SD_RECON_ORDER.csv'])
SS_POOLS = sqlContext.read.format('com.databricks.spark.csv').option("header", "true").load([data_dir + '/SS_POOLS.csv'])
AFBF = sqlContext.read.format('com.databricks.spark.csv').option("header", "true").load([data_dir+"/AF_BITFILES.csv"])
BACKUP_OBJECTS = sqlContext.read.format('com.databricks.spark.csv').option("header", "true").load([data_dir+"/BACKUP_OBJECTS.csv"])

# could probably be a one liner, one of the above not used for this example
full_outer_join = BACKUP_OBJECTS.join(AFBF, BACKUP_OBJECTS.OBJID == AFBF.BFID,how='left')
full_outer_join = full_outer_join.join(SDRO, ['OBJID'],how='left') 

print("All loaded at once: {} Merge: {}".format(len(df.columns), len(full_outer_join.columns)))

All loaded at once: 10 Merge: 49


In [4]:
# reset for memory
SDRO = None
SS_POOLS = None
AFBF = None
BACKUP_OBJECTS = None

The cell below takes the four columns that we want and writes casts them as integers. We then select each of the four features and write the new data frame to the desired folder.

In [5]:
# almost all operations create a copy so you need to assign
# df.withColumn("POOLID", df["POOLID"].cast("int"))
# df.withColumn("SIZE", df["SIZE"].cast("int"))
# df.withColumn("OFFSET", df["OFFSET"].cast("int"))
# df.withColumn("LENGTH", df["LENGTH"].cast("int"))

# df.select("POOLID", "SIZE", "OFFSET", "LENGTH").write.options(header='true').format('com.databricks.spark.csv').save("Data/merge_data/4_features")

In [6]:
# assign
df = full_outer_join # this line is pure laziness
df = df.withColumn("POOLID", df["POOLID"].cast("int"))
df = df.withColumn("ATTRLENGTH", df["ATTRLENGTH"].cast("int"))
df = df.withColumn("BFSIZE", df["BFSIZE"].cast("int"))
df = df.withColumn("HDRSIZE", df["HDRSIZE"].cast("int"))
df = df.withColumn("OBJID", df["OBJID"].cast("int"))

In [7]:
# probably the more important piece missed, need to remove gaps in data
df = df.filter(df.POOLID. isNotNull())
df = df.filter(df.ATTRLENGTH. isNotNull())
df = df.filter(df.BFSIZE. isNotNull())
df = df.filter(df.HDRSIZE. isNotNull())
df = df.filter(df.OBJID. isNotNull())

In [8]:
df.select("OBJID", "POOLID", "ATTRLENGTH", "BFSIZE", "HDRSIZE").write.options(header='true').format('com.databricks.spark.csv').save(data_dir + "/merge_data/4_features")

AnalysisException: 'path file:/home/cole/Workspace/School/Capstone/data/first_data_set/TestData/merge_data/4_features already exists.;'

The cell below then goes and reads the CSV files generated by the cell above and prints out all the new column names and types to make sure we outputted what we wanted.

In [None]:
test_df = sqlContext.read.format('com.databricks.spark.csv').option("header", "true").load(data_dir + "/merge_data/4_features/*.csv")
test_df.columns

In [None]:
sc.stop()