# Final Project - Pre-Processing

In [1]:
import re
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import types, Row, Column
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [2]:
# Create (or reuse) the Spark session for this notebook and expose its context.
from pyspark.sql import SparkSession

app_name = "finalProject"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# read in raw text set and write to parquet
# train = spark.read.option('header', 'false').csv('data/train.txt', sep='\t')
# train.write.format('parquet').save('data/train.parquet')

In [36]:
# read in parquet
train = spark.read.format('parquet').load('data/train.parquet')

In [37]:
# Rename every column positionally in a single pass:
#   column 0        -> 'label'
#   columns 1..13   -> 'n0'..'n12'
#   columns 14..end -> 'c0', 'c1', ...
# One toDF call builds one projection instead of a long chain of
# withColumnRenamed calls, and no intermediate underscore-stripping is needed
# because every column receives a fresh name anyway.
new_names = (
    ['label']
    + ['n' + str(i) for i in range(len(train.columns[1:14]))]
    + ['c' + str(i) for i in range(len(train.columns[14:]))]
)
train = train.toDF(*new_names)

train.columns

['label',
 'n0',
 'n1',
 'n2',
 'n3',
 'n4',
 'n5',
 'n6',
 'n7',
 'n8',
 'n9',
 'n10',
 'n11',
 'n12',
 'c0',
 'c1',
 'c2',
 'c3',
 'c4',
 'c5',
 'c6',
 'c7',
 'c8',
 'c9',
 'c10',
 'c11',
 'c12',
 'c13',
 'c14',
 'c15',
 'c16',
 'c17',
 'c18',
 'c19',
 'c20',
 'c21',
 'c22',
 'c23',
 'c24',
 'c25']

In [38]:
# cast the first 14 columns (label + numerical) to float; the rest pass through unchanged
float_names = set(train.columns[:14])
train = train.select([
    train[name].cast('float').alias(name) if name in float_names else train[name]
    for name in train.columns
])
train.printSchema()

root
 |-- label: float (nullable = true)
 |-- n0: float (nullable = true)
 |-- n1: float (nullable = true)
 |-- n2: float (nullable = true)
 |-- n3: float (nullable = true)
 |-- n4: float (nullable = true)
 |-- n5: float (nullable = true)
 |-- n6: float (nullable = true)
 |-- n7: float (nullable = true)
 |-- n8: float (nullable = true)
 |-- n9: float (nullable = true)
 |-- n10: float (nullable = true)
 |-- n11: float (nullable = true)
 |-- n12: float (nullable = true)
 |-- c0: string (nullable = true)
 |-- c1: string (nullable = true)
 |-- c2: string (nullable = true)
 |-- c3: string (nullable = true)
 |-- c4: string (nullable = true)
 |-- c5: string (nullable = true)
 |-- c6: string (nullable = true)
 |-- c7: string (nullable = true)
 |-- c8: string (nullable = true)
 |-- c9: string (nullable = true)
 |-- c10: string (nullable = true)
 |-- c11: string (nullable = true)
 |-- c12: string (nullable = true)
 |-- c13: string (nullable = true)
 |-- c14: string (nullable = true)
 |-- c15: st

In [54]:
# grab a 0.1% sample without replacement
# The seed is fixed so that the sample, and everything derived from it, is the
# same after a kernel restart; an unseeded sample draws different rows each run.
s = train.sample(withReplacement=False, fraction=0.001, seed=666)
s.count()

45848

In [55]:
# Inspect the first row of the sample.
s.head()

Row(label=0.0, n0=None, n1=12.0, n2=2.0, n3=7.0, n4=65252.0, n5=None, n6=None, n7=7.0, n8=None, n9=None, n10=None, n11=None, n12=7.0, c0='05db9164', c1='207b2d81', c2='80b25c0d', c3='8fa3275d', c4='25c83c98', c5='7e0ccccf', c6='800354a7', c7='1f89b562', c8='7cc72ec2', c9='668f09d5', c10='c851b930', c11='8e15c24e', c12='a98bc52f', c13='b28479f6', c14='3c767806', c15='8e2fc5b4', c16='07c540c4', c17='395856b0', c18='21ddcdc9', c19='b1252a9d', c20='09192c91', c21=None, c22='3a171ecb', c23='abd6caea', c24='001f3601', c25='aae032ad')

# Train/Test Split

In [56]:
# Split the sample with a fixed seed and cache both parts.
# NOTE(review): weights [1.0, 9.0] send roughly 10% of the rows to trainSample
# and 90% to testSample -- confirm this ratio is intended.
trainSample, testSample = (
    part.cache() for part in s.randomSplit([1.0, 9.0], 666)
)

In [57]:
# Row counts of the train and test splits.
trainSample.count(), testSample.count()

(4678, 41170)

# Normalize numerical data

In [58]:
%%time
# Summary statistics for columns 1-13 of the training split; describe() reports
# its values as strings, so min/max are converted to Python floats.
stats = trainSample[trainSample.columns[1:14]].describe()
summary_rows = {row['summary']: row for row in stats.collect()}
# Position 0 of each row is the 'summary' label itself; the statistics follow.
maxes = [float(value) for value in summary_rows['max'][1:]]
mins = [float(value) for value in summary_rows['min'][1:]]

CPU times: user 20 ms, sys: 20 ms, total: 40 ms
Wall time: 2.04 s


In [59]:
# min-max scale columns 1-13 using the bounds computed from the training split
for name, lo, hi in zip(trainSample.columns[1:14], mins, maxes):
    scaled = (trainSample[name] - lo) / (hi - lo)
    trainSample = trainSample.withColumn(name, scaled)

In [60]:
# Inspect the first row of columns 1-13 after scaling.
trainSample.select(trainSample.columns[1:14]).head()

Row(n0=None, n1=0.0, n2=0.00047080979284369113, n3=0.012658227848101266, n4=0.0045070462648655095, n5=0.01849537615596101, n6=0.0014814814814814814, n7=0.03768844221105527, n8=0.014874371859296482, n9=None, n10=0.022727272727272728, n11=None, n12=0.005154639175257732)

In [61]:
# TODO: decide on proper NA handling; for now nulls in columns 1-13 are replaced with 0
numeric_subset = trainSample.columns[1:14]
trainSample = trainSample.fillna(0, subset=numeric_subset)

In [62]:
# Inspect the first row of columns 1-13 again, now that nulls have been filled.
trainSample.select(trainSample.columns[1:14]).head()

Row(n0=0.0, n1=0.0, n2=0.00047080979284369113, n3=0.012658227848101266, n4=0.0045070462648655095, n5=0.01849537615596101, n6=0.0014814814814814814, n7=0.03768844221105527, n8=0.014874371859296482, n9=0.0, n10=0.022727272727272728, n11=0.0, n12=0.005154639175257732)

# Categorical manipulation

In [63]:
# counts = trainSample.select(trainSample.columns[14:]).summary('count')

In [64]:
# c = trainSample.groupBy('c0').count()
# c.collect()[0]
# # type(c)
# # c = c.orderBy(c.count.desc()).collect()

In [65]:
# def findInfrequentValues(c, n=10):
#     # c is the column that we are operating on
#     # 
#     counts = trainSample.groupBy(c).count()
#     infrequentValues = counts.filter(counts['count'] <= n)
#     s = infrequentValues.agg(F.collect_set(c)).collect()[0][0]
#     return s

In [66]:
# names = replaceInfrequentValues('c0')
# type(names)

In [67]:
# df = trainSample.withColumn('c0', F.when(trainSample['c0'] == names[0], '999').otherwise(trainSample['c0']))

In [68]:
# df.select(df['c0'] == names[0]).collect()

In [69]:
# names[0]

In [70]:
# int('0x' + names[0], 16)

In [71]:
def hashValues(row):
    """Map a hexadecimal string to one of 10000 buckets, returned as a string.

    Args:
        row: a single value (despite the name, not a whole row): a string of
            hex digits such as '05db9164', or None for a missing value.

    Returns:
        str: the integer value of the hex string modulo 10000, as a string.
        A None input yields the literal string 'None'.

    Raises:
        ValueError: if `row` is a string that is not valid hexadecimal.
    """
    # Identity check: `is None` cannot be fooled by objects overriding __eq__/__ne__.
    if row is None:
        return str(row)
    return str(int('0x' + row, 16) % 10000)

In [72]:
# Wrap hashValues as a Spark UDF; the return type is spelled out (string is also the default).
udf_object = F.udf(hashValues, types.StringType())

In [73]:
# Peek at the first raw 'c0' value; head() fetches one row instead of
# collecting the whole column to the driver.
trainSample.select('c0').head()

Row(c0='05db9164')

In [74]:
# Try the UDF on 'c0' without modifying trainSample; head() fetches one row
# instead of collecting the whole column to the driver.
trainSample.withColumn('c0', udf_object(trainSample['c0'])).select('c0').head()

Row(c0='5684')

In [75]:
# Overwrite every column from position 14 onward with its hashed value.
for name in trainSample.columns[14:]:
    hashed = udf_object(trainSample[name])
    trainSample = trainSample.withColumn(name, hashed)
trainSample = trainSample.cache()

In [76]:
# Inspect the first row after hashing.
trainSample.head()

Row(label=0.0, n0=0.0, n1=0.0, n2=0.00047080979284369113, n3=0.012658227848101266, n4=0.0045070462648655095, n5=0.01849537615596101, n6=0.0014814814814814814, n7=0.03768844221105527, n8=0.014874371859296482, n9=0.0, n10=0.022727272727272728, n11=0.0, n12=0.005154639175257732, c0='5684', c1='6336', c2='4612', c3='2489', c4='9704', c5='8079', c6='5781', c7='2565', c8='6944', c9='763', c10='4000', c11='4117', c12='9058', c13='2527', c14='5475', c15='460', c16='7684', c17='7627', c18='None', c19='None', c20='5567', c21='None', c22='8905', c23='9305', c24='None', c25='None')

In [77]:
%%time
# Fit one StringIndexer per column from position 14 onward and append its
# output as '<column>_idx'; handleInvalid='keep' is passed to every indexer.
source_cols = trainSample.columns[14:]
for src in source_cols:
    fitted = StringIndexer(
        inputCol=src,
        outputCol=src + '_idx',
        handleInvalid='keep',
    ).fit(trainSample)
    trainSample = fitted.transform(trainSample)
trainSample = trainSample.cache()

CPU times: user 470 ms, sys: 280 ms, total: 750 ms
Wall time: 12.1 s


In [78]:
# Preview the columns at positions 40-42.
trainSample.select(trainSample.columns[40:43]).show()

+------+------+------+
|c0_idx|c1_idx|c2_idx|
+------+------+------+
|   0.0|  13.0|   8.0|
|   1.0|  37.0|   1.0|
|   1.0|   0.0|1532.0|
|  13.0|   2.0| 203.0|
|   0.0|   4.0| 919.0|
|   0.0|  11.0|   2.0|
|   0.0|  40.0| 911.0|
|   2.0| 126.0|2269.0|
|  84.0|  21.0|1614.0|
|   2.0|  26.0|1829.0|
|   0.0|  21.0|2381.0|
|   0.0|   0.0| 151.0|
|   1.0|   0.0| 462.0|
|   3.0|   6.0|   4.0|
|   0.0|  14.0|  81.0|
|   0.0|  22.0|  14.0|
|   2.0|   9.0|  25.0|
|   0.0|  23.0|   1.0|
|   0.0|   4.0| 902.0|
|   1.0|   1.0|  72.0|
+------+------+------+
only showing top 20 rows



In [79]:
# One-hot encode every '<column>_idx' column into a matching '<column>_OHE' column.
idx_suffix = '_idx'
# Select the index columns by name rather than by position so the selection
# does not depend on how many columns precede them.
cols = [c for c in trainSample.columns if c.endswith(idx_suffix)]
# Remove the suffix by slicing: str.strip('_idx') treats its argument as a set
# of characters and trims any of '_', 'i', 'd', 'x' from BOTH ends of the name.
outputCols = [c[:-len(idx_suffix)] + '_OHE' for c in cols]
encoder = OneHotEncoderEstimator(inputCols=cols, outputCols=outputCols)
trainSample = encoder.fit(trainSample).transform(trainSample).cache()

In [80]:
# Inspect the first row of every column from position 40 onward.
trainSample.select(trainSample.columns[40:]).head()

Row(c0_idx=0.0, c1_idx=13.0, c2_idx=8.0, c3_idx=11.0, c4_idx=0.0, c5_idx=0.0, c6_idx=86.0, c7_idx=1.0, c8_idx=0.0, c9_idx=94.0, c10_idx=33.0, c11_idx=12.0, c12_idx=35.0, c13_idx=1.0, c14_idx=37.0, c15_idx=11.0, c16_idx=1.0, c17_idx=20.0, c18_idx=0.0, c19_idx=0.0, c20_idx=10.0, c21_idx=0.0, c22_idx=2.0, c23_idx=8.0, c24_idx=0.0, c25_idx=0.0, c19_OHE=SparseVector(4, {0: 1.0}), c6_OHE=SparseVector(1903, {86: 1.0}), c16_OHE=SparseVector(9, {1: 1.0}), c10_OHE=SparseVector(1347, {33: 1.0}), c14_OHE=SparseVector(1329, {37: 1.0}), c21_OHE=SparseVector(8, {0: 1.0}), c20_OHE=SparseVector(2465, {10: 1.0}), c4_OHE=SparseVector(49, {0: 1.0}), c12_OHE=SparseVector(1167, {35: 1.0}), c25_OHE=SparseVector(1116, {0: 1.0}), c5_OHE=SparseVector(8, {0: 1.0}), c15_OHE=SparseVector(2344, {11: 1.0}), c24_OHE=SparseVector(35, {0: 1.0}), c2_OHE=SparseVector(2646, {8: 1.0}), c8_OHE=SparseVector(3, {0: 1.0}), c22_OHE=SparseVector(13, {2: 1.0}), c13_OHE=SparseVector(22, {1: 1.0}), c1_OHE=SparseVector(330, {13: 1.0

In [81]:
# Feature columns: the numeric columns (named 'n' followed by digits) and the
# one-hot vectors (names ending in '_OHE'). Exact name patterns are used because
# a substring test such as `'n' in c` would also pick up any other column whose
# name merely contains the letter 'n'.
cols = [
    c for c in trainSample.columns
    if re.fullmatch(r'n\d+', c) or c.endswith('_OHE')
]

In [82]:
# Pack the selected columns into a single vector column named "features".
v = VectorAssembler().setInputCols(cols).setOutputCol("features")
trainSample = v.transform(trainSample)

In [83]:
# Check the name of the last column.
trainSample.columns[-1]

'features'

In [85]:
# Preview the first five rows of the 'features' column.
trainSample.select('features').head(5)

[Row(features=SparseVector(24008, {2: 0.0005, 3: 0.0127, 4: 0.0045, 5: 0.0185, 6: 0.0015, 7: 0.0377, 8: 0.0149, 10: 0.0227, 12: 0.0052, 13: 1.0, 103: 1.0, 1921: 1.0, 1962: 1.0, 3313: 1.0, 4605: 1.0, 4623: 1.0, 7078: 1.0, 7162: 1.0, 8294: 1.0, 9410: 1.0, 9429: 1.0, 11762: 1.0, 11805: 1.0, 14443: 1.0, 14448: 1.0, 14460: 1.0, 14494: 1.0, 14823: 1.0, 17333: 1.0, 18154: 1.0, 18494: 1.0, 18618: 1.0, 20181: 1.0, 21848: 1.0, 21933: 1.0})),
 Row(features=SparseVector(24008, {1: 0.0003, 4: 0.0024, 7: 0.0528, 8: 0.0072, 13: 1.0, 657: 1.0, 1925: 1.0, 3103: 1.0, 3827: 1.0, 4606: 1.0, 4614: 1.0, 7078: 1.0, 7613: 1.0, 8294: 1.0, 9410: 1.0, 9419: 1.0, 11762: 1.0, 11798: 1.0, 14443: 1.0, 14450: 1.0, 14460: 1.0, 14518: 1.0, 14812: 1.0, 17546: 1.0, 18154: 1.0, 18495: 1.0, 18612: 1.0, 20087: 1.0, 21847: 1.0, 21922: 1.0})),
 Row(features=SparseVector(24008, {1: 0.0003, 4: 0.0092, 6: 0.0133, 8: 0.0665, 10: 0.0341, 13: 1.0, 906: 1.0, 1924: 1.0, 3025: 1.0, 3925: 1.0, 4605: 1.0, 6041: 1.0, 7078: 1.0, 7476: 1.0

In [86]:
# Persist the processed training sample. mode('overwrite') replaces the output
# of a previous run; without it the write raises because the path already
# exists, so the notebook could not be re-run from a fresh kernel.
trainSample.write.format('parquet').mode('overwrite').save('data/trainSample.parquet')