# Final Project - Pre-Processing

In [1]:
import re
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import types, Row, Column
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "finalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [3]:
# read in raw text set and write to parquet
# train = spark.read.option('header', 'false').csv('data/train.txt', sep='\t')
# train.write.format('parquet').save('data/train.parquet')

In [3]:
# read in parqet
train = spark.read.parquet('data/train.parquet')

In [4]:
# rename label column
train = train.withColumnRenamed('_c0', 'label')

# remove underscores
for c in train.columns[1:]:
    train = train.withColumnRenamed(c, c.strip('_'))

for i,c in enumerate(train.columns[1:14]):
    newName = 'n' + str(i)
    train = train.withColumnRenamed(c, newName)
    
for i,c in enumerate(train.columns[14:]):
    newName = 'c' + str(i)
    train = train.withColumnRenamed(c, newName)

train.columns

['label',
 'n0',
 'n1',
 'n2',
 'n3',
 'n4',
 'n5',
 'n6',
 'n7',
 'n8',
 'n9',
 'n10',
 'n11',
 'n12',
 'c0',
 'c1',
 'c2',
 'c3',
 'c4',
 'c5',
 'c6',
 'c7',
 'c8',
 'c9',
 'c10',
 'c11',
 'c12',
 'c13',
 'c14',
 'c15',
 'c16',
 'c17',
 'c18',
 'c19',
 'c20',
 'c21',
 'c22',
 'c23',
 'c24',
 'c25']

In [5]:
# cast numerical is float
for c in train.columns[:14]:
    train = train.withColumn(c, train[c].cast('float'))
train.printSchema()

root
 |-- label: float (nullable = true)
 |-- n0: float (nullable = true)
 |-- n1: float (nullable = true)
 |-- n2: float (nullable = true)
 |-- n3: float (nullable = true)
 |-- n4: float (nullable = true)
 |-- n5: float (nullable = true)
 |-- n6: float (nullable = true)
 |-- n7: float (nullable = true)
 |-- n8: float (nullable = true)
 |-- n9: float (nullable = true)
 |-- n10: float (nullable = true)
 |-- n11: float (nullable = true)
 |-- n12: float (nullable = true)
 |-- c0: string (nullable = true)
 |-- c1: string (nullable = true)
 |-- c2: string (nullable = true)
 |-- c3: string (nullable = true)
 |-- c4: string (nullable = true)
 |-- c5: string (nullable = true)
 |-- c6: string (nullable = true)
 |-- c7: string (nullable = true)
 |-- c8: string (nullable = true)
 |-- c9: string (nullable = true)
 |-- c10: string (nullable = true)
 |-- c11: string (nullable = true)
 |-- c12: string (nullable = true)
 |-- c13: string (nullable = true)
 |-- c14: string (nullable = true)
 |-- c15: st

In [6]:
# grab a sample
s = train.sample(False, 0.001)
s.count()

46025

In [7]:
s.head()

Row(label=0.0, n0=1.0, n1=11.0, n2=3.0, n3=3.0, n4=201.0, n5=3.0, n6=1.0, n7=3.0, n8=3.0, n9=1.0, n10=1.0, n11=None, n12=3.0, c0='05db9164', c1='207b2d81', c2='d9e96c5e', c3='0fa0d423', c4='25c83c98', c5='fbad5c96', c6='9c8ed289', c7='0b153874', c8='a73ee510', c9='dc650390', c10='a7b606c4', c11='8c51950e', c12='eae197fd', c13='cfef1c29', c14='f2237c1b', c15='6bb29970', c16='d4bb7bd8', c17='99cb5912', c18='21ddcdc9', c19='b1252a9d', c20='8f18d8d5', c21=None, c22='3a171ecb', c23='8fc66e78', c24='001f3601', c25='f37f3967')

# Train/Test Split

In [8]:
# on sample
trainSample, testSample = s.randomSplit([1.0, 9.0], 666)
trainSample = trainSample.cache()
testSample = testSample.cache()

In [9]:
trainSample.count(), testSample.count()

(4639, 40805)

In [6]:
# on full
train, test = train.randomSplit([1.0, 9.0], 666)
train = train.cache()
test = test.cache()

In [None]:
%%time
train.count(), test.count()

# Normalize numerical data

In [None]:
# %%time
# stats = trainSample[trainSample.columns[1:14]].describe()
# maxes = np.array(stats[stats['summary'] == 'max'].collect())[0][1:]
# mins = np.array(stats[stats['summary'] == 'min'].collect())[0][1:]
# maxes = [float(m) for m in maxes]
# mins = [float(m) for m in mins]

In [59]:
# # normalize columns
# for i,c in enumerate(trainSample.columns[1:14]):
#     trainSample = trainSample.withColumn(c, (trainSample[c] - mins[i]) / (maxes[i] - mins[i]))

In [None]:
# trainSample.select(trainSample.columns[1:14]).head()

In [61]:
# # need to decide on proper NA handling later on, for now filling with 0
# trainSample = trainSample.na.fill(0, subset=trainSample.columns[1:14])

In [None]:
# trainSample.select(trainSample.columns[1:14]).head()

In [9]:
def normalizeNumeric(trainDf, testDf):
    stats = trainDf[trainDf.columns[1:14]].describe()
    maxes = np.array(stats[stats['summary'] == 'max'].collect())[0][1:]
    mins = np.array(stats[stats['summary'] == 'min'].collect())[0][1:]
    maxes = [float(m) for m in maxes]
    mins = [float(m) for m in mins]
    
    for i,c in enumerate(trainDf.columns[1:14]):
        trainDf = trainDf.withColumn(c, (trainDf[c] - mins[i]) / (maxes[i] - mins[i]))
        testDf = testDf.withColumn(c, (testDf[c] - mins[i]) / (maxes[i] - mins[i]))
        
    # NEED TO FIGURE THIS OUT FIRST
    trainDf = trainDf.na.fill(0, subset=trainDf.columns[1:14])
    testDf = testDf.na.fill(0, subset=testDf.columns[1:14])
    
    return trainDf.cache(), testDf.cache()

## on sample

In [10]:
# on sample
trainSample, testSample = normalizeNumeric(trainSample, testSample)

In [11]:
trainSample.head()

Row(label=0.0, n0=0.0, n1=0.0001520912547528517, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='05db9164', c1='38a947a1', c2='0e4f7fa4', c3='5deac079', c4='25c83c98', c5='7e0ccccf', c6='bca35340', c7='0b153874', c8='a73ee510', c9='3b08e48b', c10='bbf9c1a0', c11='1f6b8745', c12='a3fda569', c13='07d13a8f', c14='6c3ae388', c15='50d2b997', c16='2005abd1', c17='a2157fe3', c18=None, c19=None, c20='9c03188d', c21=None, c22='be7c41b4', c23='4b871c6a', c24=None, c25=None)

In [12]:
testSample.head()

Row(label=0.0, n0=0.0, n1=0.0001520912547528517, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='75ac2fe6', c1='38a947a1', c2=None, c3=None, c4='25c83c98', c5='7e0ccccf', c6='ba5d463e', c7='0b153874', c8='7cc72ec2', c9='3b08e48b', c10='2db82d51', c11=None, c12='6d41cadf', c13='b28479f6', c14='992454dc', c15=None, c16='2005abd1', c17='9b82aca5', c18=None, c19=None, c20=None, c21='ad3062eb', c22='32c7478e', c23=None, c24=None, c25=None)

## on full

In [9]:
%%time
train, test = normalizeNumeric(train, test)

CPU times: user 160 ms, sys: 140 ms, total: 300 ms
Wall time: 7min 8s


In [10]:
train.head()

Row(label=0.0, n0=0.0, n1=3.880812486902258e-06, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='05db9164', c1='31eb7ac1', c2='2dc2b523', c3='137a5e26', c4='25c83c98', c5='fe6b92e5', c6='ce8217f8', c7='5b392875', c8='7cc72ec2', c9='3b08e48b', c10='9d12ce9b', c11='d9510218', c12='9dfda2b9', c13='07d13a8f', c14='6cb56b0f', c15='57ddd4e0', c16='2005abd1', c17='e3f6ec41', c18=None, c19=None, c20='45458c05', c21=None, c22='be7c41b4', c23='1793a828', c24=None, c25=None)

In [None]:
test.head()

# Categorical manipulation

In [63]:
# counts = trainSample.select(trainSample.columns[14:]).summary('count')

In [64]:
# c = trainSample.groupBy('c0').count()
# c.collect()[0]
# # type(c)
# # c = c.orderBy(c.count.desc()).collect()

In [65]:
# def findInfrequentValues(c, n=10):
#     # c is the column that we are operating on
#     # 
#     counts = trainSample.groupBy(c).count()
#     infrequentValues = counts.filter(counts['count'] <= n)
#     s = infrequentValues.agg(F.collect_set(c)).collect()[0][0]
#     return s

In [66]:
# names = replaceInfrequentValues('c0')
# type(names)

In [67]:
# df = trainSample.withColumn('c0', F.when(trainSample['c0'] == names[0], '999').otherwise(trainSample['c0']))

In [68]:
# df.select(df['c0'] == names[0]).collect()

In [69]:
# names[0]

In [70]:
# int('0x' + names[0], 16)

In [15]:
# # convert hex values to integers and modulo them
# def hashValues(row):
#     if row != None:
#         return str(int('0x' + row, 16) % 10000)
#     else:
#         return str(row)

In [16]:
# udf_object = F.udf(hashValues)

In [None]:
# trainSample.select('c0').collect()[0]

In [None]:
# trainSample.withColumn('c0', udf_object(trainSample['c0'])).select('c0').collect()[0]

In [17]:
# for c in trainSample.columns[14:]:
#     trainSample = trainSample.withColumn(c, udf_object(trainSample[c]))
#     testSample = testSample.withColumn(c, udf_object(testSample[c]))

    
# # trainSample = trainSample.cache()

In [None]:
# trainSample.head()

In [None]:
# testSample.head()

In [None]:
# %%time
# for c in trainSample.columns[14:]:
#     newCol = c + '_idx'
#     indexer = StringIndexer(inputCol=c, outputCol=newCol, handleInvalid='keep')
# #     trainSample = indexer.fit(trainSample).transform(trainSample)
#     sIdx = indexer.fit(trainSample)
#     trainSample = sIdx.transform(trainSample)
#     testSample = sIdx.transform(testSample)
# trainSample = trainSample.cache()
# testSample = testSample.cache()

In [None]:
# trainSample.select(trainSample.columns[40:]).show(2)

In [None]:
# testSample.select(testSample.columns[40:]).show(2)

In [26]:
# cols = trainSample.columns[40:]
# outputCols = [c.strip('_idx') + '_OHE' for c in cols]
# encoder = OneHotEncoderEstimator(inputCols=cols, outputCols=outputCols)
# OHE = encoder.fit(trainSample)
# trainSample = OHE.transform(trainSample)
# testSample = OHE.transform(testSample)

In [None]:
# trainSample.select(trainSample.columns[-26:]).show(2)

In [None]:
# testSample.select(testSample.columns[-26:]).show(2)

In [29]:
# cols = [c for c in trainSample.columns if 'n' in c or 'OHE' in c]

In [30]:
# v = VectorAssembler(inputCols=cols, outputCol="features")
# trainSample = v.transform(trainSample)
# testSample = v.transform(testSample)

In [None]:
# trainSample.columns[-1], testSample.columns[-1]

In [None]:
# trainSample.select('features').head()

In [None]:
# testSample.select('features').head()

In [13]:
def createFeatureVector(trainDf, testDf):
    # create hash function for binning categorical variables
    def hashValues(row):
        if row != None:
            # return integer value of hex label, modulo by 10000 (keep only the last 4 digits)
            return str(int('0x' + row, 16) % 10000)
        else:
            return str(row)
    
    # create the udf object from the helper function
    udf_object = F.udf(hashValues)
    
    # hash all hex strings in both train and test
    for c in trainDf.columns[14:]:
        trainDf = trainDf.withColumn(c, udf_object(trainDf[c]))
        testDf = testDf.withColumn(c, udf_object(testDf[c]))
        
    # index the hash values into categories
    for c in trainDf.columns[14:]:
        newCol = c + '_idx'
        indexer = StringIndexer(inputCol=c, outputCol=newCol, handleInvalid='keep')
        f = indexer.fit(trainDf)
        trainDf = f.transform(trainDf)
        testDf = f.transform(testDf)
        
    # One-hot encode the categorical indices
    inputCols = trainDf.columns[40:]
    outputCols = [c.strip('_idx') + '_OHE' for c in inputCols]
    encoder = OneHotEncoderEstimator(inputCols=inputCols, outputCols=outputCols)
    e = encoder.fit(trainDf)
    trainDf = e.transform(trainDf)
    testDf = e.transform(testDf)
    
    # assemble all features into single SparseVector column
    cols = [c for c in trainDf.columns if 'n' in c or 'OHE' in c]
    v = VectorAssembler(inputCols=cols, outputCol="features")
    trainDf = v.transform(trainDf)
    testDf = v.transform(testDf)
    
    return trainDf.cache(), testDf.cache()

## on sample

In [14]:
# on sample
trainSample, testSample = createFeatureVector(trainSample, testSample)

In [15]:
trainSample.columns[-1], testSample.columns[-1]

('features', 'features')

In [16]:
trainSample.select('features').head()

Row(features=SparseVector(24135, {1: 0.0002, 13: 1.0, 448: 1.0, 1975: 1.0, 2180: 1.0, 3940: 1.0, 4686: 1.0, 5136: 1.0, 7158: 1.0, 7531: 1.0, 8424: 1.0, 9516: 1.0, 9713: 1.0, 11893: 1.0, 12462: 1.0, 14541: 1.0, 14548: 1.0, 14558: 1.0, 14580: 1.0, 15411: 1.0, 17808: 1.0, 18259: 1.0, 18598: 1.0, 18929: 1.0, 20167: 1.0, 21945: 1.0, 22148: 1.0}))

In [17]:
testSample.select('features').head()

Row(features=SparseVector(24135, {1: 0.0002, 13: 1.0, 711: 1.0, 1975: 1.0, 4687: 1.0, 4694: 1.0, 7158: 1.0, 7675: 1.0, 8424: 1.0, 9516: 1.0, 9523: 1.0, 11893: 1.0, 11929: 1.0, 14542: 1.0, 14544: 1.0, 14557: 1.0, 14580: 1.0, 14909: 1.0, 17753: 1.0, 18259: 1.0, 18610: 1.0, 18714: 1.0, 20167: 1.0, 21945: 1.0, 22017: 1.0}))

In [19]:
trainSample.write.format('parquet').save('data/trainSample.parquet')
testSample.write.format('parquet').save('data/testSample.parquet')

## on full

In [13]:
%%time
# on full
train, test = createFeatureVector(train, test)

CPU times: user 1.34 s, sys: 470 ms, total: 1.81 s
Wall time: 9min 13s


In [14]:
train.select('features').head()

Row(features=SparseVector(109465, {1: 0.0, 13: 1.0, 92: 1.0, 7000: 1.0, 7117: 1.0, 12260: 1.0, 18098: 1.0, 21277: 1.0, 28116: 1.0, 28503: 1.0, 31171: 1.0, 41156: 1.0, 47668: 1.0, 51174: 1.0, 58617: 1.0, 61259: 1.0, 61265: 1.0, 61277: 1.0, 61548: 1.0, 66279: 1.0, 72302: 1.0, 75696: 1.0, 77570: 1.0, 78910: 1.0, 88906: 1.0, 98860: 1.0, 102345: 1.0}))

In [None]:
test.select('features').head()

In [None]:
# train.write.format('parquet').save('data/trainPreProcessed.parquet')
# test.write.format('parquet').save('data/devPreProcessed.parquet')

# pipeline implementation

In [38]:
def hashCategoricals(trainDf, testDf):
    # create hash function for binning categorical variables
    def hashValues(row):
        if row != None:
            # return integer value of hex label, modulo by 10000 (keep only the last 4 digits)
            return str(int('0x' + row, 16) % 10000)
        else:
            return str(row)
    
    # create the udf object from the helper function
    udf_object = F.udf(hashValues)
    
    # hash all hex strings in both train and test
    for c in trainDf.columns[14:]:
        trainDf = trainDf.withColumn(c, udf_object(trainDf[c]))
        testDf = testDf.withColumn(c, udf_object(testDf[c]))
    
    return trainDf.cache(), testDf.cache()

In [39]:
trainSample, testSample = hashCategoricals(trainSample, testSample)

In [40]:
trainSample.head()

Row(label=0.0, n0=0.0, n1=0.0002068680182043856, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='8852', c1='8017', c2='4310', c3='8313', c4='4343', c5='2821', c6='3090', c7='84', c8='8418', c9='8539', c10='2816', c11='6109', c12='1890', c13='6422', c14='4146', c15='4642', c16='2577', c17='8733', c18='None', c19='None', c20='5227', c21='9419', c22='5356', c23='1146', c24='None', c25='None')

In [41]:
testSample.head()

Row(label=0.0, n0=0.0, n1=0.0002068680182043856, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='5684', c1='8017', c2='9656', c3='9545', c4='9704', c5='8079', c6='3090', c7='84', c8='8418', c9='8539', c10='2816', c11='1488', c12='1890', c13='2527', c14='5555', c15='7942', c16='2577', c17='7231', c18='None', c19='None', c20='1497', c21='None', c22='3739', c23='1592', c24='None', c25='None')

In [52]:
# pipeline implementation
def createFeatureVector2(trainDf, testDf):
    # generate stages for pipeline
    stages = []
    
    # create indexer to hash values into categories
    for c in trainDf.columns[14:]:
        strIdxCol = c + '_idx'
        oheCol = strIdxCol.strip('_idx') + '_OHE'
        indexer = StringIndexer(inputCol=c, outputCol=strIdxCol, handleInvalid='keep')
        OHE = OneHotEncoderEstimator(inputCols=strIdxCol, outputCols=oheCol, dropLast=False)
        stages += [indexer, OHE]
        
    # One-hot encode the categorical indices
#     inputCols = trainDf.columns[40:]
#     outputCols = [c.strip('_idx') + '_OHE' for c in inputCols]
#     encoder = OneHotEncoderEstimator(inputCols=inputCols, outputCols=outputCols, dropLast=False)
#     stages += [encoder]
#     print(stages)
#     e = encoder.fit(trainDf)
#     trainDf = e.transform(trainDf)
#     testDf = e.transform(testDf)
    
    # assemble all features into single SparseVector column
#     cols = [c for c in trainDf.columns if 'n' in c or 'OHE' in c]
#     v = VectorAssembler(inputCols=cols, outputCol="features")
#     stages += [v]
#     trainDf = v.transform(trainDf)
#     testDf = v.transform(testDf)
    
    pipe = Pipeline(stages=stages)
    model = pipe.fit(trainDf)
    trainDf = model.transform(trainDf)
    testDf = model.transform(testDf)
    
    return trainDf.cache(), testDf.cache()

In [54]:
trainSample.head()

Row(label=0.0, n0=0.0, n1=0.0002068680182043856, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='8852', c1='8017', c2='4310', c3='8313', c4='4343', c5='2821', c6='3090', c7='84', c8='8418', c9='8539', c10='2816', c11='6109', c12='1890', c13='6422', c14='4146', c15='4642', c16='2577', c17='8733', c18='None', c19='None', c20='5227', c21='9419', c22='5356', c23='1146', c24='None', c25='None')

In [None]:
trainer, tester = createFeatureVector2(trainSample, testSample)

In [24]:
trainer.columns[-1], tester.columns[-1]

('features', 'features')

In [48]:
trainer.head()

Row(label=0.0, n0=0.0, n1=0.0002068680182043856, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='8852', c1='8017', c2='4310', c3='8313', c4='4343', c5='2821', c6='3090', c7='84', c8='8418', c9='8539', c10='2816', c11='6109', c12='1890', c13='6422', c14='4146', c15='4642', c16='2577', c17='8733', c18='None', c19='None', c20='5227', c21='9419', c22='5356', c23='1146', c24='None', c25='None', c0_idx=1.0, c1_idx=0.0, c2_idx=2385.0, c3_idx=1150.0, c4_idx=6.0, c5_idx=2.0, c6_idx=68.0, c7_idx=0.0, c8_idx=1.0, c9_idx=0.0, c10_idx=97.0, c11_idx=231.0, c12_idx=12.0, c13_idx=0.0, c14_idx=969.0, c15_idx=1788.0, c16_idx=8.0, c17_idx=811.0, c18_idx=0.0, c19_idx=0.0, c20_idx=2235.0, c21_idx=1.0, c22_idx=3.0, c23_idx=361.0, c24_idx=0.0, c25_idx=0.0)

In [49]:
tester.head()

Row(label=0.0, n0=0.0, n1=0.0002068680182043856, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='5684', c1='8017', c2='9656', c3='9545', c4='9704', c5='8079', c6='3090', c7='84', c8='8418', c9='8539', c10='2816', c11='1488', c12='1890', c13='2527', c14='5555', c15='7942', c16='2577', c17='7231', c18='None', c19='None', c20='1497', c21='None', c22='3739', c23='1592', c24='None', c25='None', c0_idx=0.0, c1_idx=0.0, c2_idx=140.0, c3_idx=7.0, c4_idx=0.0, c5_idx=0.0, c6_idx=68.0, c7_idx=0.0, c8_idx=1.0, c9_idx=0.0, c10_idx=97.0, c11_idx=120.0, c12_idx=12.0, c13_idx=1.0, c14_idx=149.0, c15_idx=7.0, c16_idx=8.0, c17_idx=43.0, c18_idx=0.0, c19_idx=0.0, c20_idx=7.0, c21_idx=0.0, c22_idx=1.0, c23_idx=1.0, c24_idx=0.0, c25_idx=0.0)

In [86]:
trainSample.write.format('parquet').save('data/trainSample.parquet')