# Final Project - Pre-Processing

In [1]:
import re
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import types, Row, Column
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
from pyspark.sql import SparkSession

app_name = "finalProject"
master = "local[*]"

# Parenthesised chain instead of backslash continuations.
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# read in raw text set and write to parquet
# train = spark.read.option('header', 'false').csv('data/train.txt', sep='\t')
# train.write.format('parquet').save('data/train.parquet')

In [3]:
# load the training data from its parquet copy
train = spark.read.format('parquet').load('data/train.parquet')

In [4]:
# Give every column a readable name in ONE projection, assigned by position:
#   column 0       -> 'label'
#   columns 1..13  -> 'n0'..'n12'  (numeric features)
#   columns 14..   -> 'c0', 'c1', ... (categorical features)
# The previous version chained ~79 withColumnRenamed calls (including an
# underscore-stripping pass whose results were immediately renamed again);
# toDF renames all columns at once and yields the same final names.
# toDF raises if the number of names does not match the number of columns,
# so an unexpected input layout fails loudly here.
n_numeric = 13
n_categorical = len(train.columns) - 1 - n_numeric
new_names = (
    ['label']
    + ['n' + str(i) for i in range(n_numeric)]
    + ['c' + str(i) for i in range(n_categorical)]
)
train = train.toDF(*new_names)

train.columns

['label',
 'n0',
 'n1',
 'n2',
 'n3',
 'n4',
 'n5',
 'n6',
 'n7',
 'n8',
 'n9',
 'n10',
 'n11',
 'n12',
 'c0',
 'c1',
 'c2',
 'c3',
 'c4',
 'c5',
 'c6',
 'c7',
 'c8',
 'c9',
 'c10',
 'c11',
 'c12',
 'c13',
 'c14',
 'c15',
 'c16',
 'c17',
 'c18',
 'c19',
 'c20',
 'c21',
 'c22',
 'c23',
 'c24',
 'c25']

In [5]:
# cast the first 14 columns (the label plus the 13 numeric features) to float;
# every other column is passed through unchanged, in its original position
train = train.select([
    train[name].cast('float').alias(name) if position < 14 else train[name]
    for position, name in enumerate(train.columns)
])
train.printSchema()

root
 |-- label: float (nullable = true)
 |-- n0: float (nullable = true)
 |-- n1: float (nullable = true)
 |-- n2: float (nullable = true)
 |-- n3: float (nullable = true)
 |-- n4: float (nullable = true)
 |-- n5: float (nullable = true)
 |-- n6: float (nullable = true)
 |-- n7: float (nullable = true)
 |-- n8: float (nullable = true)
 |-- n9: float (nullable = true)
 |-- n10: float (nullable = true)
 |-- n11: float (nullable = true)
 |-- n12: float (nullable = true)
 |-- c0: string (nullable = true)
 |-- c1: string (nullable = true)
 |-- c2: string (nullable = true)
 |-- c3: string (nullable = true)
 |-- c4: string (nullable = true)
 |-- c5: string (nullable = true)
 |-- c6: string (nullable = true)
 |-- c7: string (nullable = true)
 |-- c8: string (nullable = true)
 |-- c9: string (nullable = true)
 |-- c10: string (nullable = true)
 |-- c11: string (nullable = true)
 |-- c12: string (nullable = true)
 |-- c13: string (nullable = true)
 |-- c14: string (nullable = true)
 |-- c15: st

In [6]:
# Grab a sample (fraction 0.001, without replacement) for fast iteration.
# A fixed seed makes the sample -- and therefore the seeded train/test split
# built on top of it -- reproducible across kernel restarts. 666 is the same
# seed the split cell uses. The row count will differ from runs made with the
# previous, unseeded call.
s = train.sample(False, 0.001, 666)
s.count()

45601

In [7]:
# show the first sampled row to sanity-check the column names and casts
s.head()

Row(label=1.0, n0=None, n1=20.0, n2=70.0, n3=26.0, n4=4.0, n5=None, n6=0.0, n7=25.0, n8=26.0, n9=None, n10=0.0, n11=None, n12=26.0, c0='68fd1e64', c1='b80912da', c2='45d73e26', c3='4e2622db', c4='4cf72387', c5='7e0ccccf', c6='635cdf1c', c7='5b392875', c8='a73ee510', c9='3b08e48b', c10='099f88a2', c11='59df9ec7', c12='5bc02dfd', c13='07d13a8f', c14='569913cf', c15='6cd2843a', c16='776ce399', c17='7119e567', c18='19050a44', c19='a458ea53', c20='a6d51873', c21=None, c22='3a171ecb', c23='0dfdc7d2', c24='e8b83407', c25='def102a7')

# Train/Test Split

In [8]:
# Split the sample with weights [1.0, 9.0] (seed 666) and mark both parts for caching.
# NOTE(review): with these weights the FIRST part (trainSample) receives ~10% of
# the rows and testSample ~90%. If a 90/10 train/test split was intended, the
# weights are inverted -- confirm before training on this split.
trainSample, testSample = [
    part.cache() for part in s.randomSplit([1.0, 9.0], seed=666)
]

In [9]:
# row counts of the two splits (train, test)
trainSample.count(), testSample.count()

(4652, 40949)

# Normalize numerical data

In [10]:
# %%time
# stats = trainSample[trainSample.columns[1:14]].describe()
# maxes = np.array(stats[stats['summary'] == 'max'].collect())[0][1:]
# mins = np.array(stats[stats['summary'] == 'min'].collect())[0][1:]
# maxes = [float(m) for m in maxes]
# mins = [float(m) for m in mins]

CPU times: user 30 ms, sys: 30 ms, total: 60 ms
Wall time: 4.85 s


In [59]:
# # normalize columns
# for i,c in enumerate(trainSample.columns[1:14]):
#     trainSample = trainSample.withColumn(c, (trainSample[c] - mins[i]) / (maxes[i] - mins[i]))

In [60]:
# trainSample.select(trainSample.columns[1:14]).head()

Row(n0=None, n1=0.0, n2=0.00047080979284369113, n3=0.012658227848101266, n4=0.0045070462648655095, n5=0.01849537615596101, n6=0.0014814814814814814, n7=0.03768844221105527, n8=0.014874371859296482, n9=None, n10=0.022727272727272728, n11=None, n12=0.005154639175257732)

In [61]:
# # need to decide on proper NA handling later on, for now filling with 0
# trainSample = trainSample.na.fill(0, subset=trainSample.columns[1:14])

In [62]:
# trainSample.select(trainSample.columns[1:14]).head()

Row(n0=0.0, n1=0.0, n2=0.00047080979284369113, n3=0.012658227848101266, n4=0.0045070462648655095, n5=0.01849537615596101, n6=0.0014814814814814814, n7=0.03768844221105527, n8=0.014874371859296482, n9=0.0, n10=0.022727272727272728, n11=0.0, n12=0.005154639175257732)

In [10]:
def normalizeNumeric(trainDf, testDf, numericCols=None):
    """Min-max scale numeric columns of both frames using TRAIN statistics only.

    Each selected column is replaced by ``(x - min) / (max - min)``, where
    ``min`` and ``max`` are computed on ``trainDf``. The same constants are
    applied to ``testDf`` so that no test information leaks into the scaling
    (test values may therefore fall outside [0, 1]). Remaining nulls in the
    selected columns are then filled with 0.

    Args:
        trainDf: DataFrame the statistics are computed from.
        testDf: DataFrame scaled with the training statistics.
        numericCols: names of the columns to scale. Defaults to
            ``trainDf.columns[1:14]`` (the notebook layout: label first,
            then 13 numeric features).

    Returns:
        ``(trainDf, testDf)``: the transformed frames, each with ``.cache()`` applied.
    """
    if numericCols is None:
        numericCols = trainDf.columns[1:14]
    numericCols = list(numericCols)
    if not numericCols:
        # nothing to scale; keep the return contract (cached frames)
        return trainDf.cache(), testDf.cache()

    # One pass / one collect for both statistics (the previous version ran
    # describe() -- count, mean, stddev, min, max -- and collected it twice).
    summaryRows = trainDf.select(*numericCols).summary('min', 'max').collect()
    stats = {row['summary']: row for row in summaryRows}

    for c in numericCols:
        lo, hi = stats['min'][c], stats['max'][c]
        if lo is None or hi is None:
            # Column has no non-null training values: there is nothing to
            # scale against, so treat it as constant (previously float(None)
            # raised a TypeError here).
            lo = hi = 0.0
        else:
            # summary() reports statistics as strings
            lo, hi = float(lo), float(hi)
        span = hi - lo

        if span == 0:
            # Constant column in train: map every non-null value to 0.0
            # explicitly instead of relying on division by zero yielding null.
            trainDf = trainDf.withColumn(c, (trainDf[c] - lo) * 0.0)
            testDf = testDf.withColumn(c, (testDf[c] - lo) * 0.0)
        else:
            trainDf = trainDf.withColumn(c, (trainDf[c] - lo) / span)
            testDf = testDf.withColumn(c, (testDf[c] - lo) / span)

    # TODO: decide on proper missing-value handling. For now nulls become 0,
    # which is indistinguishable from "equal to the training minimum".
    trainDf = trainDf.na.fill(0, subset=numericCols)
    testDf = testDf.na.fill(0, subset=numericCols)

    return trainDf.cache(), testDf.cache()

In [11]:
# scale the numeric columns of both splits
# NOTE: this rebinds trainSample/testSample, so re-running the cell operates on
# already-transformed data; re-run from the split cell to start clean
trainSample, testSample = normalizeNumeric(trainSample, testSample)

In [12]:
# first row of the training split after numeric normalization
trainSample.head()

Row(label=0.0, n0=0.0, n1=0.00016105653084232566, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='5bfa8ab5', c1='e6203a55', c2='a406a722', c3='f0249dbe', c4='25c83c98', c5='6f6d9be8', c6='3d4f5cb7', c7='0b153874', c8='7cc72ec2', c9='3b08e48b', c10='53be0d4b', c11='3e76d3fa', c12='8803181f', c13='07d13a8f', c14='c7cb28b6', c15='d089fb6d', c16='2005abd1', c17='2e39068c', c18=None, c19=None, c20='4095765f', c21=None, c22='32c7478e', c23='5414e525', c24=None, c25=None)

In [13]:
# first row of the test split after numeric normalization
testSample.head()

Row(label=0.0, n0=0.0, n1=0.00016105653084232566, n2=0.0, n3=0.0, n4=0.0, n5=0.0, n6=0.0, n7=0.0, n8=0.0, n9=0.0, n10=0.0, n11=0.0, n12=0.0, c0='68fd1e64', c1='61e10608', c2='fae1bc10', c3='28d926b8', c4='25c83c98', c5='fe6b92e5', c6='b5b7ad43', c7='0b153874', c8='7cc72ec2', c9='3b08e48b', c10='364bff6f', c11='9576d76f', c12='31d2ac00', c13='07d13a8f', c14='b431bed8', c15='fc53f85c', c16='2005abd1', c17='a6af02f7', c18=None, c19=None, c20='ed35ed93', c21=None, c22='be7c41b4', c23='4fcc135f', c24=None, c25=None)

# Categorical manipulation

In [63]:
# counts = trainSample.select(trainSample.columns[14:]).summary('count')

In [64]:
# c = trainSample.groupBy('c0').count()
# c.collect()[0]
# # type(c)
# # c = c.orderBy(c.count.desc()).collect()

In [65]:
# def findInfrequentValues(c, n=10):
#     # c is the column that we are operating on
#     # 
#     counts = trainSample.groupBy(c).count()
#     infrequentValues = counts.filter(counts['count'] <= n)
#     s = infrequentValues.agg(F.collect_set(c)).collect()[0][0]
#     return s

In [66]:
# names = replaceInfrequentValues('c0')
# type(names)

In [67]:
# df = trainSample.withColumn('c0', F.when(trainSample['c0'] == names[0], '999').otherwise(trainSample['c0']))

In [68]:
# df.select(df['c0'] == names[0]).collect()

In [69]:
# names[0]

In [70]:
# int('0x' + names[0], 16)

In [15]:
# # convert hex values to integers and modulo them
# def hashValues(row):
#     if row != None:
#         return str(int('0x' + row, 16) % 10000)
#     else:
#         return str(row)

In [16]:
# udf_object = F.udf(hashValues)

In [None]:
# trainSample.select('c0').collect()[0]

In [None]:
# trainSample.withColumn('c0', udf_object(trainSample['c0'])).select('c0').collect()[0]

In [17]:
# for c in trainSample.columns[14:]:
#     trainSample = trainSample.withColumn(c, udf_object(trainSample[c]))
#     testSample = testSample.withColumn(c, udf_object(testSample[c]))

    
# # trainSample = trainSample.cache()

In [None]:
# trainSample.head()

In [None]:
# testSample.head()

In [None]:
# %%time
# for c in trainSample.columns[14:]:
#     newCol = c + '_idx'
#     indexer = StringIndexer(inputCol=c, outputCol=newCol, handleInvalid='keep')
# #     trainSample = indexer.fit(trainSample).transform(trainSample)
#     sIdx = indexer.fit(trainSample)
#     trainSample = sIdx.transform(trainSample)
#     testSample = sIdx.transform(testSample)
# trainSample = trainSample.cache()
# testSample = testSample.cache()

In [None]:
# trainSample.select(trainSample.columns[40:]).show(2)

In [None]:
# testSample.select(testSample.columns[40:]).show(2)

In [26]:
# cols = trainSample.columns[40:]
# outputCols = [c.strip('_idx') + '_OHE' for c in cols]
# encoder = OneHotEncoderEstimator(inputCols=cols, outputCols=outputCols)
# OHE = encoder.fit(trainSample)
# trainSample = OHE.transform(trainSample)
# testSample = OHE.transform(testSample)

In [None]:
# trainSample.select(trainSample.columns[-26:]).show(2)

In [None]:
# testSample.select(testSample.columns[-26:]).show(2)

In [29]:
# cols = [c for c in trainSample.columns if 'n' in c or 'OHE' in c]

In [30]:
# v = VectorAssembler(inputCols=cols, outputCol="features")
# trainSample = v.transform(trainSample)
# testSample = v.transform(testSample)

In [None]:
# trainSample.columns[-1], testSample.columns[-1]

In [None]:
# trainSample.select('features').head()

In [None]:
# testSample.select('features').head()

In [14]:
def createFeatureVector(trainDf, testDf, numericCols=None, categoricalCols=None,
                        numBuckets=10000):
    """Hash, index and one-hot encode categorical columns, then assemble features.

    Steps (every fitted stage is fit on ``trainDf`` only and then applied to both
    frames):
      1. Each categorical value (a hex string) is replaced, in place, by
         ``str(int(value, 16) % numBuckets)``; nulls become the string ``'None'``.
      2. A ``StringIndexer`` (``handleInvalid='keep'``) adds ``<col>_idx``.
      3. A ``OneHotEncoderEstimator`` adds ``<col>_OHE``.
      4. A ``VectorAssembler`` combines the numeric columns followed by the
         ``_OHE`` columns into a single ``features`` column.

    Args:
        trainDf: DataFrame used to fit the indexers and the encoder.
        testDf: DataFrame transformed with the stages fitted on ``trainDf``.
        numericCols: numeric feature column names. Defaults to
            ``trainDf.columns[1:14]`` (notebook layout).
        categoricalCols: categorical (hex-string) column names. Defaults to
            ``trainDf.columns[14:]`` (notebook layout).
        numBuckets: modulus applied to the integer value of each hex string.

    Returns:
        ``(trainDf, testDf)`` with the ``_idx``, ``_OHE`` and ``features``
        columns added, each with ``.cache()`` applied.

    Note:
        A non-null categorical value that is not valid hexadecimal makes
        ``int()`` raise inside the UDF, which surfaces when the data is
        evaluated.
    """
    # Resolve the column lists once, up front. The previous version relied on
    # positional offsets after columns had been appended (columns[40:]), on
    # str.strip('_idx') -- which strips a *character set*, not a suffix -- and on
    # a substring test ('n' in c) to find numeric columns.
    if numericCols is None:
        numericCols = trainDf.columns[1:14]
    if categoricalCols is None:
        categoricalCols = trainDf.columns[14:]
    numericCols = list(numericCols)
    categoricalCols = list(categoricalCols)

    # create hash function for binning categorical variables
    def hashValues(value):
        if value is not None:
            # integer value of the hex label, reduced modulo numBuckets
            return str(int('0x' + value, 16) % numBuckets)
        # nulls are kept as their own category, the literal string 'None'
        return str(value)

    # create the udf object from the helper function (returns strings)
    udf_object = F.udf(hashValues)

    # hash all hex strings in both train and test
    for c in categoricalCols:
        trainDf = trainDf.withColumn(c, udf_object(trainDf[c]))
        testDf = testDf.withColumn(c, udf_object(testDf[c]))

    # index the hash values into categories; labels are learned from train only
    idxCols = [c + '_idx' for c in categoricalCols]
    for c, idxCol in zip(categoricalCols, idxCols):
        indexer = StringIndexer(inputCol=c, outputCol=idxCol, handleInvalid='keep')
        fitted = indexer.fit(trainDf)
        trainDf = fitted.transform(trainDf)
        testDf = fitted.transform(testDf)

    # one-hot encode the categorical indices (skipped when there are none)
    oheCols = [c + '_OHE' for c in categoricalCols]
    if categoricalCols:
        encoder = OneHotEncoderEstimator(inputCols=idxCols, outputCols=oheCols)
        encoderModel = encoder.fit(trainDf)
        trainDf = encoderModel.transform(trainDf)
        testDf = encoderModel.transform(testDf)

    # assemble all features into a single vector column: numeric first, then OHE
    assembler = VectorAssembler(inputCols=numericCols + oheCols, outputCol="features")
    trainDf = assembler.transform(trainDf)
    testDf = assembler.transform(testDf)

    return trainDf.cache(), testDf.cache()

In [15]:
# build the 'features' vector column on both splits
# NOTE: this rebinds trainSample/testSample and adds columns to them, so the cell
# is not idempotent; re-run from the split cell before running it again
trainSample, testSample = createFeatureVector(trainSample, testSample)

In [16]:
# the last column of each split should now be the assembled feature vector
trainSample.columns[-1], testSample.columns[-1]

('features', 'features')

In [17]:
# inspect the assembled feature vector of the first training row
trainSample.select('features').head()

Row(features=SparseVector(23784, {1: 0.0002, 13: 1.0, 405: 1.0, 1948: 1.0, 2246: 1.0, 3931: 1.0, 4658: 1.0, 7032: 1.0, 7041: 1.0, 7111: 1.0, 8308: 1.0, 9412: 1.0, 10938: 1.0, 11740: 1.0, 12113: 1.0, 14356: 1.0, 14358: 1.0, 14372: 1.0, 14539: 1.0, 15176: 1.0, 17548: 1.0, 18032: 1.0, 18383: 1.0, 19675: 1.0, 19910: 1.0, 21666: 1.0, 22429: 1.0}))

In [18]:
# inspect the assembled feature vector of the first test row
testSample.select('features').head()

Row(features=SparseVector(23784, {1: 0.0002, 13: 1.0, 910: 1.0, 1948: 1.0, 2474: 1.0, 3826: 1.0, 4658: 1.0, 4695: 1.0, 7041: 1.0, 7381: 1.0, 8308: 1.0, 9409: 1.0, 9451: 1.0, 11740: 1.0, 12159: 1.0, 14356: 1.0, 14363: 1.0, 14372: 1.0, 14557: 1.0, 15278: 1.0, 17652: 1.0, 18032: 1.0, 18379: 1.0, 18514: 1.0, 19910: 1.0, 21666: 1.0, 21764: 1.0}))

In [86]:
# Persist the featurized training sample.
# mode('overwrite') replaces any previous export: with the default save mode the
# write fails when the path already exists, which broke re-running the notebook.
trainSample.write.format('parquet').mode('overwrite').save('data/trainSample.parquet')