# Final Project

In [1]:
import re
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import types, Row, Column
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer

In [2]:
# Create the Spark session (or reuse one that already exists in this kernel)
from pyspark.sql import SparkSession

app_name = "finalProject"
master = "local[*]"  # run Spark locally inside this process

# Parenthesised builder chain instead of backslash line continuations.
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# One-time conversion: read the raw tab-separated text file (no header row) and
# write it out as parquet. Presumably left commented out so the conversion is not
# repeated on every run; the following cell reads 'data/train.parquet'.
# train = spark.read.option('header', 'false').csv('data/train.txt', sep='\t')
# train.write.format('parquet').save('data/train.parquet')

In [3]:
# read in parquet
train = spark.read.parquet('data/train.parquet')

In [4]:
# Column `_c0` is the target; give it a descriptive name.
train = train.withColumnRenamed('_c0', 'label')

# Strip underscores from both ends of every remaining column name (e.g. `_c1` -> `c1`).
for old_name in train.columns[1:]:
    train = train.withColumnRenamed(old_name, old_name.strip('_'))

# Positions 1-13 become n0..n12.
numeric_names = train.columns[1:14]
for position in range(len(numeric_names)):
    train = train.withColumnRenamed(numeric_names[position], 'n{}'.format(position))

# Everything from position 14 onward becomes c0, c1, ...
# (the slice is taken after the numeric renames above, as in the original loop order)
categorical_names = train.columns[14:]
for position in range(len(categorical_names)):
    train = train.withColumnRenamed(categorical_names[position], 'c{}'.format(position))

train.columns

['label',
 'n0',
 'n1',
 'n2',
 'n3',
 'n4',
 'n5',
 'n6',
 'n7',
 'n8',
 'n9',
 'n10',
 'n11',
 'n12',
 'c0',
 'c1',
 'c2',
 'c3',
 'c4',
 'c5',
 'c6',
 'c7',
 'c8',
 'c9',
 'c10',
 'c11',
 'c12',
 'c13',
 'c14',
 'c15',
 'c16',
 'c17',
 'c18',
 'c19',
 'c20',
 'c21',
 'c22',
 'c23',
 'c24',
 'c25']

In [5]:
# Cast the first 14 columns (the label plus n0..n12) to float.
float_columns = train.columns[:14]
for col_name in float_columns:
    as_float = train[col_name].cast('float')
    train = train.withColumn(col_name, as_float)
train.printSchema()

root
 |-- label: float (nullable = true)
 |-- n0: float (nullable = true)
 |-- n1: float (nullable = true)
 |-- n2: float (nullable = true)
 |-- n3: float (nullable = true)
 |-- n4: float (nullable = true)
 |-- n5: float (nullable = true)
 |-- n6: float (nullable = true)
 |-- n7: float (nullable = true)
 |-- n8: float (nullable = true)
 |-- n9: float (nullable = true)
 |-- n10: float (nullable = true)
 |-- n11: float (nullable = true)
 |-- n12: float (nullable = true)
 |-- c0: string (nullable = true)
 |-- c1: string (nullable = true)
 |-- c2: string (nullable = true)
 |-- c3: string (nullable = true)
 |-- c4: string (nullable = true)
 |-- c5: string (nullable = true)
 |-- c6: string (nullable = true)
 |-- c7: string (nullable = true)
 |-- c8: string (nullable = true)
 |-- c9: string (nullable = true)
 |-- c10: string (nullable = true)
 |-- c11: string (nullable = true)
 |-- c12: string (nullable = true)
 |-- c13: string (nullable = true)
 |-- c14: string (nullable = true)
 |-- c15: st

In [9]:
# Grab a sample: fraction 0.001, without replacement.
# A fixed seed makes the sample -- and every statistic derived from it further down --
# reproducible across kernel restarts. The value matches the seed already used for
# the train/test split (666).
s = train.sample(False, 0.001, seed=666)
s.count()

45633

# Train/Test Split

In [18]:
# Split the sample with weights 1:9 (seed 666) and mark both parts for caching.
# NOTE(review): with these weights `trainSample` receives roughly 10% of the rows and
# `testSample` roughly 90% -- confirm that this ratio is intentional.
splits = s.randomSplit([1.0, 9.0], 666)
trainSample, testSample = [part.cache() for part in splits]

In [11]:
# Row counts of the train and test splits.
# (The earlier open question "maybe explore caching?" is addressed by the split cell,
# which calls .cache() on both DataFrames.)
trainSample.count(), testSample.count()

(4677, 40956)

# Normalize numerical data

In [21]:
%%time
# Per-column maximum and minimum of the 13 numeric features (columns 1-13),
# computed with one aggregate per column and statistic.
maxes = []
mins = []
for c in trainSample.columns[1:14]:
    # collect() returns a list holding one single-field Row; [0][0] extracts the scalar
    # so the lists hold plain numbers instead of Row objects.
    maxes.append(trainSample.agg({c: 'max'}).collect()[0][0])
    # Fixed: this previously aggregated with 'max', so `mins` was a copy of `maxes`.
    mins.append(trainSample.agg({c: 'min'}).collect()[0][0])
len(maxes), len(mins)

CPU times: user 240 ms, sys: 120 ms, total: 360 ms
Wall time: 7.76 s


In [33]:
%%time
# Min/max of the numeric features, read from a single describe() summary table.
numeric_cols = trainSample.columns[1:14]
stats = trainSample.select(numeric_cols).describe()

# Each summary row is (summary_label, value_1, ..., value_n): skip the label in
# position 0 and convert the remaining entries with float().
max_row = stats.filter(stats['summary'] == 'max').collect()[0]
min_row = stats.filter(stats['summary'] == 'min').collect()[0]
maxes = [float(value) for value in max_row[1:]]
mins = [float(value) for value in min_row[1:]]

CPU times: user 30 ms, sys: 20 ms, total: 50 ms
Wall time: 2.06 s


In [34]:
maxes

[206.0,
 6027.0,
 4312.0,
 88.0,
 1960323.0,
 10001.0,
 3359.0,
 90.0,
 3869.0,
 5.0,
 91.0,
 97.0,
 201.0]

In [37]:
mins

[0.0, -2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [35]:
# Min-max scale each numeric column: (x - min) / (max - min), using the
# `mins` / `maxes` lists computed from trainSample.
# NOTE: trainSample is overwritten in place, so running this cell a second time
# rescales values that are already scaled.
numeric_cols = trainSample.columns[1:14]
for idx in range(len(numeric_cols)):
    name = numeric_cols[idx]
    lo = mins[idx]
    span = maxes[idx] - lo
    trainSample = trainSample.withColumn(name, (trainSample[name] - lo) / span)

In [36]:
# Display the scaled numeric columns (1-13) in vertical layout, one field per line.
trainSample.select(trainSample.columns[1:14]).show(vertical=True)

-RECORD 0-------------------
 n0  | null                 
 n1  | 1.658649859014762E-4 
 n2  | null                 
 n3  | null                 
 n4  | null                 
 n5  | null                 
 n6  | null                 
 n7  | null                 
 n8  | null                 
 n9  | null                 
 n10 | null                 
 n11 | null                 
 n12 | null                 
-RECORD 1-------------------
 n0  | null                 
 n1  | 1.658649859014762E-4 
 n2  | null                 
 n3  | null                 
 n4  | 0.001084005033864... 
 n5  | 0.0                  
 n6  | 0.002679368859779... 
 n7  | 0.0                  
 n8  | 0.003101576634789... 
 n9  | null                 
 n10 | 0.04395604395604396  
 n11 | null                 
 n12 | null                 
-RECORD 2-------------------
 n0  | null                 
 n1  | 1.658649859014762E-4 
 n2  | null                 
 n3  | null                 
 n4  | 0.005331774406564633 
 n5  | 0.0    

In [39]:
# Peek at the first row, numeric and categorical fields together.
trainSample.head(1)

[Row(label=0.0, n0=None, n1=0.0001658649859014762, n2=None, n3=None, n4=None, n5=None, n6=None, n7=None, n8=None, n9=None, n10=None, n11=None, n12=None, c0='05db9164', c1='38a947a1', c2='374ffad0', c3='f40b64ab', c4='25c83c98', c5='6f6d9be8', c6='3d4f5cb7', c7='0b153874', c8='7cc72ec2', c9='3b08e48b', c10='53be0d4b', c11='a38368de', c12='8803181f', c13='07d13a8f', c14='851734f3', c15='b31a25af', c16='2005abd1', c17='67501981', c18=None, c19=None, c20='810cb11b', c21='8ec974f4', c22='423fab69', c23='772eb071', c24=None, c25=None)]

In [None]:
# need to decide on NA handling later on
# (`?name` is IPython help syntax; this line only displays help for
# trainSample.na.fill and changes no data)
?trainSample.na.fill

# Categorical manipulation

In [41]:
# 'count' summary for every column from index 14 onward. The result is only assigned
# here, not displayed.
counts = trainSample.select(trainSample.columns[14:]).summary('count')

In [57]:
# Number of rows per distinct value of c0; collect everything and look at the first
# (value, count) row that comes back.
c = trainSample.groupBy('c0').count()
c.collect()[0]
# NOTE: in the commented-out attempt below, `c.count` resolves to the DataFrame.count
# method rather than the 'count' column; the column is reached with c['count'].
# type(c)
# c = c.orderBy(c.count.desc()).collect()

Row(c0='7ceef477', count=1)

In [80]:
def findInfrequentValues(c, n=10):
    """Return the values of column `c` that occur at most `n` times in `trainSample`.

    Reads the notebook-level `trainSample` DataFrame.

    Args:
        c: name of the column to inspect.
        n: inclusive occurrence threshold (default 10).

    Returns:
        The result of the `collect_set` aggregate over the infrequent values,
        i.e. the first field of the single aggregated row.
    """
    per_value = trainSample.groupBy(c).count()
    rare = per_value.filter(per_value['count'] <= n)
    # The aggregate yields one row with one field: the set of rare values.
    return rare.agg(F.collect_set(c)).collect()[0][0]

In [88]:
# Infrequent values of c0, using the helper's default threshold (n=10).
# Fixed: this cell called `replaceInfrequentValues`, which is not defined in this
# notebook and raises NameError on a fresh kernel; the helper defined above is
# `findInfrequentValues`.
names = findInfrequentValues('c0')
type(names)

list

In [89]:
# Replace occurrences of the first infrequent value in c0 with the placeholder '999';
# every other value is kept as is.
# NOTE(review): only names[0] is handled here, not the whole `names` list -- confirm
# whether this is just a single-value experiment.
matches_first = trainSample['c0'] == names[0]
df = trainSample.withColumn('c0', F.when(matches_first, '999').otherwise(trainSample['c0']))

In [95]:
# Check the replacement: compare every row's c0 with names[0] and collect all of the
# boolean results to the driver.
df.select(df['c0'] == names[0]).collect()

[Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb90361)=False),
 Row((c0 = 1eb

In [96]:
# First entry of the infrequent-value list for c0.
names[0]

'1eb90361'

In [103]:
# Read the value as a base-16 integer (int(..., 16) accepts the '0x' prefix).
int('0x' + names[0], 16)

515441505

In [114]:
def hashValues(row):
    """Convert a hexadecimal string to its integer value.

    Despite the parameter name, `row` is a single column value (the function is
    wrapped as a UDF and applied to one column at a time), not a whole row.

    Args:
        row: hexadecimal string such as '1eb90361', or None for a missing value.

    Returns:
        The integer obtained by reading the string as base 16, or None when
        `row` is None.

    Raises:
        ValueError: if `row` is a string that is not valid hexadecimal.
    """
    # The categorical columns contain missing values; '0x' + None would raise
    # TypeError, so missing values are passed through unchanged.
    if row is None:
        return None
    return int('0x' + row, 16)

In [115]:
# Wrap hashValues as a Spark UDF.
# NOTE(review): no returnType is passed, so the UDF falls back to PySpark's default
# (string) -- the collected new_c0 values shown in this notebook are strings.
# Confirm that a string column is what is wanted here.
udf_object = F.udf(hashValues)

In [116]:
# Try the UDF on c0 only: the result goes into a new column `new_c0` of the scratch
# DataFrame `df`; trainSample itself is not modified by this cell.
df = trainSample.withColumn("new_c0", udf_object(trainSample['c0']))

In [119]:
# Add a hashed companion column `new_<name>` for every column from index 14 onward.
# Columns that already carry the `new_` prefix are skipped: without this, re-running
# the cell would also pick up the previously generated columns (they sit after
# index 14 too) and create `new_new_*` columns from them.
categorical_cols = [name for name in trainSample.columns[14:] if not name.startswith('new_')]
for c in categorical_cols:
    newCol = 'new_' + c
    trainSample = trainSample.withColumn(newCol, udf_object(trainSample[c]))

In [120]:
# Inspect the hashed c0 column; collect() brings every row of it to the driver.
trainSample.select('new_c0').collect()

[Row(new_c0='98275684'),
 Row(new_c0='1761418852'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='1520359856'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='1761418852'),
 Row(new_c0='98275684'),
 Row(new_c0='2119970804'),
 Row(new_c0='1692891879'),
 Row(new_c0='1761418852'),
 Row(new_c0='98275684'),
 Row(new_c0='1520359856'),
 Row(new_c0='1520359856'),
 Row(new_c0='679372879'),
 Row(new_c0='1761418852'),
 Row(new_c0='967779847'),
 Row(new_c0='1520359856'),
 Row(new_c0='2364568165'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='2270503831'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='1617090627'),
 Row(new_c0='3568340312'),
 Row(new_c0='3193477969'),
 Row(new_c0='1543146165'),
 Row(new_c0='1520359856'),
 Row(new_c0='2270503831'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0='98275684'),
 Row(new_c0=