# setup

In [1]:
import re
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import types

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "finalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

# check file size

In [3]:
!wc -l data/train.txt

45840617 data/train.txt


In [4]:
!wc -l data/test.txt

6042135 data/test.txt


# experimenting and some EDA with DFs

In [5]:
train = spark.read.option('header', 'false').csv('data/train.txt', sep='\t')

In [6]:
train.write.format('parquet').save('data/train.parquet')
train_parquet = spark.read.parquet('data/train.parquet')

In [9]:
!du data/train.txt
!du data/train.parquet

10885924	data/train.txt
2895045	data/train.parquet


In [7]:
%%time
train.count()

CPU times: user 20 ms, sys: 60 ms, total: 80 ms
Wall time: 2min 47s


45840617

In [8]:
%%time
train_parquet.count()

CPU times: user 0 ns, sys: 10 ms, total: 10 ms
Wall time: 2.85 s


45840617

In [15]:
test = spark.read.option('header','false').csv('data/test.txt', sep='\t')
test.write.format('parquet').save('data/test.parquet')
del test
# test_parquet = spark.read.parquet('data/test.parquet')

In [12]:
# %%time
# test_parquet.count()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 327 ms


6042135

In [16]:
train = spark.read.parquet('data/train.parquet')
s = train.sample(False, 0.05)

In [11]:
# test = spark.read.parquet('data/test.parquet')

In [17]:
s.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)
 |-- _c23: string (nullable = true)
 |-- _c24: string (nullable = true)
 |-- _c25: string (nullable = true)
 |-- _c26: string (nullable = true)
 |-- _c27: string (nullable = tru

In [4]:
# df = spark.read.option('header', 'false').csv('data/toy_train.txt', sep='\t')
# df.head()

In [5]:
# %%time
# df.count()

In [6]:
# df.select(df.columns[2]).show(5)

In [7]:
# # df.write.format('parquet').save('data/toy_train.parquet')
# df = spark.read.parquet('data/toy_train.parquet')
# df.printSchema()

In [9]:
# for c in df.columns:
#     df = df.withColumnRenamed(c, c.strip('_'))
# df.columns[:5]

In [8]:
# for c in df.columns[:14]:
#     df = df.withColumn(c, df[c].cast('float'))
# df.printSchema()

In [21]:
for c in train.columns:
    train = train.withColumnRenamed(c, c.strip('_'))
train.columns[:5]

# for c in s.columns:
#     s = s.withColumnRenamed(c, c.strip('_'))
# s.columns[:5]

['c0', 'c1', 'c2', 'c3', 'c4']

In [22]:
%%time
for c in train.columns[:14]:
    train = train.withColumn(c, train[c].cast('float'))
train.printSchema()

# for c in s.columns[:14]:
#     s = s.withColumn(c, s[c].cast('float'))
# s.printSchema()

root
 |-- c0: float (nullable = true)
 |-- c1: float (nullable = true)
 |-- c2: float (nullable = true)
 |-- c3: float (nullable = true)
 |-- c4: float (nullable = true)
 |-- c5: float (nullable = true)
 |-- c6: float (nullable = true)
 |-- c7: float (nullable = true)
 |-- c8: float (nullable = true)
 |-- c9: float (nullable = true)
 |-- c10: float (nullable = true)
 |-- c11: float (nullable = true)
 |-- c12: float (nullable = true)
 |-- c13: float (nullable = true)
 |-- c14: string (nullable = true)
 |-- c15: string (nullable = true)
 |-- c16: string (nullable = true)
 |-- c17: string (nullable = true)
 |-- c18: string (nullable = true)
 |-- c19: string (nullable = true)
 |-- c20: string (nullable = true)
 |-- c21: string (nullable = true)
 |-- c22: string (nullable = true)
 |-- c23: string (nullable = true)
 |-- c24: string (nullable = true)
 |-- c25: string (nullable = true)
 |-- c26: string (nullable = true)
 |-- c27: string (nullable = true)
 |-- c28: string (nullable = true)
 |--

In [20]:
%%time
# stats = train[train.columns[1:14]].describe()
# means = np.array(stats[stats['summary'] == 'mean'].collect())[0][1:]
# stdevs = np.array(stats[stats['summary'] == 'stddev'].collect())[0][1:]

stats = s[s.columns[1:14]].describe()
means = np.array(stats[stats['summary'] == 'mean'].collect())[0][1:]
stdevs = np.array(stats[stats['summary'] == 'stddev'].collect())[0][1:]

CPU times: user 40 ms, sys: 10 ms, total: 50 ms
Wall time: 20.9 s


In [23]:
stats = train[train.columns[1:14]].describe()
stats.show(vertical=True)

-RECORD 0---------------------
 summary | count              
 c1      | 25047061           
 c2      | 45840617           
 c3      | 36001170           
 c4      | 35903248           
 c5      | 44657500           
 c6      | 35588289           
 c7      | 43857751           
 c8      | 45817844           
 c9      | 43857751           
 c10     | 25047061           
 c11     | 43857751           
 c12     | 10768965           
 c13     | 35903248           
-RECORD 1---------------------
 summary | mean               
 c1      | 3.5024133170754044 
 c2      | 105.84841979766546 
 c3      | 26.913041020611274 
 c4      | 7.322680248873305  
 c5      | 18538.991664871523 
 c6      | 116.06185085211598 
 c7      | 16.333130032135028 
 c8      | 12.517042137556713 
 c9      | 106.1098234380509  
 c10     | 0.6175294977722137 
 c11     | 2.7328343170173044 
 c12     | 0.9910356287721244 
 c13     | 8.217461161174054  
-RECORD 2---------------------
 summary | stddev             
 c1     

In [24]:
%%time
counts = []
for c in train.columns[14:]:
    count = train.agg(F.countDistinct(c).alias('c')).collect()[0]['c']
    counts.append(count)
counts = np.array(counts)
# number of unique hash  values
print(counts)

# counts = []
# for c in s.columns[14:]:
#     count = s.agg(F.countDistinct(c).alias('c')).collect()[0]['c']
#     counts.append(count)
# counts = np.array(counts)
# # number of unique hash  values
# print(counts)

[    1460      583 10131226  2202607      305       23    12517      633
        3    93145     5683  8351592     3194       27    14992  5461305
       10     5652     2172        3  7046546       17       15   286180
      104   142571]
CPU times: user 220 ms, sys: 180 ms, total: 400 ms
Wall time: 3min


In [15]:
c2 = s.select('c2').collect()
type(c2)

list

In [None]:
plt.hist(c2)

In [None]:
def plotter(data, labels):
    '''
    data must be a numpy.ndarray of shape (X, 96, 96, 1)
    labels must be a numpy.ndarray (X, 30)
    X must be any square number
    labels = true labels
    labesl2 = predicted labels (optional)
    '''
    
    dim = np.sqrt()
    fig = plt.figure(figsize=(10,10))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in range(len(labels)):
        img = data[i].reshape(96,96)
        ax = fig.add_subplot(dim, dim, i + 1, xticks=[], yticks=[])
        ax.imshow(data[i].reshape(96,96), cmap='gray')
        ax.scatter(labels[i][0::2] * 48 + 48, labels[i][1::2] * 48 + 48, c='r')
        if not labels2 is None:
            ax.scatter(labels2[i][0::2] * 48 + 48, labels2[i][1::2] * 48 + 48, c='b')



# Data maniplulation

In [9]:
for i,c in enumerate(train.columns[1:14]):
    train = train.withColumn(c, (train[c] - means[i]) / stdevs[i])

In [10]:
stats2 = train[train.columns[1:14]].describe()
stats2.show(vertical=True)

-RECORD 0-----------------------
 summary | count                
 c1      | 5851479              
 c2      | 10517430             
 c3      | 8259890              
 c4      | 8205798              
 c5      | 10229986             
 c6      | 8113327              
 c7      | 10051997             
 c8      | 10511722             
 c9      | 10051997             
 c10     | 5851479              
 c11     | 10051997             
 c12     | 2421098              
 c13     | 8205798              
-RECORD 1-----------------------
 summary | mean                 
 c1      | -7.97593112269365... 
 c2      | 5.828207091676494... 
 c3      | 3.909513938232698... 
 c4      | -3.74083514832785... 
 c5      | 9.537483029736827... 
 c6      | 3.244443654866041... 
 c7      | 9.026366645182133... 
 c8      | -4.24857390976072... 
 c9      | 6.576557065533054... 
 c10     | 1.379353179119628... 
 c11     | 4.190010407219867... 
 c12     | -1.01941175954099... 
 c13     | -4.75671009901150... 
-RECORD 2-

# Experimenting with RDDs

In [16]:
toyTrainRDD = sc.textFile('data/toy_train.txt')

In [17]:
toyTrainRDD.collect()[0]

'0\t\t1\t12\t1\t62526\t\t\t3\t\t\t\t\t3\t05db9164\t08d6d899\t9143c832\tf56b7dd5\t25c83c98\t7e0ccccf\t89a13b6b\t0b153874\t7cc72ec2\tc5fe5cb9\tfa365cf9\tae1bb660\td2420e4c\tb28479f6\tbffbd637\tbad5ee18\t07c540c4\tbbf70d82\t\t\t0429f84b\t\t32c7478e\tc0d61a5c\t\t'

In [21]:
line = toyTrainRDD.collect()[0]
line

'0\t\t1\t12\t1\t62526\t\t\t3\t\t\t\t\t3\t05db9164\t08d6d899\t9143c832\tf56b7dd5\t25c83c98\t7e0ccccf\t89a13b6b\t0b153874\t7cc72ec2\tc5fe5cb9\tfa365cf9\tae1bb660\td2420e4c\tb28479f6\tbffbd637\tbad5ee18\t07c540c4\tbbf70d82\t\t\t0429f84b\t\t32c7478e\tc0d61a5c\t\t'

In [22]:
line = line.split('\t')
line

['0',
 '',
 '1',
 '12',
 '1',
 '62526',
 '',
 '',
 '3',
 '',
 '',
 '',
 '',
 '3',
 '05db9164',
 '08d6d899',
 '9143c832',
 'f56b7dd5',
 '25c83c98',
 '7e0ccccf',
 '89a13b6b',
 '0b153874',
 '7cc72ec2',
 'c5fe5cb9',
 'fa365cf9',
 'ae1bb660',
 'd2420e4c',
 'b28479f6',
 'bffbd637',
 'bad5ee18',
 '07c540c4',
 'bbf70d82',
 '',
 '',
 '0429f84b',
 '',
 '32c7478e',
 'c0d61a5c',
 '',
 '']

In [12]:
def parse(line):
    line = line.split('\t')
    key = line[0]
    values = line[1:]
    for i,v in enumerate(values[:13]):
        v = 
    
    return key, values

In [14]:
toyTrainRDD = toyTrainRDD.map(parse)

toyTrainRDD.collect()[0]

('0',
 ('',
  '1',
  '12',
  '1',
  '62526',
  '',
  '',
  '3',
  '',
  '',
  '',
  '',
  '3',
  '05db9164',
  '08d6d899',
  '9143c832',
  'f56b7dd5',
  '25c83c98',
  '7e0ccccf',
  '89a13b6b',
  '0b153874',
  '7cc72ec2',
  'c5fe5cb9',
  'fa365cf9',
  'ae1bb660',
  'd2420e4c',
  'b28479f6',
  'bffbd637',
  'bad5ee18',
  '07c540c4',
  'bbf70d82',
  '',
  '',
  '0429f84b',
  '',
  '32c7478e',
  'c0d61a5c',
  '',
  ''))