# Description

* techniques to clean data


-----

* filter examples here https://hackingandslacking.com/cleaning-pyspark-dataframes-1a3f5fdcedd1

# Imports/setup

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import os
import pyspark
import pandas as pd

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
import pyspark
sc = pyspark.SparkContext('local', appName='clean-data')

In [5]:
print(f'sc.environment:   {sc.environment}')
print(f'sc.applicationId: {sc.applicationId}')
print(f'sc.appName:       {sc.appName}')
print(f'sc.version:       {sc.version}')
print(f'sc.pythonVer:     {sc.pythonVer}')
print(f'sc.sparkHome:     {sc.sparkHome}')
print(f'sc.startTime:     {sc.startTime}')

sc.environment:   {'PYTHONHASHSEED': '0'}
sc.applicationId: local-1592493740148
sc.appName:       clean-data
sc.version:       2.4.5
sc.pythonVer:     3.7
sc.sparkHome:     None
sc.startTime:     1592493734496


In [6]:
from pyspark.sql import SparkSession

spark = (SparkSession
            .builder
            .appName('MAFIA pyspark sql session')
            .config('spark.some.config.option', 'some-value')
            .getOrCreate())

# Dummy data

In [21]:
df = (spark.read
      .option('header',True)
      .csv('dirty-data.csv'))
df.show()

+-------------+---+------+
|         name|age|person|
+-------------+---+------+
|         jack| 10|  True|
|         jill|  9|  True|
|humpty dumpty|egg| False|
|          red| 12|  True|
+-------------+---+------+



In [22]:
df.dtypes

[('name', 'string'), ('age', 'string'), ('person', 'string')]

In [23]:
from pyspark.sql.functions import col

In [31]:
df2 = (df.select(
    col('name'), 
    col('age').cast('float'),
    col('person').cast('boolean'))
)

print(df2.dtypes)
df2.show()

[('name', 'string'), ('age', 'float'), ('person', 'boolean')]
+-------------+----+------+
|         name| age|person|
+-------------+----+------+
|         jack|10.0|  true|
|         jill| 9.0|  true|
|humpty dumpty|null| false|
|          red|12.0|  true|
+-------------+----+------+



In [35]:
bool_feats = ['person']
num_feats = ['age']
cat_feats = ['name']

In [92]:
[col(c).cast('string').alias(f'{c}_cat') if c in bool_feats + cat_feats
                   else col(c).cast('float').alias(f'{c}_fl') if c in num_feats
                   else col(c).cast('string').alias(f'{c}_foo')
            for c in bool_feats + num_feats + cat_feats]

[Column<b'CAST(person AS STRING) AS `person_cat`'>,
 Column<b'CAST(age AS FLOAT) AS `age_fl`'>,
 Column<b'CAST(name AS STRING) AS `name_cat`'>]

In [94]:
df3 = df2.select([col(c).cast('string').alias(f'{c}_cat') if c in bool_feats + cat_feats
                   else col(c).cast('float').alias(f'{c}_fl') if c in num_feats
                   else col(c).cast('string').alias(f'{c}_foo')
            for c in bool_feats + num_feats + cat_feats])

print(df3.dtypes)
df3.show()

[('person_cat', 'string'), ('age_fl', 'float'), ('name_cat', 'string')]
+----------+------+-------------+
|person_cat|age_fl|     name_cat|
+----------+------+-------------+
|      true|  10.0|         jack|
|      true|   9.0|         jill|
|     false|  null|humpty dumpty|
|      true|  12.0|          red|
+----------+------+-------------+



In [49]:
from pyspark.sql.functions import expr

In [39]:
# this is harder to program for varying lists
df4 = (df2.select(
    col('person').cast('string'),
    col('name').cast('string'),
    col('age').cast('float')))
print(df4.dtypes)
df4.show()

[('person', 'string'), ('name', 'string'), ('age', 'float')]
+------+-------------+----+
|person|         name| age|
+------+-------------+----+
|  true|         jack|10.0|
|  true|         jill| 9.0|
| false|humpty dumpty|null|
|  true|          red|12.0|
+------+-------------+----+



I like this

In [93]:
def gen_select_items(feat_list, feat_type):
    """return a list of casted items to put in select call"""
    return [col(c).cast(feat_type) for c in feat_list]
    
def gen_select_all(cat_feats, num_feats):
    """return a list of columns to select, each cast as a string or float"""
    cat_feat_list = gen_select_items(cat_feats, 'string')
    num_feat_list = gen_select_items(num_feats, 'float')
    return cat_feat_list + num_feat_list

In [85]:
sel_items = gen_select_all(cat_feats=cat_feats + bool_feats,
               num_feats=num_feats)
sel_items

[Column<b'CAST(name AS STRING)'>,
 Column<b'CAST(person AS STRING)'>,
 Column<b'CAST(age AS FLOAT)'>]

In [86]:
df5 = df2.select(sel_items)
print(df4.dtypes)
df5.show()

[('person', 'string'), ('name', 'string'), ('age', 'float')]
+-------------+------+----+
|         name|person| age|
+-------------+------+----+
|         jack|  true|10.0|
|         jill|  true| 9.0|
|humpty dumpty| false|null|
|          red|  true|12.0|
+-------------+------+----+



In [88]:
df6 = (df2.select(
    col('person').cast(pyspark.sql.types.StringType),
    col('name').cast('string'),
    col('age').cast('float')))
print(df6.dtypes)
df6.show()

TypeError: unexpected type: <class 'type'>

# Filtering

In [59]:
df5.dropna().show()

+----+------+----+
|name|person| age|
+----+------+----+
|jack|  true|10.0|
|jill|  true| 9.0|
| red|  true|12.0|
+----+------+----+



In [62]:
df5.filter(df5.age < 10).show()

+----+------+---+
|name|person|age|
+----+------+---+
|jill|  true|9.0|
+----+------+---+



In [65]:
cond = df5.age < 10
df5.filter(cond).show()

+----+------+---+
|name|person|age|
+----+------+---+
|jill|  true|9.0|
+----+------+---+



In [67]:
df5.filter(col('age') < 10).show()

+----+------+---+
|name|person|age|
+----+------+---+
|jill|  true|9.0|
+----+------+---+



In [73]:
df5.filter(col('name').contains('j') | col('name').contains('d')).show()

+-------------+------+----+
|         name|person| age|
+-------------+------+----+
|         jack|  true|10.0|
|         jill|  true| 9.0|
|humpty dumpty| false|null|
|          red|  true|12.0|
+-------------+------+----+



In [77]:
df5.filter(col('name').like('%j%')).show()

+----+------+----+
|name|person| age|
+----+------+----+
|jack|  true|10.0|
|jill|  true| 9.0|
+----+------+----+



In [80]:
df5.filter(col('age').isin(10.0, 12)).show()

+----+------+----+
|name|person| age|
+----+------+----+
|jack|  true|10.0|
| red|  true|12.0|
+----+------+----+



In [83]:
df5.orderBy('person', 'name', ascending=[1,0]).show()

+-------------+------+----+
|         name|person| age|
+-------------+------+----+
|humpty dumpty| false|null|
|          red|  true|12.0|
|         jill|  true| 9.0|
|         jack|  true|10.0|
+-------------+------+----+



# schemas - doesn't work

In [95]:
df5.printSchema()

root
 |-- name: string (nullable = true)
 |-- person: string (nullable = true)
 |-- age: float (nullable = true)



In [103]:
list(map(lambda x: {x.name: x}, df5.schema.fields))

[{'name': StructField(name,StringType,true)},
 {'person': StructField(person,StringType,true)},
 {'age': StructField(age,FloatType,true)}]

In [116]:
raw_1 = (spark.read
      .option('header',True)
         .schema(df5.schema.simpleString())
      .csv('dirty-data.csv'))

ParseException: "\nextraneous input '<' expecting {'SELECT', 'FROM', 'ADD', 'AS', 'ALL', 'ANY', 'DISTINCT', 'WHERE', 'GROUP', 'BY', 'GROUPING', 'SETS', 'CUBE', 'ROLLUP', 'ORDER', 'HAVING', 'LIMIT', 'AT', 'OR', 'AND', 'IN', NOT, 'NO', 'EXISTS', 'BETWEEN', 'LIKE', RLIKE, 'IS', 'NULL', 'TRUE', 'FALSE', 'NULLS', 'ASC', 'DESC', 'FOR', 'INTERVAL', 'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'JOIN', 'CROSS', 'OUTER', 'INNER', 'LEFT', 'SEMI', 'RIGHT', 'FULL', 'NATURAL', 'ON', 'PIVOT', 'LATERAL', 'WINDOW', 'OVER', 'PARTITION', 'RANGE', 'ROWS', 'UNBOUNDED', 'PRECEDING', 'FOLLOWING', 'CURRENT', 'FIRST', 'AFTER', 'LAST', 'ROW', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'DIRECTORY', 'VIEW', 'REPLACE', 'INSERT', 'DELETE', 'INTO', 'DESCRIBE', 'EXPLAIN', 'FORMAT', 'LOGICAL', 'CODEGEN', 'COST', 'CAST', 'SHOW', 'TABLES', 'COLUMNS', 'COLUMN', 'USE', 'PARTITIONS', 'FUNCTIONS', 'DROP', 'UNION', 'EXCEPT', 'MINUS', 'INTERSECT', 'TO', 'TABLESAMPLE', 'STRATIFY', 'ALTER', 'RENAME', 'ARRAY', 'MAP', 'STRUCT', 'COMMENT', 'SET', 'RESET', 'DATA', 'START', 'TRANSACTION', 'COMMIT', 'ROLLBACK', 'MACRO', 'IGNORE', 'BOTH', 'LEADING', 'TRAILING', 'IF', 'POSITION', 'EXTRACT', 'DIV', 'PERCENT', 'BUCKET', 'OUT', 'OF', 'SORT', 'CLUSTER', 'DISTRIBUTE', 'OVERWRITE', 'TRANSFORM', 'REDUCE', 'SERDE', 'SERDEPROPERTIES', 'RECORDREADER', 'RECORDWRITER', 'DELIMITED', 'FIELDS', 'TERMINATED', 'COLLECTION', 'ITEMS', 'KEYS', 'ESCAPED', 'LINES', 'SEPARATED', 'FUNCTION', 'EXTENDED', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'LAZY', 'FORMATTED', 'GLOBAL', TEMPORARY, 'OPTIONS', 'UNSET', 'TBLPROPERTIES', 'DBPROPERTIES', 'BUCKETS', 'SKEWED', 'STORED', 'DIRECTORIES', 'LOCATION', 'EXCHANGE', 'ARCHIVE', 'UNARCHIVE', 'FILEFORMAT', 'TOUCH', 'COMPACT', 'CONCATENATE', 'CHANGE', 'CASCADE', 'RESTRICT', 'CLUSTERED', 'SORTED', 'PURGE', 'INPUTFORMAT', 'OUTPUTFORMAT', DATABASE, DATABASES, 'DFS', 'TRUNCATE', 'ANALYZE', 'COMPUTE', 'LIST', 'STATISTICS', 'PARTITIONED', 'EXTERNAL', 'DEFINED', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'REPAIR', 'RECOVER', 'EXPORT', 'IMPORT', 'LOAD', 'ROLE', 'ROLES', 'COMPACTIONS', 'PRINCIPALS', 'TRANSACTIONS', 'INDEX', 'INDEXES', 'LOCKS', 'OPTION', 'ANTI', 'LOCAL', 'INPATH', IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 6)\n\n== SQL ==\nstruct<name:string,person:string,age:float>\n------^^^\n"

In [111]:
df5.schema.fieldNames()

['name', 'person', 'age']

In [115]:
df5.schema.simpleString()

'struct<name:string,person:string,age:float>'

In [108]:
df

DataFrame[name: string, age: string, person: string]

In [119]:
[i for i in df5.schema]

[StructField(name,StringType,true),
 StructField(person,StringType,true),
 StructField(age,FloatType,true)]

In [120]:
df5.schema

StructType(List(StructField(name,StringType,true),StructField(person,StringType,true),StructField(age,FloatType,true)))