In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'handlingMissing'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [6]:
# load the data
fil = '../data/zomato.csv'
schem = StructType([StructField('url', StringType()), StructField('address', StringType()),
                    StructField('name', StringType()), StructField('online_order', StringType()),
                    StructField('book_table', StringType()), StructField('rate', StringType()),
                    StructField('votes', IntegerType()), StructField('phone', StringType()),
                    StructField('location', StringType()), StructField('rest_type', StringType()),
                    StructField('dish_liked', StringType()), StructField('cuisines', StringType()),
                    StructField('approx_cost(for two people)', FloatType()), StructField('reviews_list', StringType()),
                    StructField('menu_item', StringType()), StructField('listed_in(type)', StringType()),
                    StructField('listed_in(city)', StringType())])
zomato = spark.read.format('csv').options(header=True).schema(schem).load(fil)\
    .withColumn('onlineOrder', col('online_order')=='Yes').withColumn('bookTable', col('online_order')=='Yes')\
    .drop('online_order', 'book_table')

# talk
display(zomato.limit(10).toPandas())

Unnamed: 0,url,address,name,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city),onlineOrder,bookTable
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,4.1/5,775.0,080 42297555,,,,,,,,,,True,True
1,"+91 9743772233""",Banashankari,Casual Dining,800,,('Rated 4.0','RATED\n You canÃ\x83Ã\x83Ã\x82Ã\x82Ã\x...,('Rated 5.0','RATED\n Overdelighted by the service and fo...,('Rated 4.0',,('Rated 4.0','RATED\n The place is nice and comfortable. ...,('Rated 4.0','RATED\n The place is nice and comfortable. ...,False,False
2,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,4.1/5,787.0,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800.0,"""[('Rated 4.0', 'RATED\n Had been here for di...",rice was well cooked and overall was great\n\n...,('Rated 5.0','RATED\n This place just cool ? with good am...,True,True
3,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,3.8/5,918.0,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800.0,"""[('Rated 3.0', """"RATED\n Ambience is not tha...",('Rated 3.0',"""""RATED\n \nWent there for a quick bite with ...",pasta churros and lasagne.\n\nNachos were pat...,True,True
4,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,3.7/5,88.0,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300.0,"""[('Rated 4.0', """"RATED\n Great food and prop...",('Rated 2.0','RATED\n Reached the place at 3pm on Saturda...,('Rated 4.0',False,False
5,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,3.8/5,166.0,+91 8026612447,,,,,,,,,,False,False
6,"+91 9901210005""",Basavanagudi,Casual Dining,600,,[],Buffet,Banashankari,,,,,,,,False,False
7,https://www.zomato.com/bangalore/timepass-dinn...,"37, 5-1, 4th Floor, Bosco Court, Gandhi Bazaar...",Timepass Dinner,3.8/5,286.0,+91 9980040002,,,,,,,,,,True,True
8,"+91 9980063005""",Basavanagudi,Casual Dining,600,,[],Buffet,Banashankari,,,,,,,,False,False
9,https://www.zomato.com/bangalore/rosewood-inte...,"19/1, New Timberyard Layout, Beside Satellite ...",Rosewood International Hotel - Bar & Restaurant,3.6/5,8.0,+91 9731716688,,,,,,,,,,False,False


In [19]:
# count nulls per column
cnt = zomato.count()
nullCounts = {colm:zomato.select(colm).where(col(colm).isNull()).count() for colm in zomato.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
print(nullCounts)

{'url': (0, 0.0), 'address': (0, 0.0), 'name': (85, 0.0011849993029415865), 'rate': (7775, 0.10839258329848041), 'votes': (20018, 0.2790743064268786), 'phone': (1227, 0.017105813467168548), 'location': (20054, 0.27957618848459503), 'rest_type': (20165, 0.281123658162554), 'dish_liked': (46841, 0.6530182629304335), 'cuisines': (27305, 0.3806635996096473), 'approx_cost(for two people)': (43611, 0.6079882894186532), 'reviews_list': (28185, 0.3929318276871602), 'menu_item': (28611, 0.398870765370138), 'listed_in(type)': (28983, 0.4040568799665412), 'listed_in(city)': (29344, 0.4090896417119755), 'onlineOrder': (8111, 0.11307681583716715), 'bookTable': (8111, 0.11307681583716715)}


In [46]:
# pretty print
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in zomato.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])
display(nullCountsDF)

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,dish_liked,46841.0,0.653018,StringType
1,approx_cost(for two people),43611.0,0.607988,FloatType
2,listed_in(city),29344.0,0.40909,StringType
3,listed_in(type),28983.0,0.404057,StringType
4,menu_item,28611.0,0.398871,StringType
5,reviews_list,28185.0,0.392932,StringType
6,cuisines,27305.0,0.380664,StringType
7,rest_type,20165.0,0.281124,StringType
8,location,20054.0,0.279576,StringType
9,votes,20018.0,0.279074,IntegerType


### dropping nulls

In [34]:
# drop all rows with all missing data
noNull = zomato.dropna(how='all')
print('Dataset reduced from %d to %d'%(cnt, noNull.count()))

Dataset reduced from 71730 to 71730


In [29]:
# drop all rows with any missing data
noNull = zomato.dropna()
print('Dataset reduced from %d to %d'%(cnt, noNull.count()))

Dataset reduced from 71730 to 8606


In [31]:
# drop all rows with more than numCols-N missing
N = 8
noNull = zomato.dropna(thresh = N)
print('Dataset reduced from %d to %d'%(cnt, noNull.count()))

Dataset reduced from 71730 to 65842


In [33]:
# drop rows only if null in votes
noNull = zomato.dropna(subset = 'votes')
print('Dataset reduced from %d to %d'%(cnt, noNull.count()))

Dataset reduced from 71730 to 51712


### filling nulls

In [48]:
filld = zomato.fillna({'dish_liked':'MISSING', 'approx_cost(for two people)':42, 'onlineOrder':False})
filld.select('dish_liked', 'approx_cost(for two people)', 'onlineOrder').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------+-----------+
|dish_liked                                                                                                                                                                                                                                                       |approx_cost(for two people)|onlineOrder|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------+-----------+
|MISSING                                                                                            

In [76]:
# fill nulls with the non-null mean: means are 283 and 387 (approx)
noNull = zomato.fillna(zomato.agg(*(avg(colm.name).alias(colm.name) for colm in zomato.schema
                                    if (colm.dataType is IntegerType()) or (colm.dataType is FloatType()))).first().asDict())
noNull.select('votes', 'approx_cost(for two people)').show()

+-----+---------------------------+
|votes|approx_cost(for two people)|
+-----+---------------------------+
|  775|                   387.4084|
|  283|                   387.4084|
|  787|                      800.0|
|  918|                      800.0|
|   88|                      300.0|
|  166|                   387.4084|
|  283|                   387.4084|
|  286|                   387.4084|
|  283|                   387.4084|
|    8|                   387.4084|
|  283|                   387.4084|
| 2556|                   387.4084|
|  283|                   387.4084|
|  324|                   387.4084|
|  283|                   387.4084|
|  504|                   387.4084|
|  283|                   387.4084|
|  402|                   387.4084|
|  283|                   387.4084|
|  150|                      600.0|
+-----+---------------------------+
only showing top 20 rows



In [77]:
sc.stop()