# Python for Data Analysis - Workbook 3 (Data Cleaning and Preparation)

### Preliminaries

In [73]:
import numpy as np
import pandas as pd
import seaborn as sns

## Handling Missing Data

### Dropping null values

In [18]:
# Pandas represents missing numeric values as NaN. The Python 'None' value is also treated as NA (not available)
# Can test for null values with the isnull() method

string_data = pd.Series(['aardvark', 'artichoke', None, 'avocado'])
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# Very easy to drop missing data in a Series using the dropna() method

data = pd.Series([1, np.nan, 3.5, 7])
print(data)
data.dropna()

0    1.0
1    NaN
2    3.5
3    7.0
dtype: float64


0    1.0
2    3.5
3    7.0
dtype: float64

In [9]:
# For DataFrames it is more complex, because by default dropna() drops an entire row if it contains any NA value

df = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan]])
print(df)
df.dropna()

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# Passing in the how ='all' flag will only drop rows that are all NA

df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,


In [15]:
# Can also column-wise drop instead of row-wise

df[4] = np.nan
print(df)
df.dropna(axis=1, how='all')

     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,


In [17]:
# Can also use thresh to keep only the rows or columns that have at least that many non-NA values

df.dropna(axis = 1, thresh = 2)

Unnamed: 0,0
0,1.0
1,1.0
2,


### Filling Null Values

In [19]:
# Can also fill in for nulls using fillna()

df.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0


In [21]:
# if you pass a dict, it fills different values for each column or row (depending on what axis you specified)

df.fillna({0: 0, 1: -1, 2: -2, 4:-4})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,-4.0
1,1.0,-1.0,-2.0,-4.0
2,0.0,-1.0,-2.0,-4.0


In [24]:
# Can also propagate neighbouring values, as with reindexing, using ffill() / bfill()
# (fillna(method=...) is deprecated in recent pandas). limit caps how many consecutive NAs get filled

df.ffill(axis=1, limit=2)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,3.0
1,1.0,1.0,1.0,
2,,,,


In [27]:
# Can also get creative and use things like mean

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
print(df)

df.fillna(df.mean())


          0         1         2
0 -0.766894  0.128638  0.460981
1 -3.014465 -1.856439  1.800927
2 -0.091198       NaN  0.817156
3 -1.282198       NaN -2.058563
4  0.053238       NaN       NaN
5  0.060676       NaN       NaN


Unnamed: 0,0,1,2
0,-0.766894,0.128638,0.460981
1,-3.014465,-1.856439,1.800927
2,-0.091198,-0.8639,0.817156
3,-1.282198,-0.8639,-2.058563
4,0.053238,-0.8639,0.255125
5,0.060676,-0.8639,0.255125


### Data Transformation

#### Removing Duplicate Values

In [32]:
# duplicated() returns whether each row is a duplicate or not

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2': [1,1,2,3,3,4,4]})
print(data)
data.duplicated()

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [34]:
# drop_duplicates returns a DF with only the rows where duplicated() returns false

data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [36]:
# Can also pass a specific column or a list of columns to check for duplicates on only those columns

data['v1'] = range(7)
print(data)
data.drop_duplicates(['k1'])

    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [38]:
# By default, these methods keep the first observed occurrence. Pass keep='last' to keep the last one instead

data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


#### Transform using a mapping or a function

In [133]:
# Load sample dataset

diamonds = sns.load_dataset('diamonds').head(10)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [135]:
# Can update column values by using the map() method - it accepts a dict or a function (here, a function that looks up a dict)

newNames = ['SuperDuper', 'Super', 'Yay', 'Meh', 'Really?']
mapping = dict(zip(diamonds['cut'].unique(), newNames))
diamonds['cut'].map(lambda x: mapping[x].lower())

0    superduper
1         super
2           yay
3         super
4           yay
5           meh
6           meh
7           meh
8       really?
9           meh
Name: cut, dtype: object

In [136]:
# Same idea but for a function, use apply()

diamonds['cut'].apply(lambda x:x.upper())

0        IDEAL
1      PREMIUM
2         GOOD
3      PREMIUM
4         GOOD
5    VERY GOOD
6    VERY GOOD
7    VERY GOOD
8         FAIR
9    VERY GOOD
Name: cut, dtype: object

#### Replacing Values

In [137]:
# Can change all instances of a given value with the replace() method. Like find & replace in Word or Excel
# Can also pass in a dict to make multiple replacements

print(diamonds['cut'].replace('Ideal', 'I deal'))
diamonds['cut'].replace({'Ideal': 'Super', 'Fair': 'Lame'})

0       I deal
1      Premium
2         Good
3      Premium
4         Good
5    Very Good
6    Very Good
7    Very Good
8         Fair
9    Very Good
Name: cut, dtype: object


0        Super
1      Premium
2         Good
3      Premium
4         Good
5    Very Good
6    Very Good
7    Very Good
8         Lame
9    Very Good
Name: cut, dtype: object

#### Rename Axis Indexes

In [76]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)), 
                    index= ['Ohio', 'Colorado', 'New York'], 
                    columns = ['one', 'two', 'three', 'four'])

In [77]:
# Like Series, axis indexes have a map method (returns a new Index; assign it back to data.index to modify the original)

data.index.map(lambda x: x.upper())

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [80]:
# If you want to create a transformed version of the dataset without modifying original, use rename() 
# Note, not an index method but one you call on the DataFrame itself

data.rename(index=str.upper, columns=str.title)

Unnamed: 0,One,Two,Three,Four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [81]:
# Can also use rename() and pass in a dict to only change a subset

data.rename(index={'Ohio': 'Indiana'}, columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
Indiana,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


#### Discretization and Binning

In [91]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Let's say we want to divide into groups from 18-25, 26-35, 36-60, and 61-100

# First we create an array with our dividing lines
bins = [18, 25, 35, 60, 100]

# Then we use the cut() function to divide up the data according to our brackets
cats = pd.cut(ages, bins)
cats

# By default, each bin is closed on the right, e.g. (18, 25] includes 25 but not 18. Pass right=False to close on the left instead

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [92]:
# Pandas returns a special object called a Categorical 

print(cats.categories)
print('\n')
print(cats.codes)

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')


[0 0 0 1 0 0 2 1 3 2 2 1]


In [93]:
# Get bin counts from cut using value_counts()
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [94]:
# Can also pass in custom bin names using the labels argument

group_names = ['Youth', 'Young Adult', 'Middle-Aged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, Young Adult, Youth, ..., Young Adult, Senior, Middle-Aged, Middle-Aged, Young Adult]
Length: 12
Categories (4, object): [Youth < Young Adult < Middle-Aged < Senior]

In [96]:
# Passing in an integer number will compute equal length bins based on the max and min values of the data
# This works well for uniformly distributed data but won't help elsewhere in many cases
# precision arg sets the number of decimal places used when storing and displaying the bin labels

data = np.random.rand(20)
pd.cut(data, 4, precision = 2)

[(0.059, 0.29], (0.059, 0.29], (0.52, 0.75], (0.059, 0.29], (0.52, 0.75], ..., (0.75, 0.98], (0.75, 0.98], (0.75, 0.98], (0.059, 0.29], (0.75, 0.98]]
Length: 20
Categories (4, interval[float64]): [(0.059, 0.29] < (0.29, 0.52] < (0.52, 0.75] < (0.75, 0.98]]

In [100]:
# Similarly, qcut bins data based on sample quantiles rather than fixed-width intervals
# So whatever the distribution, each bin holds roughly the same number of observations

data = np.random.randn(1000)

# Cut into quartiles
cats = pd.qcut(data, 4)

# The top-level pd.value_counts() is deprecated; wrap in a Series and call the method instead
pd.Series(cats).value_counts()

(0.724, 3.105]                   250
(0.106, 0.724]                   250
(-0.585, 0.106]                  250
(-2.8979999999999997, -0.585]    250
dtype: int64

#### Detecting and Filtering Outliers

In [113]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.018541,-0.030294,-0.004316,-0.025085
std,1.016067,0.982236,1.002975,0.989476
min,-2.909018,-3.473158,-3.111542,-3.004346
25%,-0.736096,-0.681495,-0.681144,-0.67982
50%,-0.010949,0.003335,-0.00395,-0.042036
75%,0.635153,0.669514,0.668312,0.68154
max,3.216408,2.832871,3.174571,3.1681


In [114]:
# Can use the any() method to find rows/columns where any value meets a given condition

data[(np.abs(data) > 3).any(axis = 1)]

Unnamed: 0,0,1,2,3
72,1.477917,-1.862123,0.518052,3.033005
204,1.4686,0.142101,0.16244,3.1681
344,3.216408,-0.015721,-1.047248,-0.935091
433,0.07223,-3.473158,3.174571,0.062559
435,-0.60216,-1.866691,-3.111542,-0.56696
451,3.197626,-0.269197,1.034405,-1.554013
500,2.427603,0.853248,-3.035457,0.363482
615,-1.336804,0.051877,3.030625,-0.524194
738,-1.171463,-0.13454,3.089165,-0.543379
860,0.250772,1.222599,-2.957142,-3.004346


In [116]:
# Can also set values this way, e.g. here's how you would cap values outside of the interval

data[np.abs(data) > 3] = np.sign(data) * 3
data.iloc[860]

0    0.250772
1    1.222599
2   -2.957142
3   -3.000000
Name: 860, dtype: float64

#### Permutation and Random Sampling

In [124]:
# Numpy has a function for random re-ordering called permutation()

df = pd.DataFrame(np.arange(5 * 4).reshape((5,4)))
print(df)
print('\n')
sampler = np.random.permutation(5) # Returns a range of 0-4 elements in a random order
print(sampler)

df.take(sampler)
# df.iloc[sampler] is equivalent


    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19


[1 4 0 3 2]


Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11


In [125]:
# Take a random subset using the sample method

df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3


In [128]:
# Can enable replace = True to allow for repeat choices

choices = pd.Series([5, 7, -1, 6, 4])
print(choices)
draws = choices.sample(n=10, replace=True)
draws

0    5
1    7
2   -1
3    6
4    4
dtype: int64


3    6
4    4
4    4
2   -1
3    6
0    5
3    6
1    7
0    5
0    5
dtype: int64

#### Computing Dummy/Indicator Variables

In [139]:
# Common statistical or ML task is converting a categorical variable into a binary indicator matrix
print(diamonds['cut'])
pd.get_dummies(diamonds['cut'])


0        Ideal
1      Premium
2         Good
3      Premium
4         Good
5    Very Good
6    Very Good
7    Very Good
8         Fair
9    Very Good
Name: cut, dtype: object


Unnamed: 0,Fair,Good,Ideal,Premium,Very Good
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,1,0,0,0
5,0,0,0,0,1
6,0,0,0,0,1
7,0,0,0,0,1
8,1,0,0,0,0
9,0,0,0,0,1


In [147]:
# Can also add a prefix for easy joining later

cuts = diamonds[['clarity', 'cut']]
dummies = pd.get_dummies(cuts, prefix='key')
print(dummies)
cuts.join(dummies)

   key_SI1  key_SI2  key_VS1  key_VS2  key_VVS1  key_VVS2  key_Fair  key_Good  \
0        0        1        0        0         0         0         0         0   
1        1        0        0        0         0         0         0         0   
2        0        0        1        0         0         0         0         1   
3        0        0        0        1         0         0         0         0   
4        0        1        0        0         0         0         0         1   
5        0        0        0        0         0         1         0         0   
6        0        0        0        0         1         0         0         0   
7        1        0        0        0         0         0         0         0   
8        0        0        0        1         0         0         1         0   
9        0        0        1        0         0         0         0         0   

   key_Ideal  key_Premium  key_Very Good  
0          1            0              0  
1          0          

Unnamed: 0,clarity,cut,key_SI1,key_SI2,key_VS1,key_VS2,key_VVS1,key_VVS2,key_Fair,key_Good,key_Ideal,key_Premium,key_Very Good
0,SI2,Ideal,0,1,0,0,0,0,0,0,1,0,0
1,SI1,Premium,1,0,0,0,0,0,0,0,0,1,0
2,VS1,Good,0,0,1,0,0,0,0,1,0,0,0
3,VS2,Premium,0,0,0,1,0,0,0,0,0,1,0
4,SI2,Good,0,1,0,0,0,0,0,1,0,0,0
5,VVS2,Very Good,0,0,0,0,0,1,0,0,0,0,1
6,VVS1,Very Good,0,0,0,0,1,0,0,0,0,0,1
7,SI1,Very Good,1,0,0,0,0,0,0,0,0,0,1
8,VS2,Fair,0,0,0,1,0,0,1,0,0,0,0
9,VS1,Very Good,0,0,1,0,0,0,0,0,0,0,1


In [151]:
# A useful statistical recipe is to combine get_dummies with cut() to discretize

# Fixed seed so the random draws (and hence the indicator matrix) are reproducible
np.random.seed(12345)
values = np.random.rand(10)

bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# One indicator column per bin; each row flags the interval its value falls in
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


### String Manipulations

In [152]:
# split a string based on a delimiter

val = 'a,b, guido'
val.split(',')

['a', 'b', ' guido']

In [153]:
# strip to trim whitespace

[x.strip() for x in val.split(',')]

# lstrip/rstrip do one side or the other

['a', 'b', 'guido']

In [154]:
# can join the pieces back together with a new delimiter using join()

pieces = [x.strip() for x in val.split(',')]
'::'.join(pieces)

'a::b::guido'

In [155]:
# finding substrings using the 'in' keyword

'guido' in val

True

In [158]:
# Can also use index and find methods to get the index where the string starts

print(val.find('guido'))
print(val.index('guido'))

# Note, if the string isn't found, index raises a ValueError, whereas find returns -1

5
5


In [159]:
# Replace

val.replace('guido', 'dido')

'a,b, dido'

In [161]:
# Count number of occurrences of substring

text = 'store data in the form of rectangular grids by which the data can be over viewed easily. Each row of the rectangular grid contains values of an instance, and each column of the grid is a vector which holds data for a specific variable. This means that rows of a DataFrame do not need to contain, values of same data type, they can be numeric, character, logical, etc. DataFrames for Python come with the Pandas library, and they are defined as two-dimensional labeled data structures with potentially different types of columns.'

text.count('data')

5

In [163]:
# rfind() returns the starting index of the last occurrence of the substring, whereas find() returns that of the first

print(text.find('data'))
text.rfind('data')

6


468

#### Regular Expressions

In [165]:
# Python's built in module for handling regular expressions

import re

In [177]:
# Regular expressions provide a flexible way to search or match for more complex patterns
# More here: https://www.w3schools.com/python/python_regex.asp

text = "foo   bar\t baz \tqux"

In [None]:
re.split(r'\s+', text) # split on runs of one or more whitespace characters (spaces, tabs, newlines)

In [181]:
# Can precompile for better performance

regex = re.compile(r'\s+')
regex.split(text)
# Prepend the r character to use a raw string literal to avoid unwanted escaping

['foo', 'bar', 'baz', 'qux']

In [182]:
emails = """Dave dave@google.com 
Steve steve@gmail.com 
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

In [187]:
# Define the search pattern

pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags = re.IGNORECASE)

In [189]:
# findall() produces the full list of matches

regex.findall(emails)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [190]:
# search returns a match object for only the first email in the text; its span gives the start and end positions of the match

regex.search(emails)

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>