# Data Analysis with Pandas

## Data Cleaning and Preparation

## Handling Missing Data

***For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data***

In [1]:
import pandas as pd
import numpy as np
from numpy import nan as NA

In [2]:
data = pd.DataFrame([[1., 6.5, 3.0], [1., NA, NA],
                    [NA, NA, NA],[NA, 6.5, 3.]],
                   columns = list('abc'))

print(data)

     a    b    c
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [3]:
# Now, we used 'display()' command to print the DataFrame rather than using simple 'print' statement
# 'display()' statement display the data in formatted style

display(data)

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [4]:
# 'dropna()' is used to skip any row which keeps any value representing 'NA', 'NaN', 'Null'

cleaned = data.dropna()

display(cleaned)

Unnamed: 0,a,b,c
0,1.0,6.5,3.0


In [5]:
# when 'dropna()' is used with parameter 'how=all' it skips only those rows where all 'NA', 'NaN' or 'null' values found
# So, it skipped only row at index-2, which keeps all values as 'NaN'
# 'how = all' remove those only rows which contains all data as 'NaN'

cleaned = data.dropna(how='all')

display(cleaned)

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [6]:
data = pd.DataFrame([[1., 6.5, NA], [1., NA, NA],
                    [NA, NA, NA],[3., 6.5, NA]],
                   columns = list('abc'))

print(data)

     a    b   c
0  1.0  6.5 NaN
1  1.0  NaN NaN
2  NaN  NaN NaN
3  3.0  6.5 NaN


In [7]:
# dropping columns that have all null values by using 'dropna(how='all')'

data = data.dropna(thresh=2)

display(data)

Unnamed: 0,a,b,c
0,1.0,6.5,
3,3.0,6.5,


In [8]:
# dropping columns that have all null values by using 'dropna(how='all')'

data = data.dropna(axis=1, how='all')

display(data)

Unnamed: 0,a,b
0,1.0,6.5
3,3.0,6.5


In [9]:
# Here, 'dropna(how='all')' is not applied, so any row having any of 'NaN' value is skipped
# Hense, all fields contains atleast one 'NaN' value index, therefore, all columns are skipped.

data = data.dropna()

display(data)

Unnamed: 0,a,b
0,1.0,6.5
3,3.0,6.5


#### Note:  'dropna(how=all)' is equivalent to 'notnull()'

## Filling of Data

In [10]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,1.221874,-0.392793,1.297711
1,-1.019723,-1.086583,-0.853286
2,0.474872,1.766249,0.308454
3,2.504973,-2.229477,-0.735491
4,0.2397,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [11]:
# This will select first four rows of column-name '1'

print(df.iloc[:4, 1])
print()
print(df.iloc[:2, 2])

0   -0.392793
1   -1.086583
2    1.766249
3   -2.229477
Name: 1, dtype: float64

0    1.297711
1   -0.853286
Name: 2, dtype: float64


In [12]:
# assign 'NA' to first four rows of column-name '1'
# Also assign 'NA' to frist two rows of column-name '2'

from numpy import nan as NA

df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

display(df)

Unnamed: 0,0,1,2
0,1.221874,,
1,-1.019723,,
2,0.474872,,0.308454
3,2.504973,,-0.735491
4,0.2397,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [13]:
# This will fill all 'NaN' value fields with value '1.5'

df.fillna(1.5)

Unnamed: 0,0,1,2
0,1.221874,1.5,1.5
1,-1.019723,1.5,1.5
2,0.474872,1.5,0.308454
3,2.504973,1.5,-0.735491
4,0.2397,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [14]:
# Display of original DataFrame i.e. 'df'

display(df)

Unnamed: 0,0,1,2
0,1.221874,,
1,-1.019723,,
2,0.474872,,0.308454
3,2.504973,,-0.735491
4,0.2397,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [15]:
# This will fill 'NaN' values of Column-name '1' with value '0.5' & column-name '2' with value '0'

df = df.fillna({1: 0.5, 2: 0})

display(df)

Unnamed: 0,0,1,2
0,1.221874,0.5,0.0
1,-1.019723,0.5,0.0
2,0.474872,0.5,0.308454
3,2.504973,0.5,-0.735491
4,0.2397,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [16]:
# Again values set to 'NaN' for doing practice

df.iloc[2:5, 0] = NA
df.iloc[1:4, 1] = NA
df.iloc[:3, 2] = NA

display(df)

Unnamed: 0,0,1,2
0,1.221874,0.5,
1,-1.019723,,
2,,,
3,,,-0.735491
4,,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [17]:
# This 'ffill' method will fill from 'left to right' mean at 'axis-1' with available value at already filled index
# This filled 'index-0' of column-name '2' with value '0.5'
# This also filled 'index-1' of column-name '1' and also 'index-1' of column-name '2' with value '0.583010' which is originally
# value of 'index-1' of column-name '0'

df = df.fillna(method='ffill', axis = 1)
display(df)

Unnamed: 0,0,1,2
0,1.221874,0.5,0.5
1,-1.019723,-1.019723,-1.019723
2,,,
3,,,-0.735491
4,,0.448887,-1.542888
5,-0.291139,0.806686,0.09855
6,0.313141,-0.597331,0.777537


In [18]:
# Again created DataFrame

df = pd.DataFrame(np.random.randn(7,3))

display(df)

Unnamed: 0,0,1,2
0,-0.698284,0.226481,0.547044
1,-0.067309,0.94581,1.089944
2,0.103308,-0.592502,0.362459
3,-0.246093,-0.953565,1.68209
4,1.621142,0.180626,-1.221957
5,-0.123453,-0.153898,0.040066
6,2.206479,-0.571522,-1.073265


In [19]:
# Again values set to 'NaN' for doing practice

df.iloc[0:3, 1] = NA
df.iloc[2:5, 2] = NA

display(df)

Unnamed: 0,0,1,2
0,-0.698284,,0.547044
1,-0.067309,,1.089944
2,0.103308,,
3,-0.246093,-0.953565,
4,1.621142,0.180626,
5,-0.123453,-0.153898,0.040066
6,2.206479,-0.571522,-1.073265


In [20]:
# It will have no any significat affect on values of DataFrame i.e. df due to parameter 'limit=1'

df.fillna(method='ffill', limit=1, axis=1)
display(df)

Unnamed: 0,0,1,2
0,-0.698284,,0.547044
1,-0.067309,,1.089944
2,0.103308,,
3,-0.246093,-0.953565,
4,1.621142,0.180626,
5,-0.123453,-0.153898,0.040066
6,2.206479,-0.571522,-1.073265


In [21]:
# Here, we used the above code without parameter 'limit=1'
# It also has no affect to fill the 'NA' values in give DataFrame i.e. 'df'

df.fillna(method='ffill', axis=1)
display(df)

Unnamed: 0,0,1,2
0,-0.698284,,0.547044
1,-0.067309,,1.089944
2,0.103308,,
3,-0.246093,-0.953565,
4,1.621142,0.180626,
5,-0.123453,-0.153898,0.040066
6,2.206479,-0.571522,-1.073265


## Removing Duplicate Values

In [22]:
# Created a DataFrame i.e. 'data_df' using python dictinery

data_df = pd.DataFrame({'key1':['one', 'two'] * 3 + ['two'],
                        'key2':[1, 1, 2, 3, 3, 4, 4]})

display(data_df)

Unnamed: 0,key1,key2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [23]:
# This will duplicated values in DataFrame
# 'duplicated' mean full row should be duplicate of upper level row
# This will display that only value 'two' of index-6' is duplicated because both have same values at 'key1' & 'key2'

data_df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [24]:
# This will drop the duplicated rows
# this is not in place operation, so it will not store dropped values in the original DataFrame

data_df.drop_duplicates()

Unnamed: 0,key1,key2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [25]:
# So, display(data_df) will show again the duplicated rows

display(data_df)

Unnamed: 0,key1,key2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [26]:
# This will not only drop duplicate rows, but in addition it will also update the DataFram being an in-place operation

data_df = data_df.drop_duplicates()

display(data_df)

Unnamed: 0,key1,key2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [27]:
# Again created a DataFrame

data2_df = pd.DataFrame({'key1':['one', 'two'] * 3 + ['two'],
                        'key2':[1, 1, 2, 3, 3, 4, 4]})

display(data2_df)

Unnamed: 0,key1,key2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [28]:
# A column is created

data2_df['v1'] = ['one', 'two', 'one', 'four', 'one', 'six', 'two']

display(data2_df)

Unnamed: 0,key1,key2,v1
0,one,1,one
1,two,1,two
2,one,2,one
3,two,3,four
4,one,3,one
5,two,4,six
6,two,4,two


In [29]:
# This will drop duplicates from only column-labels 'key1' & 'v1' and will not from column-label 'key2'
# This will drop rows with 'index-2, 4 & 6' as these are duplicate of row-0 with reference to column-names 'key1 & v1'

data2_df.drop_duplicates(['key1', 'v1'])

Unnamed: 0,key1,key2,v1
0,one,1,one
1,two,1,two
3,two,3,four
5,two,4,six


In [30]:
# This will drop duplicates from only column-labels 'key1' & 'v1' and will not from column-label 'key2'
# This will drop rows with 'index-2, 4 & 6' as these are duplicate of row-0 with reference to column-names 'key1 & v1'


data2_df = data2_df.drop_duplicates(['key1', 'v1'])

display(data2_df)        # This will display leftover row here

Unnamed: 0,key1,key2,v1
0,one,1,one
1,two,1,two
3,two,3,four
5,two,4,six


## Replace Values

***Pandas DataFrame 'replace()' function works both for replacing 'Null' values as well as 'Not Null' values***

In [31]:
# Again created DataFrame

df = pd.DataFrame(np.random.randn(7,3))

display(df)

Unnamed: 0,0,1,2
0,0.006116,0.903394,0.499502
1,0.118458,0.998334,1.323618
2,-1.012796,-1.335783,0.713992
3,-0.014995,-0.878672,1.370064
4,-0.881794,0.624649,-0.623661
5,-0.439731,0.399624,1.378659
6,1.813064,2.268604,-0.706032


In [32]:
# assign 'NA' to first four rows of column-name '1'
# Also assign 'NA' to frist two rows of column-name '2'

df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

display(df)

Unnamed: 0,0,1,2
0,0.006116,,
1,0.118458,,
2,-1.012796,,0.713992
3,-0.014995,,1.370064
4,-0.881794,0.624649,-0.623661
5,-0.439731,0.399624,1.378659
6,1.813064,2.268604,-0.706032


In [33]:
# 'replace()' function of Pandas DataFrame replaces 'Null' as well as 'Not Null' values
# Here, this will replace 'NaN' values in DataFrame i.e. 'df' with '-999'

df = df.replace(np.nan, -999)

display(df)       # this will display the revised values after replacing 'NaN' with '-999'

Unnamed: 0,0,1,2
0,0.006116,-999.0,-999.0
1,0.118458,-999.0,-999.0
2,-1.012796,-999.0,0.713992
3,-0.014995,-999.0,1.370064
4,-0.881794,0.624649,-0.623661
5,-0.439731,0.399624,1.378659
6,1.813064,2.268604,-0.706032


In [34]:
# This will further replace '-999' with '9'

df = df.replace(-999, -9)

display(df)       # this will display the DataFrame after replacing all '-999' values with '-9'

Unnamed: 0,0,1,2
0,0.006116,-9.0,-9.0
1,0.118458,-9.0,-9.0
2,-1.012796,-9.0,0.713992
3,-0.014995,-9.0,1.370064
4,-0.881794,0.624649,-0.623661
5,-0.439731,0.399624,1.378659
6,1.813064,2.268604,-0.706032


### Important
- We can also do multiple replace using this 'replace()' function as follows:
    - Suppose the above case, both has values of 'NaN' and '-999', so we will execute replace function as:
    - df.replace([-999, -9], [np.nan, 0])

## Renaming Indexes

In [35]:
# A new DataFrame is created

dFrame = pd.DataFrame(np.arange(12).reshape(3,4),
                     columns = ['one', 'two', 'three', 'four'],
                     index = ['Ohio', 'Colorado', 'New York'])

display(dFrame)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [36]:
# 'lambda' function is created to replace indexes

transform = lambda x: x[:].upper()

dFrame = dFrame.index.map(transform)

display(dFrame)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

## Detecting Filtering Outliers

#### Outliers
    - Outliers are the data out of range
    - For example, age of students is collected from user and he/she inputs age as '0' are '1000', so this will be called as outliers
    - Data Outliers are handled with 'Detecting Filters'

In [84]:
# Created a Pandas DataFrame using 'np.random.randn()'

dFrame2 = pd.DataFrame(np.random.randn(1000,4))

dFrame2.head()      # this will display top-five rows of DataFrame

Unnamed: 0,0,1,2,3
0,-0.219505,0.502767,-1.117451,0.174183
1,0.710636,-0.598825,-0.06606,-0.008149
2,-1.03449,0.652384,1.069814,0.165874
3,0.257211,0.313655,-0.492335,-0.615343
4,-0.637488,-0.905323,0.354991,0.014481


In [85]:
# Now, if we wish to analyse this large data to find any 'outlier', we will use 'describe()'
# this will display the summary of DataFrame

dFrame2.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.011162,-0.01931,0.030587,-0.028056
std,1.016289,0.98568,0.975914,0.972719
min,-3.056038,-2.714616,-2.864473,-3.547752
25%,-0.684483,-0.703466,-0.63375,-0.662402
50%,-0.033619,-0.021415,0.041093,-0.043727
75%,0.678394,0.644598,0.706043,0.61191
max,3.291386,3.274745,2.948336,3.98928


In [86]:
# This will store 'column-2' of DataFrame in a variable 'col' to further analyse

col = dFrame2[2]

print(col)

0     -1.117451
1     -0.066060
2      1.069814
3     -0.492335
4      0.354991
         ...   
995   -0.733369
996    0.807320
997    0.191016
998   -0.856284
999   -0.801283
Name: 2, Length: 1000, dtype: float64


In [87]:
# This will select all rows having more than 3 values

col[np.abs(col) > 3]             # boolean

Series([], Name: 2, dtype: float64)

In [88]:
#To select all rows having a value exceeding 3 or –3, you can use the 'any' method on a boolean DataFrame:
#any method will check all cells 

outliers = dFrame2[(np.abs(dFrame2) > 3).any(1)]
print()

print(len(outliers), len(dFrame2))

display(outliers)


9 1000


Unnamed: 0,0,1,2,3
44,0.953035,-0.554971,-1.372989,-3.151852
93,1.209302,-1.262208,-1.840725,3.98928
201,0.220167,0.963058,-0.440705,-3.547752
338,-0.583195,3.274745,-1.342721,1.999307
363,0.852843,3.152151,0.731522,-0.011547
491,3.291386,-1.229103,-0.642449,-0.780475
558,-0.876654,3.040167,1.467735,-1.792844
934,-3.056038,-1.473618,-1.591261,0.218455
985,3.168767,0.605599,-0.151459,0.591474


In [90]:
# check the difference

print(dFrame2)
print()

data[(np.abs(dFrame2) > 3)]

            0         1         2         3
0   -0.219505  0.502767 -1.117451  0.174183
1    0.710636 -0.598825 -0.066060 -0.008149
2   -1.034490  0.652384  1.069814  0.165874
3    0.257211  0.313655 -0.492335 -0.615343
4   -0.637488 -0.905323  0.354991  0.014481
..        ...       ...       ...       ...
995  0.036384 -0.898693 -0.733369 -1.410731
996 -0.831637 -0.266791  0.807320 -1.026614
997 -0.356253 -0.988738  0.191016 -0.324261
998 -0.448288  0.014342 -0.856284  0.171193
999  0.686490  0.351366 -0.801283 -1.053335

[1000 rows x 4 columns]



Unnamed: 0,a,b
0,,
3,,


## Permutation and Random Sampling

### Permutation mean Data Re-ordering not any Change in Data

In [92]:
# Here, another DataFrame is created

dFrame3 = pd.DataFrame(np.arange(5 * 4).reshape(5,4))

print(dFrame3.shape)

print()

display(dFrame3)

(5, 4)



Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [116]:
# This function 'permutation()' is used for sampling purpose

sampler = np.random.permutation(4)
df = dFrame3[sampler]

display(df)

Unnamed: 0,2,3,0,1
0,2,3,0,1
1,6,7,4,5
2,10,11,8,9
3,14,15,12,13
4,18,19,16,17


In [119]:
df.take(sampler)

Unnamed: 0,2,3,0,1
2,10,11,8,9
3,14,15,12,13
0,2,3,0,1
1,6,7,4,5


# Regular Expressions & Vectorized Function

## 1- Regular Expression

    - Regular expressions provide a flexible way to search or match (often more complex) string patterns in text.
    - A single expression, commonly called a regex, is a string formed according to the regular expression language.
    - Python’s built-in re module is responsible for applying regular expressions to strings:
    - The re module functions fall into three categories: 
            - Pattern Matching, 
            - Substitution,
            - Splitting
    - When you call re.split('\s+', text) , the regular expression is first compiled, and then its split method is 
    called on the passed text. You can compile the regex yourself with re.compile , forming a reusable regex object.

In [127]:
import re

text = "foo     bar\t baz  \tqux"
re.split('\s+', text)               # spliting based on whitespaces & tabs

['foo', 'bar', 'baz', 'qux']

In [126]:
# This will split based on whitespaces between the text string

text.split(" ")

['foo', '', '', '', '', 'bar\t', 'baz', '', '\tqux']

In [128]:
# This is normal split function 'split()'

normal_split = text.split()
normal_split

['foo', 'bar', 'baz', 'qux']

In [129]:
# Compile once to use again and again and save time

rgx = re.compile('\s+')
rgx.split(text)

['foo', 'bar', 'baz', 'qux']

## 2- Vectorize String Functions in Pandas

In [130]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}

data = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]   

data = pd.Series(data)
display(data)

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

In [131]:
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

#data.str.findall(pattern, flags=re.IGNORECASE)

matches = data[data.str.match(pattern, flags=re.IGNORECASE)]

matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

#### At this point, Practice of Lesson-4 Videos of PIAIC Student Portal is completed.