In [1]:
# Data Loading, Storage, and File Formats
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\pandas_ex1.csv')

In [3]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# overriding the column names
df= pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\pandas_ex1.csv', 
             names=['Col1', 'Col2', 'Col3', 'Col4', 'message'])
df

Unnamed: 0,Col1,Col2,Col3,Col4,message
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


In [5]:
# No headers
df= pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\pandas_ex1.csv', 
             header= None)
df

Unnamed: 0,0,1,2,3,4
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


In [6]:
# Suppose you wanted the message column to be the index
names = ['Col1', 'Col2', 'Col3', 'Col4', 'message']
df= pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\pandas_ex1.csv', 
                 names= names, index_col= 'message')
df

Unnamed: 0_level_0,Col1,Col2,Col3,Col4
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
message,a,b,c,d
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [7]:
# Handling missing values
# Missing data is usually either not present (empty string) or marked by
# some sentinel value. By default, pandas uses a set of commonly occurring 
# sentinels, such as NA and NULL:

In [11]:
df = pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\pandas_ex2.csv')
df

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2.0,3.0,4,
1,two,5.0,6.0,,8,world
2,three,9.0,10.0,11.0,12,foo
3,four,,,4.0,Globe,
4,five,,,,,Earth


In [12]:
pd.isnull(df)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False
3,False,True,True,False,False,True
4,False,True,True,True,True,False


In [13]:
pd.isnull(df['a'])

0    False
1    False
2    False
3     True
4     True
Name: a, dtype: bool

In [14]:
# The na_values option can take either a list or set of strings to consider 
# missing values:
# Missing data is usually either not present (empty string) or marked by
# some sentinel value. 
# By default, pandas uses a set of commonly occurring sentinels,
# such as NA and NULL:
df = pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\pandas_ex1.csv', 
                 na_values=['NULL'])
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [14]:
# Different NA sentinels can be specified for each column in a dict:
sentinels = {'message': ['foo', 'NA'], 'something': ['two', 'three']}
df = pd.read_csv('E:\\MYLEARN\\2-ANALYTICS\\datasets\\pandas_ex3.csv', 
                 na_values=sentinels, nrows=7)
df

FileNotFoundError: File b'E:\\MYLEARN\\2-ANALYTICS\\datasets\\pandas_ex3.csv' does not exist

In [15]:
# Data Cleaning and Preparation
from numpy import nan as NA
df = pd.DataFrame([[1., 6.5, 3.], 
                     [1.,NA ,NA ],
                     [NA,NA ,NA ], 
                     [NA, 6.5, 3.]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# dropna by default drops any row containing a missing value:
df.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
# Passing how='all' will only drop rows that are all NA:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-1.333158,2.521131,1.232129
1,-1.249939,0.308228,-0.452659
2,-0.153742,-1.464652,0.652946
3,-1.794587,0.521595,0.208709
4,-0.260902,0.059667,-1.540081
5,-1.871605,0.564977,0.619591
6,-0.705568,1.282251,0.989663


In [15]:
# upto first 3 rows and col  = 1
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,-1.333158,,1.232129
1,-1.249939,,-0.452659
2,-0.153742,,0.652946
3,-1.794587,,0.208709
4,-0.260902,0.059667,-1.540081
5,-1.871605,0.564977,0.619591
6,-0.705568,1.282251,0.989663


In [16]:
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-1.333158,,
1,-1.249939,,
2,-0.153742,,0.652946
3,-1.794587,,0.208709
4,-0.260902,0.059667,-1.540081
5,-1.871605,0.564977,0.619591
6,-0.705568,1.282251,0.989663


In [17]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.260902,0.059667,-1.540081
5,-1.871605,0.564977,0.619591
6,-0.705568,1.282251,0.989663


In [18]:
# Filling In Missing Data
df
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.333158,0.0,0.0
1,-1.249939,0.0,0.0
2,-0.153742,0.0,0.652946
3,-1.794587,0.0,0.208709
4,-0.260902,0.059667,-1.540081
5,-1.871605,0.564977,0.619591
6,-0.705568,1.282251,0.989663


In [17]:
# fillna returns a new object, but you can modify the existing object in-place
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [54]:
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.847212,0.0,0.0
1,-0.187127,0.0,0.0
2,-0.13645,0.0,-0.728593
3,0.183284,0.0,-0.585722
4,-0.005329,-0.462178,0.819565
5,-0.359254,-0.326439,-0.539893
6,-0.655097,-0.844486,-0.059065


In [56]:
# interpolation methods
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,-0.353015,-0.315459,0.239995
1,1.674924,0.184732,-2.165036
2,1.30331,0.44369,-1.010638
3,0.850334,-0.415808,0.085867
4,-0.383253,-1.942323,0.511535
5,0.938564,-1.01619,0.647351


In [58]:
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.353015,-0.315459,0.239995
1,1.674924,0.184732,-2.165036
2,1.30331,,-1.010638
3,0.850334,,0.085867
4,-0.383253,,
5,0.938564,,


In [59]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.353015,-0.315459,0.239995
1,1.674924,0.184732,-2.165036
2,1.30331,0.184732,-1.010638
3,0.850334,0.184732,0.085867
4,-0.383253,0.184732,0.085867
5,0.938564,0.184732,0.085867


In [32]:
# Removing Duplicates
df = pd.DataFrame([[1,3],
                  [3,4],
                  [1,3],
                  [5,7],
                  [6,9],
                  [1,6],
                  [1,3]])
                
df

Unnamed: 0,0,1
0,1,3
1,3,4
2,1,3
3,5,7
4,6,9
5,1,6
6,1,3


In [33]:
df.duplicated()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
dtype: bool

In [34]:
# drop_duplicates returns a DataFrame where the duplicated array is False:
df.drop_duplicates()

Unnamed: 0,0,1
0,1,3
1,3,4
3,5,7
4,6,9
5,1,6


In [65]:
# duplicated and drop_duplicates by default keep the first observed value 
# combination.Passing keep='last' will return the last one:
df.drop_duplicates(keep='last')

Unnamed: 0,0,1
1,3,4
3,5,7
4,6,9
5,1,6
6,1,3
