***PANDAS TUTORIAL***

In [1]:
#series in pandas

In [2]:
import numpy as np
import pandas as pd

In [7]:
# Sample inputs reused by the Series examples below:
# the same three values as a list, a NumPy array and a dict keyed by label.
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]
arr = np.asarray(my_data)
d = dict(zip(labels, my_data))

In [8]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [9]:
# in the above code we haven't indexed our data, so by default the index starts from 0 and counts up

In [10]:
#we can define our series using our own indexes,
pd.Series(data = my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [11]:
#as we are passing the arguments in order (data, index) we can skip the keyword names
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [12]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [13]:
#dictionaries are automatically indexed using their keys

In [14]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [15]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [16]:
#series can hold any kind of objects as its elements

In [17]:
# Two series whose index labels only partly overlap ('c' vs 'e'),
# used below to show how arithmetic aligns on the index.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
ser2 = pd.Series(data=[1, 2, 5, 4], index=['a', 'b', 'e', 'd'])

In [18]:
#indexing a series works like indexing a list, but instead of a position number we use our own index labels (e.g. ser1['a'])

In [19]:
ser1 + ser2

a    2.0
b    4.0
c    NaN
d    8.0
e    NaN
dtype: float64

In [21]:
#we observe a few things:
#wherever an index label has no match in the other series, the result is NaN (null)
#and pandas automatically converted the integers to floating point values, because NaN is a float

**working with dataframes**

In [22]:
#part1

In [24]:
# Seed NumPy's global random generator so this frame holds the same values
# on every Restart & Run All.
np.random.seed(101)
# 5 rows labelled A-E and 4 columns labelled W-Z, filled with uniform random values in [0, 1)
df = pd.DataFrame(np.random.rand(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,0.958645,0.163637,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961
D,0.383824,0.932538,0.850758,0.491059
E,0.168084,0.819629,0.649193,0.691899


In [25]:
#the above code is used to generate a collection of pandas series
#a dataframe is a bunch of series sharing the same index

In [28]:
df['W']
#this is a pandas series and it can also be referred to as
# dataframe.columnname

A    0.958645
B    0.747520
C    0.556018
D    0.383824
E    0.168084
Name: W, dtype: float64

In [29]:
df.W

A    0.958645
B    0.747520
C    0.556018
D    0.383824
E    0.168084
Name: W, dtype: float64

In [30]:
#attribute access is not recommended, as a column name can be confused with a DataFrame method or attribute

In [32]:
df[['W','Z']]
#a list is passed in it

Unnamed: 0,W,Z
A,0.958645,0.483112
B,0.74752,0.705384
C,0.556018,0.604961
D,0.383824,0.491059
E,0.168084,0.691899


In [34]:
#adding new columns
df['new'] = df['W'] + df['Z']
df

Unnamed: 0,W,X,Y,Z,new
A,0.958645,0.163637,0.60503,0.483112,1.441757
B,0.74752,0.850217,0.734625,0.705384,1.452904
C,0.556018,0.628938,0.21122,0.604961,1.160979
D,0.383824,0.932538,0.850758,0.491059,0.874883
E,0.168084,0.819629,0.649193,0.691899,0.859984


In [35]:
#to remove columns we use the .drop function
df.drop('new',axis =1)
#we must specify the axis because it defaults to 0,
#which refers to rows

Unnamed: 0,W,X,Y,Z
A,0.958645,0.163637,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961
D,0.383824,0.932538,0.850758,0.491059
E,0.168084,0.819629,0.649193,0.691899


In [37]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.958645,0.163637,0.60503,0.483112,1.441757
B,0.74752,0.850217,0.734625,0.705384,1.452904
C,0.556018,0.628938,0.21122,0.604961,1.160979
D,0.383824,0.932538,0.850758,0.491059,0.874883
E,0.168084,0.819629,0.649193,0.691899,0.859984


In [38]:
#we see that the original dataframe is not edited; this is done so that we do not lose our data accidentally

In [39]:
#to actually make it happen we set inplace to true
df.drop('new',axis = 1,inplace=True)

In [40]:
df

Unnamed: 0,W,X,Y,Z
A,0.958645,0.163637,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961
D,0.383824,0.932538,0.850758,0.491059
E,0.168084,0.819629,0.649193,0.691899


In [41]:
#to drop rows
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,0.958645,0.163637,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961
D,0.383824,0.932538,0.850758,0.491059


In [42]:
#SELECTING ROWS

In [43]:
df.loc['A']

W    0.958645
X    0.163637
Y    0.605030
Z    0.483112
Name: A, dtype: float64

In [44]:
#we see that even the rows are series

In [46]:
df.iloc[0]

W    0.958645
X    0.163637
Y    0.605030
Z    0.483112
Name: A, dtype: float64

In [47]:
#we can use iloc, the integer-position based indexer, to refer to rows

In [48]:
df.loc['A','Z']

0.4831115972822283

In [49]:
df.loc[['A','C'],['W','Y']]

Unnamed: 0,W,Y
A,0.958645,0.60503
C,0.556018,0.21122


In [50]:
df.iloc[[0,2],[1,2]]

Unnamed: 0,X,Y
A,0.163637,0.60503
C,0.628938,0.21122


In [51]:
df.iloc[1:3,0:2]

Unnamed: 0,W,X
B,0.74752,0.850217
C,0.556018,0.628938


In [52]:
#part2

In [53]:
#conditional selection and multi-index


In [57]:
df > 0.2

Unnamed: 0,W,X,Y,Z
A,True,False,True,True
B,True,True,True,True
C,True,True,True,True
D,True,True,True,True
E,False,True,True,True


In [58]:
df[df>0.2]

Unnamed: 0,W,X,Y,Z
A,0.958645,,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961
D,0.383824,0.932538,0.850758,0.491059
E,,0.819629,0.649193,0.691899


In [59]:
#the above mentioned method is not used so frequently

In [60]:
#we usually filter out rows using the following method
df['W'] > 0.3

A     True
B     True
C     True
D     True
E    False
Name: W, dtype: bool

In [61]:
df[df['W']>0.3]

Unnamed: 0,W,X,Y,Z
A,0.958645,0.163637,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961
D,0.383824,0.932538,0.850758,0.491059


In [64]:
#the result is also a dataframe
#so we can say
resultdf = df[df['W']>0.4]
resultdf

Unnamed: 0,W,X,Y,Z
A,0.958645,0.163637,0.60503,0.483112
B,0.74752,0.850217,0.734625,0.705384
C,0.556018,0.628938,0.21122,0.604961


In [65]:
#now using result df we can also do dataframe operations on it and access the elements
resultdf['W']

A    0.958645
B    0.747520
C    0.556018
Name: W, dtype: float64

In [66]:
#this  is usually done in a single line

In [67]:
# Filter the rows and pick the column in one .loc call instead of chaining
# df[mask]['W']: a single indexing step returns the same Series and avoids
# the chained-indexing pitfalls if this pattern is later used for assignment.
df.loc[df['W']>0.4, 'W']

A    0.958645
B    0.747520
C    0.556018
Name: W, dtype: float64

In [69]:
#to get multiple conditions satisfied
# Fresh random frame for the multi-condition examples; seeded so the rows
# selected by the filters below are the same on every Restart & Run All.
np.random.seed(102)
df = pd.DataFrame(np.random.rand(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,0.466177,0.708285,0.850859,0.841486
B,0.655583,0.150523,0.880893,0.65457
C,0.033201,0.426216,0.137307,0.264795
D,0.644548,0.772184,0.473036,0.905179
E,0.700485,0.299336,0.750651,0.66865


In [71]:
# Python's `and` operator needs a single True/False value from each operand,
# but each comparison here produces a whole Series of booleans, so pandas
# raises a ValueError. The error is the point of this cell, so we catch it
# and print it; that way the notebook still runs from top to bottom.
try:
    df[(df['W']>0.3) and (df['Z']>0.4)]
except ValueError as err:
    print('ValueError:', err)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [74]:
#so instead of using the and operator we use the & sign
df[(df['W']>0.3) & (df['Z']>0.5)]

Unnamed: 0,W,X,Y,Z
A,0.466177,0.708285,0.850859,0.841486
B,0.655583,0.150523,0.880893,0.65457
D,0.644548,0.772184,0.473036,0.905179
E,0.700485,0.299336,0.750651,0.66865


In [77]:
df[(df['W']>0.9) | (df['Z']<0.5)]

Unnamed: 0,W,X,Y,Z
C,0.033201,0.426216,0.137307,0.264795


In [78]:
#to reset index to a numerical value we can use the following function
#we should remember that this function is not inplace

In [79]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.466177,0.708285,0.850859,0.841486
1,B,0.655583,0.150523,0.880893,0.65457
2,C,0.033201,0.426216,0.137307,0.264795
3,D,0.644548,0.772184,0.473036,0.905179
4,E,0.700485,0.299336,0.750651,0.66865


In [80]:
# State codes, one per row of df, written as a single whitespace-separated string
state_codes = 'KAR UP MP PUN AP'
newind = state_codes.split()

In [81]:
df['state'] = newind

In [82]:
df

Unnamed: 0,W,X,Y,Z,state
A,0.466177,0.708285,0.850859,0.841486,KAR
B,0.655583,0.150523,0.880893,0.65457,UP
C,0.033201,0.426216,0.137307,0.264795,MP
D,0.644548,0.772184,0.473036,0.905179,PUN
E,0.700485,0.299336,0.750651,0.66865,AP


In [83]:
df.set_index('state')

Unnamed: 0_level_0,W,X,Y,Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KAR,0.466177,0.708285,0.850859,0.841486
UP,0.655583,0.150523,0.880893,0.65457
MP,0.033201,0.426216,0.137307,0.264795
PUN,0.644548,0.772184,0.473036,0.905179
AP,0.700485,0.299336,0.750651,0.66865


In [84]:
#we should remember that set_index overwrites the previous index;
#the old index is discarded, it is not kept as a new column

In [85]:
#again, this is not inplace, so df is still the same
df

Unnamed: 0,W,X,Y,Z,state
A,0.466177,0.708285,0.850859,0.841486,KAR
B,0.655583,0.150523,0.880893,0.65457,UP
C,0.033201,0.426216,0.137307,0.264795,MP
D,0.644548,0.772184,0.473036,0.905179,PUN
E,0.700485,0.299336,0.750651,0.66865,AP


In [86]:
df.set_index('state', inplace=True)

In [87]:
df

Unnamed: 0_level_0,W,X,Y,Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KAR,0.466177,0.708285,0.850859,0.841486
UP,0.655583,0.150523,0.880893,0.65457
MP,0.033201,0.426216,0.137307,0.264795
PUN,0.644548,0.772184,0.473036,0.905179
AP,0.700485,0.299336,0.750651,0.66865


In [88]:
#part3

In [89]:
#multiindex and index hierarchy