# PANDAS


### What are DataFrames and Series?
A DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure.<br>
A Series is a one-dimensional labelled array capable of holding data of any type.

In [1]:
import pandas as pd
import numpy as np

# A Series built from a plain list; np.nan marks the one missing entry.
s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, 8, 9])
s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    NaN
7    8.0
8    9.0
dtype: float64

In [26]:
# Six consecutive calendar days starting on 2020-03-01.
d = pd.date_range(start='20200301', periods=6)
d

DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04',
               '2020-03-05', '2020-03-06'],
              dtype='datetime64[ns]', freq='D')

In [28]:
# 6x3 frame of random normal draws: one row per date in d, columns A, B and C.
# No seed is set, so the values differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 3),
    index=d,
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-02,-1.674792,-0.722733,-1.041686
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074
2020-03-05,-1.342561,1.334975,-0.910149
2020-03-06,0.658922,0.808265,0.585609


In [22]:
# Build a DataFrame from a dict of columns. Scalar values (B and F) are
# repeated for every row, and each column keeps its own dtype.
df1 = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': pd.Timestamp('20200301'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([5] * 4, dtype='int32'),
        'E': pd.Categorical(['true', 'false', 'false', 'true']),
        'F': 'Amitabh',
    }
)
df1

Unnamed: 0,A,B,C,D,E,F
0,1,2020-03-01,1.0,5,True,Amitabh
1,2,2020-03-01,1.0,5,False,Amitabh
2,3,2020-03-01,1.0,5,False,Amitabh
3,4,2020-03-01,1.0,5,True,Amitabh


In [23]:
# Per-column dtypes of df (all float64 for the random frame).
# NOTE(review): this follows the df1 cell; df1.dtypes may have been intended to show mixed dtypes — confirm.
df.dtypes

A    float64
B    float64
C    float64
dtype: object

### View Data

In [29]:
# First three rows.
df.head(3)

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-02,-1.674792,-0.722733,-1.041686
2020-03-03,1.612146,-0.541288,0.405522


In [30]:
# Last two rows.
df.tail(2)

Unnamed: 0,A,B,C
2020-03-05,-1.342561,1.334975,-0.910149
2020-03-06,0.658922,0.808265,0.585609


In [31]:
# Row labels: the DatetimeIndex that was passed in as d.
df.index

DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04',
               '2020-03-05', '2020-03-06'],
              dtype='datetime64[ns]', freq='D')

In [32]:
# Column labels.
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [33]:
# The frame's values as a plain NumPy array (row and column labels are dropped).
df.to_numpy()

array([[ 0.42683423, -0.9592859 , -0.613088  ],
       [-1.67479242, -0.72273338, -1.04168631],
       [ 1.6121459 , -0.54128812,  0.40552172],
       [ 1.63704148,  1.49002498, -0.09207375],
       [-1.34256084,  1.3349748 , -0.91014853],
       [ 0.65892155,  0.80826462,  0.58560918]])

In [35]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,0.219598,0.234993,-0.277644
std,1.42922,1.100896,0.6845
min,-1.674792,-0.959286,-1.041686
25%,-0.900212,-0.677372,-0.835883
50%,0.542878,0.133488,-0.352581
75%,1.37384,1.203297,0.281123
max,1.637041,1.490025,0.585609


In [36]:
# axis=1 sorts by column label; the columns are already in A, B, C order, so nothing moves.
df.sort_index(axis=1,ascending=True)

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-02,-1.674792,-0.722733,-1.041686
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074
2020-03-05,-1.342561,1.334975,-0.910149
2020-03-06,0.658922,0.808265,0.585609


In [39]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by='B',ascending=True)

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-02,-1.674792,-0.722733,-1.041686
2020-03-03,1.612146,-0.541288,0.405522
2020-03-06,0.658922,0.808265,0.585609
2020-03-05,-1.342561,1.334975,-0.910149
2020-03-04,1.637041,1.490025,-0.092074


In [41]:
# Selecting a single column by name returns a Series.
df['B']

2020-03-01   -0.959286
2020-03-02   -0.722733
2020-03-03   -0.541288
2020-03-04    1.490025
2020-03-05    1.334975
2020-03-06    0.808265
Freq: D, Name: B, dtype: float64

In [46]:
# An integer slice on the frame selects rows by position: here the first four rows.
df[0:4]

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-02,-1.674792,-0.722733,-1.041686
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074


In [45]:
# Rows at positions 2 and 3 (the end of the slice is exclusive).
df[2:4]

Unnamed: 0,A,B,C
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074


In [47]:
# Label-based lookup: the row for the first date in d, returned as a Series.
df.loc[d[0]]

A    0.426834
B   -0.959286
C   -0.613088
Name: 2020-03-01 00:00:00, dtype: float64

In [53]:
df.loc[::2,['B','C']]  # ::2 -> every second row; ['B','C'] -> only columns B and C

Unnamed: 0,B,C
2020-03-01,-0.959286,-0.613088
2020-03-03,-0.541288,0.405522
2020-03-05,1.334975,-0.910149


In [55]:
# Scalar lookup by label: row d[2] (2020-03-03), column 'C'.
df.at[d[2],'C']

0.40552172395240743

In [58]:
# Position-based selection with iloc (0-based integer positions, not labels)
df.iloc[4]  # 5th row, returned as a Series

A   -1.342561
B    1.334975
C   -0.910149
Name: 2020-03-05 00:00:00, dtype: float64

In [59]:
df.iloc[4,2]  # scalar at 5th row, 3rd column (column C)

-0.9101485262141855

In [60]:
df.iloc[3:5,0:2]  # 4th-5th rows (positions 3 and 4), first two columns (A and B)

Unnamed: 0,A,B
2020-03-04,1.637041,1.490025
2020-03-05,-1.342561,1.334975


In [64]:
# Boolean indexing: keep only the rows where column A is greater than 0
df[df['A']>0]

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074
2020-03-06,0.658922,0.808265,0.585609


In [65]:
# Same boolean filter with a stricter threshold: rows where column A is greater than 1.
df[df['A']>1]

Unnamed: 0,A,B,C
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074


### Handling Missing data 

In [70]:
# Reindex onto the same dates plus a new column D (all NaN at first),
# then set D to 5 for the first three dates only, leaving the rest missing.
df2 = df.reindex(index=d[:6], columns=[*df.columns, 'D'])
df2.loc[d[0]:d[2], 'D'] = 5
df2

Unnamed: 0,A,B,C,D
2020-03-01,0.426834,-0.959286,-0.613088,5.0
2020-03-02,-1.674792,-0.722733,-1.041686,5.0
2020-03-03,1.612146,-0.541288,0.405522,5.0
2020-03-04,1.637041,1.490025,-0.092074,
2020-03-05,-1.342561,1.334975,-0.910149,
2020-03-06,0.658922,0.808265,0.585609,


In [71]:
df2.isna()  # boolean mask, True where a value is missing; df2.isnull() is equivalent

Unnamed: 0,A,B,C,D
2020-03-01,False,False,False,False
2020-03-02,False,False,False,False
2020-03-03,False,False,False,False
2020-03-04,False,False,False,True
2020-03-05,False,False,False,True
2020-03-06,False,False,False,True


In [73]:
# Summing the boolean mask gives the number of missing values in each column.
df2.isnull().sum()

A    0
B    0
C    0
D    3
dtype: int64

In [76]:
# Show df2 with missing values replaced by 2; the result is not assigned, so df2 keeps its NaNs.
df2.fillna(value=2)

Unnamed: 0,A,B,C,D
2020-03-01,0.426834,-0.959286,-0.613088,5.0
2020-03-02,-1.674792,-0.722733,-1.041686,5.0
2020-03-03,1.612146,-0.541288,0.405522,5.0
2020-03-04,1.637041,1.490025,-0.092074,2.0
2020-03-05,-1.342561,1.334975,-0.910149,2.0
2020-03-06,0.658922,0.808265,0.585609,2.0


### Pandas Operations
#### 1. Descriptive statistics operations
#### 2. Applying functions to data
#### 3. String processing operations
#### 4. Histogramming


In [77]:
df.mean()  # default axis=0: one mean per column

A    0.219598
B    0.234993
C   -0.277644
dtype: float64

In [80]:
df.mean(1)  # axis=1: one mean per row (i.e. per date)

2020-03-01   -0.381847
2020-03-02   -1.146404
2020-03-03    0.492127
2020-03-04    1.011664
2020-03-05   -0.305912
2020-03-06    0.684265
Freq: D, dtype: float64

In [83]:
# shift(2) moves the values down two positions along the index: the first two
# dates become NaN and the last two original values (4 and 5) drop off the end.
s = pd.Series([1,2,3,np.nan,4,5], index=d).shift(2) #shift val 2 places. start from 3rd
s

2020-03-01    NaN
2020-03-02    NaN
2020-03-03    1.0
2020-03-04    2.0
2020-03-05    3.0
2020-03-06    NaN
Freq: D, dtype: float64

In [87]:
# Subtract s from every column, matching on the row index; rows where s is NaN come out as NaN.
df.sub(s,axis='index')

Unnamed: 0,A,B,C
2020-03-01,,,
2020-03-02,,,
2020-03-03,0.612146,-1.541288,-0.594478
2020-03-04,-0.362959,-0.509975,-2.092074
2020-03-05,-4.342561,-1.665025,-3.910149
2020-03-06,,,


In [88]:
# Redisplay df: the values are the same as when it was created.
df

Unnamed: 0,A,B,C
2020-03-01,0.426834,-0.959286,-0.613088
2020-03-02,-1.674792,-0.722733,-1.041686
2020-03-03,1.612146,-0.541288,0.405522
2020-03-04,1.637041,1.490025,-0.092074
2020-03-05,-1.342561,1.334975,-0.910149
2020-03-06,0.658922,0.808265,0.585609


In [89]:
# Apply np.absolute to each column, giving the absolute value of every entry.
df.apply(np.absolute)

Unnamed: 0,A,B,C
2020-03-01,0.426834,0.959286,0.613088
2020-03-02,1.674792,0.722733,1.041686
2020-03-03,1.612146,0.541288,0.405522
2020-03-04,1.637041,1.490025,0.092074
2020-03-05,1.342561,1.334975,0.910149
2020-03-06,0.658922,0.808265,0.585609


### String functions

In [133]:

# The .str accessor applies string methods element-wise; the NaN entry stays NaN.
s = pd.Series(data=['amitabh', 'satish', 'pravir', np.nan, 'football'])
x = s.str.upper()
x

0     AMITABH
1      SATISH
2      PRAVIR
3         NaN
4    FOOTBALL
dtype: object

### Split and Merge

In [93]:
# The lambda receives one column at a time and returns its range (max minus min).
df.apply(lambda x: x.max()-x.min())

A    3.311834
B    2.449311
C    1.627295
dtype: float64

In [95]:
# 10x4 frame of random normal draws with default integer row and column labels.
ndf = pd.DataFrame(data=np.random.randn(10, 4))
ndf

Unnamed: 0,0,1,2,3
0,0.099988,-0.277864,1.052681,-0.156324
1,0.640793,-0.508209,0.759076,1.047814
2,-0.379146,-1.097745,-0.439379,-0.849361
3,0.095308,-1.069948,0.016241,0.962191
4,-1.128252,-0.034631,-0.27691,-1.403493
5,1.468052,0.777492,0.866585,1.039143
6,-0.006366,0.5509,1.7649,1.529768
7,1.892595,1.178432,0.117499,1.165538
8,-0.901056,1.524994,0.970901,-0.621051
9,-1.808442,0.196605,1.483328,1.351339


In [99]:
# Cut ndf into three consecutive row blocks: rows 0-2, rows 3-6 and rows 7-9.
ndf2 = [ndf.iloc[:3], ndf.iloc[3:7], ndf.iloc[7:]]
ndf2

[          0         1         2         3
 0  0.099988 -0.277864  1.052681 -0.156324
 1  0.640793 -0.508209  0.759076  1.047814
 2 -0.379146 -1.097745 -0.439379 -0.849361,
           0         1         2         3
 3  0.095308 -1.069948  0.016241  0.962191
 4 -1.128252 -0.034631 -0.276910 -1.403493
 5  1.468052  0.777492  0.866585  1.039143
 6 -0.006366  0.550900  1.764900  1.529768,
           0         1         2         3
 7  1.892595  1.178432  0.117499  1.165538
 8 -0.901056  1.524994  0.970901 -0.621051
 9 -1.808442  0.196605  1.483328  1.351339]

In [100]:
# Join the pieces in ndf2 back together row-wise; the result has the same 10 rows as ndf.
pd.concat(ndf2)

Unnamed: 0,0,1,2,3
0,0.099988,-0.277864,1.052681,-0.156324
1,0.640793,-0.508209,0.759076,1.047814
2,-0.379146,-1.097745,-0.439379,-0.849361
3,0.095308,-1.069948,0.016241,0.962191
4,-1.128252,-0.034631,-0.27691,-1.403493
5,1.468052,0.777492,0.866585,1.039143
6,-0.006366,0.5509,1.7649,1.529768
7,1.892595,1.178432,0.117499,1.165538
8,-0.901056,1.524994,0.970901,-0.621051
9,-1.808442,0.196605,1.483328,1.351339


In [107]:
# Left-hand frame for the merge example: key column A plus a value column B.
left = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
left

Unnamed: 0,A,B
0,1,3
1,2,4


In [111]:
# Right-hand frame for the merge example: key column A plus a value column C.
right = pd.DataFrame(data={'A': [4, 2], 'C': [6, 5]})
right

Unnamed: 0,A,C
0,4,6
1,2,5


In [112]:
# Join the two frames on column A; only A == 2 occurs in both, so the result has one row.
pd.merge(left,right, on='A')

Unnamed: 0,A,B,C
0,2,4,5


In [113]:
# Redisplay ndf, the random frame used by the group-by and stack examples.
ndf

Unnamed: 0,0,1,2,3
0,0.099988,-0.277864,1.052681,-0.156324
1,0.640793,-0.508209,0.759076,1.047814
2,-0.379146,-1.097745,-0.439379,-0.849361
3,0.095308,-1.069948,0.016241,0.962191
4,-1.128252,-0.034631,-0.27691,-1.403493
5,1.468052,0.777492,0.866585,1.039143
6,-0.006366,0.5509,1.7649,1.529768
7,1.892595,1.178432,0.117499,1.165538
8,-0.901056,1.524994,0.970901,-0.621051
9,-1.808442,0.196605,1.483328,1.351339


### Group by

In [115]:
# Group the rows by the value in column 2 and sum the remaining columns.
# Every value in column 2 is distinct here, so each group holds exactly one row.
ndf.groupby(2).sum()

Unnamed: 0_level_0,0,1,3
2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.439379,-0.379146,-1.097745,-0.849361
-0.27691,-1.128252,-0.034631,-1.403493
0.016241,0.095308,-1.069948,0.962191
0.117499,1.892595,1.178432,1.165538
0.759076,0.640793,-0.508209,1.047814
0.866585,1.468052,0.777492,1.039143
0.970901,-0.901056,1.524994,-0.621051
1.052681,0.099988,-0.277864,-0.156324
1.483328,-1.808442,0.196605,1.351339
1.7649,-0.006366,0.5509,1.529768


In [117]:
# Group by the pair (column 2, column 3); the result has a two-level row index
# and sums of the remaining columns 0 and 1.
ndf.groupby([2,3]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
2,3,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.439379,-0.849361,-0.379146,-1.097745
-0.27691,-1.403493,-1.128252,-0.034631
0.016241,0.962191,0.095308,-1.069948
0.117499,1.165538,1.892595,1.178432
0.759076,1.047814,0.640793,-0.508209
0.866585,1.039143,1.468052,0.777492
0.970901,-0.621051,-0.901056,1.524994
1.052681,-0.156324,0.099988,-0.277864
1.483328,1.351339,-1.808442,0.196605
1.7649,1.529768,-0.006366,0.5509


### Stack and pivot table 

In [143]:
# stack() moves the columns into an inner index level, giving one long Series
# keyed by (row label, column label).
a =ndf.stack()
a.head(6)

0  0    0.099988
   1   -0.277864
   2    1.052681
   3   -0.156324
1  0    0.640793
   1   -0.508209
dtype: float64

In [145]:
# unstack() moves the inner index level back into columns, undoing stack(); show the first two rows.
a.unstack().head(2)

Unnamed: 0,0,1,2,3
0,0.099988,-0.277864,1.052681,-0.156324
1,0.640793,-0.508209,0.759076,1.047814


In [150]:
# Pivot table of column 0 indexed by columns 1, 2 and 3, using the default
# aggregation (mean). The value column must not be one of the index keys:
# the earlier values=2 clashed with index=[1, 2, 3], and the recorded output
# shows column 0 being aggregated.
pd.pivot_table(ndf, values=0, index=[1, 2, 3]).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
1,2,3,Unnamed: 3_level_1
-1.097745,-0.439379,-0.849361,-0.379146
-1.069948,0.016241,0.962191,0.095308
-0.508209,0.759076,1.047814,0.640793


### Time Series and Categorical Data

In [156]:
# 100 timestamps spaced one second apart, starting at 2023-01-01 00:00:00.
# The lowercase 's' alias is used because the uppercase 'S' alias for seconds
# is deprecated in recent pandas releases.
dates = pd.date_range('1/1/2023', periods=100, freq='s')
# One random integer in [0, 500) per timestamp (unseeded, so values change per run).
ts = pd.Series(np.random.randint(0, 500, len(dates)), index=dates)
ts

2023-01-01 00:00:00     28
2023-01-01 00:00:01    447
2023-01-01 00:00:02    420
2023-01-01 00:00:03    212
2023-01-01 00:00:04    128
                      ... 
2023-01-01 00:01:35     80
2023-01-01 00:01:36    461
2023-01-01 00:01:37     60
2023-01-01 00:01:38    198
2023-01-01 00:01:39     48
Freq: S, Length: 100, dtype: int64

In [154]:
# Downsample into 5-minute bins and sum each bin. The 100 one-second samples
# span under two minutes, so they all fall into a single bin.
ts.resample('5min').sum()

2023-01-01    25980
Freq: 5T, dtype: int64

In [166]:
# Attach the UTC timezone to the naive timestamps (the clock times stay the same).
ts_utc = ts.tz_localize('UTC')
print(ts_utc.head())
# Express the same instants in the Asia/Kolkata timezone (shown as +05:30).
conv = ts_utc.tz_convert('Asia/Kolkata')
print(conv.head())


2023-01-01 00:00:00+00:00     28
2023-01-01 00:00:01+00:00    447
2023-01-01 00:00:02+00:00    420
2023-01-01 00:00:03+00:00    212
2023-01-01 00:00:04+00:00    128
Freq: S, dtype: int64
2023-01-01 05:30:00+05:30     28
2023-01-01 05:30:01+05:30    447
2023-01-01 05:30:02+05:30    420
2023-01-01 05:30:03+05:30    212
2023-01-01 05:30:04+05:30    128
Freq: S, dtype: int64


In [178]:
# Categorical data: start from a small frame of ids and letter grades.
cdf = pd.DataFrame(
    {
        'id': [1, 2, 3, 4, 5, 6],
        'grade': ['F', 'E', 'D', 'C', 'B', 'A'],
    }
)
cdf

Unnamed: 0,id,grade
0,1,F
1,2,E
2,3,D
3,4,C
4,5,B
5,6,A


In [182]:
# Derive a categorical "Grade" column from the letter grades and relabel its
# categories with descriptive names.
#
# The earlier chained assignment (cdf["Grade"] = cdf["grade"].categories = [...])
# stored a plain list in "Grade", discarding the categorical dtype, and set a
# stray `categories` attribute on the object-dtype "grade" column. It only
# looked right because the list happened to have one label per row, in row order.
#
# A dict is used for the renaming so each letter maps to its label explicitly,
# independent of how the categories are ordered.
cdf["Grade"] = cdf["grade"].astype("category").cat.rename_categories(
    {
        "F": "very bad",
        "E": "bad",
        "D": "medium",
        "C": "good",
        "B": "very good",
        "A": "excellent",
    }
)
cdf

Unnamed: 0,id,grade,Grade
0,1,F,very bad
1,2,E,bad
2,3,D,medium
3,4,C,good
4,5,B,very good
5,6,A,excellent
