# Pandas

In [16]:
import numpy as np

In [18]:
import pandas as pd

In [20]:
# Sample inputs reused by the Series examples: index labels, raw values,
# and the same label -> value pairs as a dict.
labels = ['a', 'b', 'c']
my_data = [1, 2, 3]
d = dict(zip(labels, my_data))

In [22]:
# Wrap the plain Python list in a NumPy array.
arr = np.array(my_data)

In [28]:
# Show the array built from my_data.
arr

array([1, 2, 3])

# Series

In [26]:
# Series from a list: with no index given, the values are labelled 0, 1, 2.
pd.Series(data = my_data)

0    1
1    2
2    3
dtype: int64

In [31]:
# Same values, now labelled with the strings in `labels` instead of positions.
pd.Series(data = my_data, index = labels)

a    1
b    2
c    3
dtype: int64

In [33]:
# A dict converts directly: its keys become the index and its values the data.
pd.Series(d)

a    1
b    2
c    3
dtype: int64

In [35]:
# Show the source dict for comparison with the Series built from it.
d

{'a': 1, 'b': 2, 'c': 3}

In [37]:
# Show the list that was used as the Series index.
labels

['a', 'b', 'c']

In [41]:
# A Series can hold arbitrary Python objects (here built-in functions);
# the resulting dtype is `object`.
pd.Series(data = [sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [51]:
# First labelled Series; shares 'USA' and 'India' with sr2 but not 'Italy'.
sr1 = pd.Series(data=[1, 2, 3], index=['USA', 'Italy', 'India'])

In [53]:
# Display the first Series.
sr1

USA      1
Italy    2
India    3
dtype: int64

In [55]:
# Second labelled Series; differs from sr1 only in the middle label ('USSR').
sr2 = pd.Series(data=[1, 2, 3], index=['USA', 'USSR', 'India'])

In [57]:
# Display the second Series.
sr2

USA      1
USSR     2
India    3
dtype: int64

In [59]:
# Addition aligns on index labels, not position: labels present in only one
# Series (Italy, USSR) produce NaN, and the result dtype becomes float64.
sr1 + sr2

India    6.0
Italy    NaN
USA      2.0
USSR     NaN
dtype: float64

# DataFrames

In [62]:
from numpy.random import randn

In [64]:
# Seed NumPy's global random generator so the randn draws are repeatable.
np.random.seed(101)

In [66]:
# 5x4 frame of standard-normal draws with row labels A-E and column labels W-Z.
row_labels = ['A', 'B', 'C', 'D', 'E']
col_labels = ['W', 'X', 'Y', 'Z']
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=col_labels)

In [68]:
# Display the freshly built frame.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [70]:
# Bracket selection with a single column label returns that column as a Series.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [72]:
# Confirm that a single selected column is a Series.
type(df['W'])

pandas.core.series.Series

In [74]:
# The whole table, by contrast, is a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [76]:
# Passing a list of labels selects several columns at once.
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [80]:
# Assigning to a label that does not exist yet creates the column;
# here it holds the row-wise sum of W and Z.
df['new'] = df['W'] + df['Z']

In [82]:
# df now carries the extra 'new' column.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


In [86]:
# axis=1 targets a column. Without inplace=True this returns a new frame
# and leaves df itself untouched.
df.drop('new', axis = 1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [88]:
# df still contains 'new' because the preceding drop was not in place.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


In [90]:
# Remove 'new' from df itself. inplace=True mutates df and returns None,
# so running this cell a second time raises KeyError ('new' is already gone).
df.drop(columns='new', inplace=True)

In [92]:
# 'new' is gone from df after the in-place drop.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


# Selecting rows

In [97]:
# .loc selects a row by its index label and returns it as a Series.
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [99]:
# .iloc selects by integer position; position 2 is the same row as label 'C'.
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [101]:
# .loc[row, column] with two single labels returns one scalar value.
df.loc['C','Y']

0.5288134940893595

In [105]:
# Lists of row labels and column labels select a sub-table.
df.loc[['A', 'B'], ['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965


# Conditional selection

In [108]:
# Element-wise comparison yields a frame of booleans with the same labels.
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [110]:
# Keep the boolean mask under a name so it can be reused.
booldf = df > 0

In [112]:
# Display the stored mask.
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [114]:
# Indexing with a boolean frame keeps values where the mask is True
# and fills the remaining cells with NaN.
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [116]:
# The mask itself can be indexed by the same condition: its False cells become NaN.
booldf[df>0]

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,,,True
C,,True,True,
D,True,,,True
E,True,True,True,True


In [118]:
# Same result as df[booldf], with the condition written inline.
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [120]:
# Comparing a single column gives a boolean Series with one entry per row.
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [122]:
# A boolean Series filters whole rows: only rows with W > 0 are kept (row C drops out).
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [124]:
# Store the filtered rows so a column can be taken from them in a second step.
resultdf = df[df['W'] > 0]

In [126]:
# Display the filtered frame.
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [128]:
# Take column X from the filtered rows.
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [130]:
# Filter rows and pick column X in one .loc call rather than chaining
# df[mask]['X']. The resulting Series is the same, but no intermediate
# filtered frame is built and the form stays safe if later used for assignment.
df.loc[df['W'] > 0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [132]:
# Filter rows and pick columns X and Y in one .loc call rather than chaining
# df[mask][['X', 'Y']]. The resulting frame is the same, but no intermediate
# filtered frame is built and the form stays safe if later used for assignment.
df.loc[df['W'] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


In [134]:
# Rows where W is positive.
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [136]:
# Combine two row conditions with &. Each comparison needs its own parentheses
# because & binds more tightly than > and <=.
df[(df['W'] > 0) & (df['Y'] <=1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [138]:
df[(df['W'] > 0) & (df['Y'] >1)] # Python's `and`/`or` keywords do not work element-wise on Series; use '&' for and, '|' for or

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [140]:
# The filters returned new objects; df itself is unchanged.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [142]:
# Split one whitespace-separated string into the five country codes.
country_codes = 'IND USA RUS BAN PAK'
newid = country_codes.split()

In [144]:
# Show the resulting list of codes.
newid

['IND', 'USA', 'RUS', 'BAN', 'PAK']

In [146]:
# Add the codes as a regular column; the list has one entry per row of df.
df['States'] = newid

In [148]:
# df now has a 'States' column alongside W-Z.
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,IND
B,0.651118,-0.319318,-0.848077,0.605965,USA
C,-2.018168,0.740122,0.528813,-0.589001,RUS
D,0.188695,-0.758872,-0.933237,0.955057,BAN
E,0.190794,1.978757,2.605967,0.683509,PAK


In [150]:
# set_index returns a new frame whose row labels come from 'States';
# df itself is not modified by this call.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IND,2.70685,0.628133,0.907969,0.503826
USA,0.651118,-0.319318,-0.848077,0.605965
RUS,-2.018168,0.740122,0.528813,-0.589001
BAN,0.188695,-0.758872,-0.933237,0.955057
PAK,0.190794,1.978757,2.605967,0.683509


In [152]:
# df still has its A-E index and the 'States' column.
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,IND
B,0.651118,-0.319318,-0.848077,0.605965,USA
C,-2.018168,0.740122,0.528813,-0.589001,RUS
D,0.188695,-0.758872,-0.933237,0.955057,BAN
E,0.190794,1.978757,2.605967,0.683509,PAK


In [154]:
# df is unchanged: set_index returned a new DataFrame because inplace=False is the default.

In [158]:
# Build a two-level row index: the outer level is the group label,
# the inner level is the number within that group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [160]:
# Outer-level labels, one per row.
outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [162]:
# Inner-level labels, one per row.
inside

[1, 2, 3, 1, 2, 3]

In [170]:
# Step 1, shown on its own: pair each outer label with its inner label,
# giving one (group, number) tuple per row.
hier_index = list(zip(outside, inside))
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [172]:
# Step 2: turn the list of tuples into a two-level MultiIndex.
# Note that hier_index is rebound from a list to a MultiIndex here.
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [180]:
# 6x2 frame of standard-normal draws on the two-level index; this rebinds df.
# randn uses NumPy's global random state, so re-running this cell on its own
# produces different values each time.
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])

In [176]:
# Display the frame with its two (still unnamed) index levels.
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [186]:
# Name the two index levels so they appear as headers above the row labels.
df.index.names = ['Groups', 'Num']

In [188]:
# Display the frame with the level names in place.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-1.136645,0.000366
G1,2,1.025984,-0.156598
G1,3,-0.031579,0.649826
G2,1,2.154846,-0.610259
G2,2,-0.755325,-0.346419
G2,3,0.147027,-0.479448


In [198]:
# Selecting an outer-level label returns that group's rows, indexed by the inner level.
df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.136645,0.000366
2,1.025984,-0.156598
3,-0.031579,0.649826


In [200]:
# Chain a second .loc to pick inner label 2 within group G1; the result is a Series.
df.loc['G1'].loc[2]

A    1.025984
B   -0.156598
Name: 2, dtype: float64