In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
# A list mixing int, float, str and bool: no single numeric dtype fits all
# four elements, so the resulting Series is stored with dtype=object.
data = [1, 1.2, "string", True]
s = pd.Series(data=data)

In [3]:
s

0         1
1       1.2
2    string
3      True
dtype: object

In [4]:
# Homogeneous float input, so the Series comes out as float64.
# The bare name on the last line makes the notebook display it.
s1 = pd.Series(data=[1.2, 3.4, 5.6, 7.8, 13.5])
s1

0     1.2
1     3.4
2     5.6
3     7.8
4    13.5
dtype: float64

In [5]:
# IPython-only introspection syntax (not plain Python): shows the pd.Series
# docstring. Handy while exploring; it leaves no cell output in this notebook.
pd.Series?

In [6]:
# Draw from a seeded, dedicated generator so that this Series -- and every
# later cell that inspects s2 or values derived from it -- comes out the same
# after Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# 26 samples from a normal distribution (mean 5, standard deviation 2.5),
# one per lowercase letter used as the index label.
data = rng.normal(5, 2.5, size=26)
letters = list("abcdefghijklmnopqrstuvwxyz")

# dtype=np.float16 downcasts the float64 samples to half precision;
# name="example" is carried along by everything derived from this Series.
s2 = pd.Series(data, index=letters, name="example", dtype=np.float16)
s2

a     5.281250
b     5.878906
c     8.382812
d     1.741211
e     4.519531
f     8.976562
g     7.488281
h     6.320312
i    10.328125
j     3.279297
k    -2.246094
l     5.300781
m     1.142578
n     5.753906
o     8.945312
p     6.519531
q     5.453125
r     2.876953
s     5.914062
t     6.746094
u     6.027344
v     4.117188
w     5.566406
x     6.898438
y     6.257812
z     5.394531
Name: example, dtype: float16

In [7]:
# Scalar lookup by index label, spelled with the explicit label-based indexer.
s2.loc['v']

4.117

In [8]:
# Label slices include BOTH endpoints: rows 'c' through 'n' are returned.
s2.loc['c':'n']

c     8.382812
d     1.741211
e     4.519531
f     8.976562
g     7.488281
h     6.320312
i    10.328125
j     3.279297
k    -2.246094
l     5.300781
m     1.142578
n     5.753906
Name: example, dtype: float16

In [9]:
# Element-wise "less than zero" test; yields a boolean Series on the same index.
s2.lt(0)

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
i    False
j    False
k     True
l    False
m    False
n    False
o    False
p    False
q    False
r    False
s    False
t    False
u    False
v    False
w    False
x    False
y    False
z    False
Name: example, dtype: bool

In [10]:
# Boolean-mask selection: keep only the entries whose value is negative.
s2.loc[s2.lt(0)]

k   -2.246094
Name: example, dtype: float16

In [17]:
# Apply NumPy's exponential to every element of s2. The index labels and the
# Series name are carried over, and the result keeps s2's float16 dtype.
s3 = s2.pipe(np.exp)
s3

a      196.625000
b      357.500000
c     4372.000000
d        5.703125
e       91.812500
f     7916.000000
g     1787.000000
h      555.500000
i    30576.000000
j       26.562500
k        0.105835
l      200.500000
m        3.134766
n      315.500000
o     7672.000000
p      678.500000
q      233.500000
r       17.765625
s      370.250000
t      850.500000
u      414.500000
v       61.375000
w      261.500000
x      990.500000
y      522.000000
z      220.250000
Name: example, dtype: float16

In [12]:
# Membership on a Series tests the index labels, not the stored values.
'a' in s2.index

True

In [13]:
# Labels are case-sensitive: the index holds lowercase letters only.
'A' in s2.index

False

In [18]:
# Scalar lookup of the entry labelled 'x' via the label-based indexer.
s3.loc['x']

990.5

In [19]:
# Build the equality mask from the value actually stored under 'x' instead of
# a literal pasted from an earlier output: s2 is drawn at random, so a
# hard-coded 990.5 stops matching any element as soon as the notebook is re-run.
s3 == s3['x']

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
i    False
j    False
k    False
l    False
m    False
n    False
o    False
p    False
q    False
r    False
s    False
t    False
u    False
v    False
w    False
x     True
y    False
z    False
Name: example, dtype: bool

In [20]:
# Select the entries equal to the value stored under 'x'. Comparing against
# s3['x'] (rather than a literal copied from one run's output, 990.5) keeps the
# selection non-empty when the random input data changes on a re-run.
s3[s3 == s3['x']]

x    990.5
Name: example, dtype: float16

## DataFrame

In [21]:
# Column-oriented input: each key becomes a column, each list that column's
# values. All three lists have seven entries, giving a 7-row frame.
data = dict(
    state=["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Virginia", "Virginia"],
    year=[2000, 2001, 2002, 2000, 2001, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 4.5],
)

df = pd.DataFrame.from_dict(data)

In [22]:
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2000,2.4
4,Nevada,2001,2.9
5,Virginia,2001,3.2
6,Virginia,2002,4.5


In [24]:
# Asking for more rows than the frame holds simply returns all of them.
df.head(10)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2000,2.4
4,Nevada,2001,2.9
5,Virginia,2001,3.2
6,Virginia,2002,4.5


In [25]:
# Last five rows; n=5 is the default, written out here for clarity.
df.tail(n=5)

Unnamed: 0,state,year,pop
2,Ohio,2002,3.6
3,Nevada,2000,2.4
4,Nevada,2001,2.9
5,Virginia,2001,3.2
6,Virginia,2002,4.5


In [26]:
# Only the final three rows.
df.tail(3)

Unnamed: 0,state,year,pop
4,Nevada,2001,2.9
5,Virginia,2001,3.2
6,Virginia,2002,4.5


In [28]:
# Pull a single column out as a Series: all rows, the "pop" column.
df.loc[:, "pop"]

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
6    4.5
Name: pop, dtype: float64

In [30]:
# Single cell by row label and column label; .at is the scalar-only accessor.
df.at[1, "state"]

'Ohio'

In [31]:
# Narrow to two columns first, then take row labels 1 through 6
# (label slices include the end label).
df[["state", "pop"]].loc[1:6]

Unnamed: 0,state,pop
1,Ohio,1.7
2,Ohio,3.6
3,Nevada,2.4
4,Nevada,2.9
5,Virginia,3.2
6,Virginia,4.5


In [33]:
# Row-wise equality test against "Ohio"; produces a boolean Series.
df["state"].eq("Ohio")

0     True
1     True
2     True
3    False
4    False
5    False
6    False
Name: state, dtype: bool

In [34]:
# Keep only the rows where the state column equals "Ohio".
df[df["state"].eq("Ohio")]

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [39]:
# Keep only the rows whose "pop" value is strictly below 3.
df[df["pop"].lt(3)]

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
3,Nevada,2000,2.4
4,Nevada,2001,2.9


In [40]:
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2000,2.4
4,Nevada,2001,2.9
5,Virginia,2001,3.2
6,Virginia,2002,4.5


In [41]:
# Add a "debt" column filled with the missing-value marker in every row.
# assign() returns a new frame, which is bound back to the same name.
df = df.assign(debt=pd.NA)

In [42]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
3,Nevada,2000,2.4,
4,Nevada,2001,2.9,
5,Virginia,2001,3.2,
6,Virginia,2002,4.5,


In [43]:
# Replace the "debt" column with one concrete value per row (seven in total);
# the list length has to match the number of rows.
df = df.assign(debt=[0.34, 6.5, 568.56, 4235.5, 32.5, 6.7, 7.0])

In [44]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


## Index Objects

In [45]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


In [46]:
# Move "state" and "year" out of the columns into a two-level row index.
# set_index returns a new frame; df itself is left unchanged.
df2 = df.set_index(keys=["state", "year"])

In [47]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,pop,debt
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,2000,1.5,0.34
Ohio,2001,1.7,6.5
Ohio,2002,3.6,568.56
Nevada,2000,2.4,4235.5
Nevada,2001,2.9,32.5
Virginia,2001,3.2,6.7
Virginia,2002,4.5,7.0


In [48]:
# Select on the outer index level only; the result is indexed by year.
df2.loc["Ohio", :]

Unnamed: 0_level_0,pop,debt
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,0.34
2001,1.7,6.5
2002,3.6,568.56


In [49]:
# A full (state, year) key pins down one row, returned as a Series
# whose labels are the column names.
df2.loc[("Ohio", 2001), :]

pop     1.7
debt    6.5
Name: (Ohio, 2001), dtype: float64

In [50]:
# A single column label gives back a Series that keeps the two-level index.
df2["pop"]

state     year
Ohio      2000    1.5
          2001    1.7
          2002    3.6
Nevada    2000    2.4
          2001    2.9
Virginia  2001    3.2
          2002    4.5
Name: pop, dtype: float64

In [51]:
# A one-element LIST of labels gives back a one-column DataFrame, not a Series.
df2[["pop"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,pop
state,year,Unnamed: 2_level_1
Ohio,2000,1.5
Ohio,2001,1.7
Ohio,2002,3.6
Nevada,2000,2.4
Nevada,2001,2.9
Virginia,2001,3.2
Virginia,2002,4.5


## Essential functionality

In [52]:
# Three colours on a deliberately gappy integer index (0, 2, 4); the gaps are
# what the reindexing cells that follow fill in.
s4 = pd.Series({0: "blue", 2: "purple", 4: "yellow"})
s4

0      blue
2    purple
4    yellow
dtype: object

In [53]:
# Target labels 0..5. Labels missing from s4 (1, 3, 5) are filled by carrying
# the previous existing value forward.
new_index = np.arange(0, 6)
s4.reindex(index=new_index, method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [54]:
# IPython-only introspection syntax (not plain Python): shows the docstring of
# s4.reindex. Handy while exploring; it leaves no cell output in this notebook.
s4.reindex?

In [56]:
# Forward fill: each new label takes the value of the nearest earlier label.
s4.reindex(index=new_index, method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [57]:
# Backward fill: each new label takes the value of the nearest later label.
# Label 5 has nothing after it, so it ends up NaN.
s4.reindex(index=new_index, method="bfill")

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [58]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


In [59]:
# Return a copy without the row labelled 2; df itself is not modified.
df.drop(index=2)

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


In [60]:
# Several row labels can be dropped at once by passing a list.
df.drop(index=[3, 4, 6])

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
5,Virginia,2001,3.2,6.7


In [61]:
# Drop along the column axis; columns= states the intent without axis=1.
df.drop(columns="state")

Unnamed: 0,year,pop,debt
0,2000,1.5,0.34
1,2001,1.7,6.5
2,2002,3.6,568.56
3,2000,2.4,4235.5
4,2001,2.9,32.5
5,2001,3.2,6.7
6,2002,4.5,7.0


In [62]:
# Drop two columns in one call.
df.drop(columns=["state", "pop"])

Unnamed: 0,year,debt
0,2000,0.34
1,2001,6.5
2,2002,568.56
3,2000,4235.5
4,2001,32.5
5,2001,6.7
6,2002,7.0


In [64]:
# Two integer Series whose labels only partly overlap: both have 'a', 'c' and
# 'e'; 'd' exists only in s5, 'f' and 'g' only in s6.
s5 = pd.Series({'a': 1, 'c': 2, 'd': 3, 'e': 4})
s6 = pd.Series({'a': 10, 'c': 20, 'e': 30, 'f': 40, 'g': 50})

In [65]:
# Addition aligns on index labels: a label present in only one operand yields
# NaN, and the result is float64 so that it can hold those NaNs.
s5.add(s6)

a    11.0
c    22.0
d     NaN
e    34.0
f     NaN
g     NaN
dtype: float64

In [66]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


In [67]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column -- a key first look at any dataset. The quartiles listed are the
# defaults, written out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,year,pop,debt
count,7.0,7.0,7.0
mean,2001.0,2.828571,693.871429
std,0.816497,1.060997,1575.526913
min,2000.0,1.5,0.34
25%,2000.5,2.05,6.6
50%,2001.0,2.9,7.0
75%,2001.5,3.4,300.53
max,2002.0,4.5,4235.5


In [70]:
# With "state" and "year" moved into the index, only the remaining columns
# ("pop" and "debt") are summarised.
df.set_index(keys=["state", "year"]).describe()
  

Unnamed: 0,pop,debt
count,7.0,7.0
mean,2.828571,693.871429
std,1.060997,1575.526913
min,1.5,0.34
25%,2.05,6.6
50%,2.9,7.0
75%,3.4,300.53
max,4.5,4235.5


In [71]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


In [76]:
# Per-state summary of "pop": restrict to the two relevant columns, group the
# rows by state, then describe each group.
df[["state", "pop"]].groupby(["state"]).describe()

Unnamed: 0_level_0,pop,pop,pop,pop,pop,pop,pop,pop
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Nevada,2.0,2.65,0.353553,2.4,2.525,2.65,2.775,2.9
Ohio,3.0,2.266667,1.159023,1.5,1.6,1.7,2.65,3.6
Virginia,2.0,3.85,0.919239,3.2,3.525,3.85,4.175,4.5


In [77]:
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.34
1,Ohio,2001,1.7,6.5
2,Ohio,2002,3.6,568.56
3,Nevada,2000,2.4,4235.5
4,Nevada,2001,2.9,32.5
5,Virginia,2001,3.2,6.7
6,Virginia,2002,4.5,7.0


In [78]:
# Two-level (state, year) row index built from the current df -- the same
# construction used for df2 earlier in the notebook.
df1 = df.set_index(keys=["state", "year"])

In [79]:
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,pop,debt
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,2000,1.5,0.34
Ohio,2001,1.7,6.5
Ohio,2002,3.6,568.56
Nevada,2000,2.4,4235.5
Nevada,2001,2.9,32.5
Virginia,2001,3.2,6.7
Virginia,2002,4.5,7.0


In [81]:
# Select every row under the outer label "Nevada", with all columns; the
# result is indexed by year.
df1.loc["Nevada"]

Unnamed: 0_level_0,pop,debt
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,2.4,4235.5
2001,2.9,32.5
