In [3]:
import numpy as np
import pandas as pd

### Series

In [4]:
# A Series is a one-dimensional labelled array whose dtypes come from NumPy;
# the single float (2.0) upcasts every element to float64.
sr = pd.Series(data=[4, 2, -1, 2.0])
sr

0    4.0
1    2.0
2   -1.0
3    2.0
dtype: float64

In [5]:
sr.array  # the stored values, wrapped in a pandas array object (see the type line in the output)

<NumpyExtensionArray>
[np.float64(4.0), np.float64(2.0), np.float64(-1.0), np.float64(2.0)]
Length: 4, dtype: float64

In [6]:
sr.index  # no index was passed when sr was built, so the labels default to a RangeIndex

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Labels can be supplied explicitly through index= (a dict works too: its keys become the labels).
ser = pd.Series([1, 2, 6, 3], index=list("abcd"))
# A list of labels addresses several non-adjacent elements in one assignment.
ser.loc[["a", "c"]] = [4, 6]
ser

a    4
b    2
c    6
d    3
dtype: int64

In [8]:
dic = ser.to_dict() # a Series converts to a plain dict mapping each label to its value
dic

{'a': 4, 'b': 2, 'c': 6, 'd': 3}

In [9]:
# Passing index= alongside a dict chooses the labels and their order:
# keys left out of the list ("b") are dropped, and labels missing from the dict ("e") become NaN.
seri = pd.Series(dic, index=["c", "d", "a", "e"])
seri

c    6.0
d    3.0
a    4.0
e    NaN
dtype: float64

In [10]:
seri.isna()  # boolean mask marking the missing (NaN) entries

c    False
d    False
a    False
e     True
dtype: bool

In [11]:
# Assigning to .index relabels the Series in place; the values keep their positions.
seri.index = list("xyzf")
seri

x    6.0
y    3.0
z    4.0
f    NaN
dtype: float64

### DataFrame

In [12]:
# A DataFrame is a table with labelled rows and columns; a dict of equal-length lists
# yields one column per key. Column order can be chosen with pd.DataFrame(data, columns=[...]).
data = dict(
    state=["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [13]:
frame.head() # first 5 rows (the count used when none is passed)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [14]:
frame.tail() # last 5 rows (the count used when none is passed)

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [15]:
# A column can be read as an attribute or with dict-style brackets; both return the same Series.
# Attribute access only works when the name is a valid identifier that does not clash with a
# DataFrame attribute (e.g. frame.pop is the DataFrame.pop method, not the "pop" column).
print(frame.state, frame["state"], sep="\n\n")

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object


In [16]:
frame.loc[1] # .loc selects by row label; a single label returns that row as a Series indexed by column name

state    Ohio
year     2001
pop       1.7
Name: 1, dtype: object

In [17]:
# Assigning to a column name that does not exist yet adds that column to the frame.
frame["eastern"] = frame["state"].eq("Ohio")
frame

Unnamed: 0,state,year,pop,eastern
0,Ohio,2000,1.5,True
1,Ohio,2001,1.7,True
2,Ohio,2002,3.6,True
3,Nevada,2001,2.4,False
4,Nevada,2002,2.9,False
5,Nevada,2003,3.2,False


In [18]:
# Assigning a Series aligns it on the row labels: only label 1 matches frame's index,
# so every other row gets NaN and the "two"/"four" entries are not used.
val = pd.Series(data=[-1.2, -1.5, -1.7], index=["two", "four", 1])
frame["debt"] = val
frame

Unnamed: 0,state,year,pop,eastern,debt
0,Ohio,2000,1.5,True,
1,Ohio,2001,1.7,True,-1.7
2,Ohio,2002,3.6,True,
3,Nevada,2001,2.4,False,
4,Nevada,2002,2.9,False,
5,Nevada,2003,3.2,False,


In [19]:
del frame["debt"] # removes the column from frame itself, using the same syntax as deleting a dict key

In [20]:
frame # the "debt" column is gone after the del in the previous cell

Unnamed: 0,state,year,pop,eastern
0,Ohio,2000,1.5,True
1,Ohio,2001,1.7,True
2,Ohio,2002,3.6,True
3,Nevada,2001,2.4,False
4,Nevada,2002,2.9,False
5,Nevada,2003,3.2,False


In [21]:
# For a dict of dicts the outer keys become the columns and the inner keys the row labels;
# a (row, column) pair absent from the input shows up as NaN.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
frame2 = pd.DataFrame(data=populations)
frame2

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


### Index Objects

In [27]:
frame.index # Index objects are immutable, array-like, and may contain duplicate labels

RangeIndex(start=0, stop=6, step=1)

In [26]:
5 in frame.index # membership test against the row labels

True

In [40]:
# Index.append builds a new Index (frame's labels 0..5 followed by 17) to label the seven values.
ser2 = pd.Series(data=np.arange(7), index=frame.index.append(pd.Index([17])))

In [41]:
ser2 # labels need not be contiguous: 0..5 followed by 17

0     0
1     1
2     2
3     3
4     4
5     5
17    6
dtype: int64

In [66]:
# NOTE: this rebinds frame2 (previously the populations table) to a 3x4 integer frame.
frame2 = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    columns=["first", "second", "Third", "Fourth"],
)
frame2

Unnamed: 0,first,second,Third,Fourth
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [77]:
# reindex conforms the rows to the new label list: label 1 is not requested, so it is dropped,
# while labels 3, 4 and 5 do not exist in frame2 and come back as all-NaN rows
# (which is why the values are displayed as floats). Columns can be reindexed with columns=[...].
frame3 = frame2.reindex([0, 2, 3, 4, 5], axis="index")
frame3

Unnamed: 0,first,second,Third,Fourth
0,0.0,1.0,2.0,3.0
2,8.0,9.0,10.0,11.0
3,,,,
4,,,,
5,,,,


### Dropping Elements

In [82]:
frame3.drop(index=[3,5], columns=["second"]) # drops row labels and column labels in one call; the result is only displayed, not assigned back

Unnamed: 0,first,Third,Fourth
0,0.0,2.0,3.0
2,8.0,10.0,11.0
4,,,


### Indexing

In [153]:
# 4x4 frame with string row labels "a".."d" and the default integer column labels 0..3.
frame4 = pd.DataFrame(data=np.arange(16).reshape((4, 4)), index=list("abcd"))
frame4

Unnamed: 0,0,1,2,3
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [135]:
frame4.loc['a':'c'] # label slices with .loc include the end label (row 'c' is part of the result)

Unnamed: 0,0,1,2,3
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [147]:
frame4.loc[['a', 'b', 'd']] # a list of labels selects exactly those rows, in the order given

Unnamed: 0,0,1,2,3
a,0,1,2,3
b,4,5,6,7
d,12,13,14,15


In [150]:
frame4.iloc[[0,3]] # iloc selects by integer position (row order), whatever the index labels are

Unnamed: 0,0,1,2,3
a,0,1,2,3
d,12,13,14,15


In [168]:
frame4.iloc[:2, [2,3]] # rows by position slice (the first two), columns by a list of positions

Unnamed: 0,2,3
a,2,3
b,6,7


In [185]:
# Select rows and columns in a single .loc call rather than chaining two indexers:
# the boolean mask keeps rows where column 2 is below 14, the list keeps columns 2 and 3.
# Chained indexing builds an intermediate object, and assigning through it is unreliable.
frame4.loc[frame4[2] < 14, [2, 3]]

Unnamed: 0,2,3
a,2,3
b,6,7
c,10,11


### Arithmetic Operations

In [186]:
# Two Series whose labels only partly overlap ("d" is only in s1; "f" and "g" only in s2).
s1 = pd.Series({"a": 7.3, "c": -2.5, "d": 3.4, "e": 1.5})
s2 = pd.Series({"a": -2.1, "c": 3.6, "e": -1.5, "f": 4, "g": 3.1})

In [187]:
s1 # labels: a, c, d, e

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [188]:
s2 # labels: a, c, e, f, g (the integer 4 is stored as 4.0)

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [190]:
s1 + s2 # operands are aligned on their labels; a label present in only one of them yields NaN

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [192]:
s1.add(s2, fill_value=0) # arithmetic methods take a fill_value argument that stands in for a label missing from one operand

a    5.2
c    1.1
d    3.4
e    0.0
f    4.0
g    3.1
dtype: float64

### Sorting

In [193]:
# Integer values 0..3 stored under deliberately unsorted labels.
obj = pd.Series(data=np.arange(4), index=list("dabc"))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [198]:
# 2x4 frame whose row and column labels are both out of alphabetical order.
frame_ = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=["three", "one"],
    columns=list("dabc"),
)
frame_

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [199]:
frame_.sort_index() # sort the rows by their labels ("one" comes before "three")

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [204]:
frame_.sort_index(axis="columns") # sort by column labels instead; pass ascending=False for descending order

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [215]:
frame_.sort_values('b') # sort the rows by the values in one or more columns

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


### Some Statistical Methods

In [216]:
# Small frame with missing values: row "c" is entirely NaN and row "a" is missing column "two".
df = pd.DataFrame(
    data=[[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list("abcd"),
    columns=["one", "two"],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [219]:
df.sum() # returns a Series with the sum of each column; NaN entries are skipped

one    9.25
two   -5.80
dtype: float64

In [223]:
# Row-wise sums. skipna=False makes any row that contains a NaN sum to NaN;
# with the default skipna=True, row "a" would be 1.40 and the all-NaN row "c" would be 0.00.
df.sum(axis="columns", skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [224]:
df.mean() # per-column mean over the non-NaN entries (e.g. "one": 9.25 / 3)

one    3.083333
two   -2.900000
dtype: float64