In [20]:
# pandas Series is a one-dimensional array-like object containing a sequence of values of the same type and an associated array of data labels called index
import pandas as pd
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [21]:
# We can get array and index from Series as follows:
obj.array, obj.index

(<NumpyExtensionArray>
 [4, 7, -5, 3]
 Length: 4, dtype: int64,
 RangeIndex(start=0, stop=4, step=1))

In [22]:
# We can specify index when creating series
obj2 = pd.Series([4,7,-5,3], index=["d", "b", "a", "c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [23]:
# We can select multiple valuse from Series by index
obj2[["c", "a", "d"]]

c    3
a   -5
d    4
dtype: int64

In [24]:
# Numpy functions or numpy like operations can be done on the series and it will preserve the index
import numpy as np

np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [25]:
# Another way to think about series objects is as a fix length, ordered dictonary of index values to data values:
"b" in obj2, "e" in obj2

(True, False)

In [26]:
# Series can be created from Python dictionary
# Series can be converted back to dict with to_dict method
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}

obj3 = pd.Series(sdata)
obj3, obj3.to_dict()

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000})

In [27]:
# By default the index order in series will respect the order in the keys accordinf to the dictionary keys method which takes keys as they were inserted. This can be overwritten. 
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [28]:
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [29]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [30]:
# Series have above methods also as class methods
obj4.isna(), obj4.notna()

(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California    False
 Ohio           True
 Oregon         True
 Texas          True
 dtype: bool)

In [31]:
# Series automatically aligns in arithmetic operations by index.
# This means if for example you add a series to a series the operation will happen on elements with same index
obj3, obj4, obj3+obj4

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 California        NaN
 Ohio          35000.0
 Oregon        16000.0
 Texas         71000.0
 dtype: float64,
 California         NaN
 Ohio           70000.0
 Oregon         32000.0
 Texas         142000.0
 Utah               NaN
 dtype: float64)

In [32]:
# A Series index can be assigned 
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj.name = "People"
obj.index.name = "People names"
obj

People names
Bob      4
Steve    7
Jeff    -5
Ryan     3
Name: People, dtype: int64

## Data Frame
Data frame represents rectangular table of data and contains ordered named collection of colums, each of which can be a different data type. The data frame has both row and column indexes. It can thought of as a dictionary of Series all sharing the same index.

In [33]:
# There are manu ways to construct a data frama but one of the most common ones is from a dictionary of equal length lists or numpy arrays.
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [34]:
# Ther are methods
frame.head(), frame.tail()

(    state  year  pop
 0    Ohio  2000  1.5
 1    Ohio  2001  1.7
 2    Ohio  2002  3.6
 3  Nevada  2001  2.4
 4  Nevada  2002  2.9,
     state  year  pop
 1    Ohio  2001  1.7
 2    Ohio  2002  3.6
 3  Nevada  2001  2.4
 4  Nevada  2002  2.9
 5  Nevada  2003  3.2)

In [35]:
# If sequence of columns is specified in the constructor the data frame columns would be ordered as specified
frame = pd.DataFrame(data, columns=["year", "state", "pop"])
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [36]:
# If a coulumn specified in the columns param of the constructior is not in the map the data for the column in data frame will be NaN
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "dept", "ppl"])
frame2, frame2.columns

(   year   state  pop dept  ppl
 0  2000    Ohio  1.5  NaN  NaN
 1  2001    Ohio  1.7  NaN  NaN
 2  2002    Ohio  3.6  NaN  NaN
 3  2001  Nevada  2.4  NaN  NaN
 4  2002  Nevada  2.9  NaN  NaN
 5  2003  Nevada  3.2  NaN  NaN,
 Index(['year', 'state', 'pop', 'dept', 'ppl'], dtype='object'))

In [37]:
# Columns can be modified by assignment
frame2["dept"] = np.arange(6.)
frame2["ppl"] = 10000
frame2

Unnamed: 0,year,state,pop,dept,ppl
0,2000,Ohio,1.5,0.0,10000
1,2001,Ohio,1.7,1.0,10000
2,2002,Ohio,3.6,2.0,10000
3,2001,Nevada,2.4,3.0,10000
4,2002,Nevada,2.9,4.0,10000
5,2003,Nevada,3.2,5.0,10000


In [38]:
# List of values must mach the length of the DataFrame to assign it to the DataFrame. If Series is assigned to DataFrame column the elements will be alligned by DataFrame index.

val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
frame2["dept"] = val

frame2

Unnamed: 0,year,state,pop,dept,ppl
0,2000,Ohio,1.5,,10000
1,2001,Ohio,1.7,,10000
2,2002,Ohio,3.6,-1.2,10000
3,2001,Nevada,2.4,,10000
4,2002,Nevada,2.9,-1.5,10000
5,2003,Nevada,3.2,-1.7,10000


In [39]:
# Adding a columns
frame2["eastern"] = frame2["state"] == "Ohio"
frame2

Unnamed: 0,year,state,pop,dept,ppl,eastern
0,2000,Ohio,1.5,,10000,True
1,2001,Ohio,1.7,,10000,True
2,2002,Ohio,3.6,-1.2,10000,True
3,2001,Nevada,2.4,,10000,False
4,2002,Nevada,2.9,-1.5,10000,False
5,2003,Nevada,3.2,-1.7,10000,False


In [40]:
# Method del can be used to remove the columns
del frame2["eastern"]
frame2.columns

Index(['year', 'state', 'pop', 'dept', 'ppl'], dtype='object')

In [41]:
# If the nested dic is passed to the constructor, outer keys are regarded as columns and inner keys as row indices:
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},"Nevada": {2001: 2.4, 2002: 2.9}}
frame3 = pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [42]:
# DataFrame can be transposed, swap rows and columns in the similar way as numpy arrays
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [43]:
# If index is specified then the innder dict keys are not used 
pd.DataFrame(populations, index = [2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [44]:
pdata = {"Ohio": frame3["Ohio"][:-1], "Nevada": frame3["Nevada"][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [45]:
# DataFrame method to_numpy creates two-dim numpy array
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [46]:
# If the values in the DataFrame are of different types, the type in numpy array will choosen to accomodate all.
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, nan, 10000],
       [2001, 'Ohio', 1.7, nan, 10000],
       [2002, 'Ohio', 3.6, -1.2, 10000],
       [2001, 'Nevada', 2.4, nan, 10000],
       [2002, 'Nevada', 2.9, -1.5, 10000],
       [2003, 'Nevada', 3.2, -1.7, 10000]], dtype=object)

## Index Objects

In [47]:
# Index objects are imutable
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index[1] = "d"  # TypeError

NameError: name 'index' is not defined

In [48]:
# In addition to be array like, index also behaves as a fixed size set
frame3.columns, "Ohio" in frame3.columns, 2003 in frame3.index

(Index(['Ohio', 'Nevada'], dtype='object'), True, False)

In [49]:
# Unlike python sets, pandas indexes can contain duplicate values
pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## Essential Functionality
### Reindexing

Reindexing means creating new pandas object with values rearanged to allign with the new index.

In [50]:
# For example:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [51]:
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [52]:
# When rearanging to fill the missing data there is a method option of reindex. ffill forward fills the missing values.
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3.reindex(np.arange(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [53]:
# With DataFrame reindex can alter the row(index), columns or both. When passed only a sequence it reindexes the rows in the result:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "c", "d"], columns=["Ohio", "Texas", "California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [54]:
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [55]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [56]:
# Another way to reindex a particular axis is to pass new index as a positional argument and axis as a named axis argument
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [57]:
# You can also reindex by using loc. This works only if all the new index values already exist in data frame
frame.loc[['a','d','c'], ['California', 'Texas']]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


## Dropping Entries from an Axis

In [58]:
obj = pd.Series(np.arange(5.), index=["a","b","c","d","e"])
obj2 = obj.drop("c")
# Drop returns new object
obj, obj2

(a    0.0
 b    1.0
 c    2.0
 d    3.0
 e    4.0
 dtype: float64,
 a    0.0
 b    1.0
 d    3.0
 e    4.0
 dtype: float64)

In [59]:
# With data frame index values can be deleted from either axis
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=["Ohio", "Colorado", "Utah", "New York"], columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [60]:
data.drop(index=["Colorado","Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [61]:
data.drop(columns=["two"])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [62]:
# Data drop can also be done by passing axis option like in numpy arrays
data.drop("two", axis=1), data.drop(["two", "four"], axis="columns")

(          one  three  four
 Ohio        0      2     3
 Colorado    4      6     7
 Utah        8     10    11
 New York   12     14    15,
           one  three
 Ohio        0      2
 Colorado    4      6
 Utah        8     10
 New York   12     14)

## Indexing, Selection and Filtering

In [63]:
# Series indexing works analogously to numpy array indexing except that we can use series index values instead only integers
obj = pd.Series(np.arange(4), index=["a","b","c","d"])
obj, obj["b"], obj[2:4], obj.iloc[1], obj.iloc[2:4], obj[["b", "a", "d"]], obj.iloc[[1,3]]

(a    0
 b    1
 c    2
 d    3
 dtype: int64,
 1,
 c    2
 d    3
 dtype: int64,
 1,
 c    2
 d    3
 dtype: int64,
 b    1
 a    0
 d    3
 dtype: int64,
 b    1
 d    3
 dtype: int64)

In [64]:
# Prefered way is to use loc anc iloc instead of [index label] or [index number]
obj.loc[["b", "a", "d"]], obj.iloc[[1,3]]

(b    1
 a    0
 d    3
 dtype: int64,
 b    1
 d    3
 dtype: int64)

In [73]:
# The reason to prefer loc is when index is integer. In this case if we use [ints] it will return by label
obj1 = pd.Series([1,2,3], index=[2, 0, 1])
obj2 = pd.Series([1,2,3], index=["a","b","c"])

obj1[[0,1,2]], obj1.loc[[0,1,2]], obj2[[0,1,2]]
# obj.loc[[0,1,2]] will fail

  obj1[[0,1,2]], obj1.loc[[0,1,2]], obj2[[0,1,2]]


(0    2
 1    3
 2    1
 dtype: int64,
 0    2
 1    3
 2    1
 dtype: int64,
 a    1
 b    2
 c    3
 dtype: int64)

In [74]:
# loc oparator indexes exclusively with labels and iloc indexes exclusievely with integers
obj1.iloc[[0,1,2]], obj2.iloc[[0,1,2]]

(2    1
 0    2
 1    3
 dtype: int64,
 a    1
 b    2
 c    3
 dtype: int64)

In [75]:
# Slicing with labels also works with a difference from normal Python slicing in that the endpoint is inclusive:
obj2.loc["b":"c"]

b    2
c    3
dtype: int64

In [91]:
# Indexing data frame with single value or sequence returns one or more columns 
data = pd.DataFrame(np.arange(16).reshape(4,4), index=["Ohio", "Colorado", "Utah", "New York"], columns=["one", "two", "three", "four"])
data["two"], data[["three","one"]]

(Ohio         1
 Colorado     5
 Utah         9
 New York    13
 Name: two, dtype: int64,
           three  one
 Ohio          2    0
 Colorado      6    4
 Utah         10    8
 New York     14   12)

In [81]:
# Indexing like this has a few special cases, the first one is slicing or selecting dat with Boolean array
data[:2], data[data["three"]>5]

(          one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7,
           one  two  three  four
 Colorado    4    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15)

In [93]:
# Another use case is indexing with Boolean data frame. IF we take a look a Boolean Data Frame data < 5, we can use it to assing value 0 to each location where Boolean data frame has value True

data[data < 5] = 0

data < 5 , data

(            one    two  three   four
 Ohio       True   True   True   True
 Colorado   True  False  False  False
 Utah      False  False  False  False
 New York  False  False  False  False,
           one  two  three  four
 Ohio        0    0      0     0
 Colorado    0    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15)

## Selection on  DataFrame with loc and iloc

In [88]:
# Signle row selection is a Series
data, data.loc["Colorado"]

(          one  two  three  four
 Ohio        0    0      0     0
 Colorado    0    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15,
 one      0
 two      5
 three    6
 four     7
 Name: Colorado, dtype: int64)

In [94]:
# Multiple row selection returns new Data Frame
data.loc[["Colorado", "New York"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [97]:
# You can combine both row and column selection witj loc by separating selection by comma"
data.loc["Colorado", ["two", "three"]]

two      5
three    6
Name: Colorado, dtype: int64

In [101]:
# Some similar selections with integers using iloc

data.iloc[2], data.iloc[[2,1]], data.iloc[2, [3,0,1]], data.iloc[[1,2],[3,0,1]]

(one       8
 two       9
 three    10
 four     11
 Name: Utah, dtype: int64,
           one  two  three  four
 Utah        8    9     10    11
 Colorado    0    5      6     7,
 four    11
 one      8
 two      9
 Name: Utah, dtype: int64,
           four  one  two
 Colorado     7    0    5
 Utah        11    8    9)

In [108]:
# Both indexing functions work with slicing in addition to single labes and multiple labels
d1 = data.loc[:"Utah", "two"], 
d2 = data.iloc[:,:3][data.three > 5] 
# data
# d1
d2

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [109]:
# Boolean arrays can be used with loc but not wit iloc
data.loc[data.three >= 2]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## Integer indexing pitfalls
Pandas objects indexing by integer works differently then Pythons built in structures like lists or tuples

In [111]:
ser = pd.Series(np.arange(3.))
ser
# ser[-1] will fail in this case becaue there is not index label with value -1
ser2 = pd.Series(np.arange(3.), index=["a", "b", "c"])
ser2[-1] # this works as now Pandas knows that this is by index position and not by the index label

  ser2[-1] # this works as now Pandas knows that this is by index position and not by the index label


2.0

In [113]:
ser.iloc[-1] # this also works because there is no daoubth if this is by index label or by index

2.0

## Pitfalls with chained indexing

In [114]:
# Following examples work:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [116]:
data.iloc[2] = 5
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,5,5,5,5
New York,12,13,14,15


In [117]:
data.loc[data["four"]>5] = 3
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [128]:
# Following will not work:
# data.loc[data["three"] == 5]["three"] = 6
# the solution is following
data.loc[data["three"] == 5, "three"] = 6
data
# The good rule of thumb is to avoid chained indexing when doing assignment

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


## Arithmetic and Data Alingment
Pandas makes it simple to work with objects that have different indexes. For example if object is added to objec, if any index pairs are not the same, the respective result will be union of the index pairs.

In [130]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])

# The missing values are introduced in each object where index does not exist. Those missing values are used in the operation.
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [133]:
# Data alignment in case of data frame is performed both on rows and columns
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])


df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


## Arithmetic methods with fill values

In [137]:
# How to fill with specific values when axis is found in one and not in another object
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)), columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)), columns=list("abcde"))
df2.loc[1,"b"] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [138]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [139]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [140]:
# Arithmetic methods of DataFrame all have counterpart starting with r which is an inverse of the method
# For example below are the same and a counterpart of div

1/df1 , df1.rdiv(1)

(       a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909,
        a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909)

In [142]:
# Relatedly, when reindexing the data frame a different fill value can be specified

df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


## Operations between data frames and series

In [144]:
# As with numpy arrays od different dimensions, arithmetic between DataFrame and Series is also defined. Consider difference between two-dim array and one of its columns
arr = np.arange(12.).reshape((3,4))
arr - arr[0]
# When we substract array[0] from the array the substraction is performed once for each row. This is refered to as broadcasting.

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [148]:
# Operations between DataFrame and Series are similar
frame = pd.DataFrame(np.arange(12).reshape((4,3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
series = frame.iloc[0]
frame
# series

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [149]:
frame - series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [151]:
# If and index is not found in either DataFrame columns or Series index, the object will be reindexed to form a union
series2 = pd.Series(np.arange(3), index=list("bef"))
# series2

frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [156]:
# To broadcast over the columns, metching over the rows the arithmetics method can be used with specification to match over the index
series3 = frame["d"]
series3

frame.sub(series3, axis="index")

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


## Function Application and Mapping

In [158]:
# Numpy ufunc ((universal functions) element-wise array methods) also work with pandas objects:
frame = pd.DataFrame(np.random.standard_normal((4,3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,-0.850417,-0.579274,0.515176
Ohio,-1.111794,0.861133,-1.06823
Texas,-0.330325,-0.357814,-0.331791
Oregon,-0.175616,0.307683,-1.179997


In [159]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.850417,0.579274,0.515176
Ohio,1.111794,0.861133,1.06823
Texas,0.330325,0.357814,0.331791
Oregon,0.175616,0.307683,1.179997


In [161]:
# Another frequent operation is applying function on onde dimensional arrays to each row or the column of the data frame

def f1(x):
    return x.max() - x.min()

frame.apply(f1)

b    0.936179
d    1.440408
e    1.695173
dtype: float64

In [162]:
# If we pass axis="column" the function will be applied to rows
frame.apply(f1, axis="columns")

Utah      1.365593
Ohio      1.972928
Texas     0.027489
Oregon    1.487680
dtype: float64

In [163]:
# The function passed to apply can return Series also
def f2(x):
    return pd.Series([x.min(), x.max()], index=["min", "max"])

frame.apply(f2)

Unnamed: 0,b,d,e
min,-1.111794,-0.579274,-1.179997
max,-0.175616,0.861133,0.515176


In [166]:
# Element-wise Python functions can be used. To compute formated string from each floating point value in frame the aplymap function can be used
def my_format(x):
    return f"{x:.2f}"

frame.map(my_format)

Unnamed: 0,b,d,e
Utah,-0.85,-0.58,0.52
Ohio,-1.11,0.86,-1.07
Texas,-0.33,-0.36,-0.33
Oregon,-0.18,0.31,-1.18


In [168]:
# The Series also has map function
frame["e"].map(my_format)

Utah       0.52
Ohio      -1.07
Texas     -0.33
Oregon    -1.18
Name: e, dtype: object

## Sorting and Ranking

In [170]:
# sort_index is the method that sorts lexicogrpahically by row or column label and returns new sorted object
obj = pd.Series(np.arange(4), index=list("dabc"))

obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [172]:
# With DataFrame sorting indexes can be done by each axis:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index=["three","one"], columns=list("dabc"))
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [173]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [174]:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [176]:
# sort_values sorts by values
obj = pd.Series([4, 7, -3, 3])

obj.sort_values()

2   -3
3    3
0    4
1    7
dtype: int64

In [177]:
# Any missing values are sorted to the end of Series by default

obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [178]:
# Missing values can be sorted to the start using na_position option:
obj.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [180]:
# When sorting a DataFrame, the data in one or more columns can be used as sorting key.
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})

frame.sort_values("b")

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [182]:
frame.sort_values(["a", "b"])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [183]:
# Rankinig assigns numbers from 1 to number of elements in the series. When it is tie it assignes mean rank.
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [185]:
# Rank can be set accordining to the order elements are observed in the data
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [186]:
# Descending order ranking
obj.rank(ascending=False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [190]:
# Data frame can compute rank over the rows or the columns
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [191]:
frame.rank(axis="columns")


Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## Axis Indexes with Duplicate Labels

In [193]:
# Indexes are more often unique and functions like reindex require unique indexes. Hovewer it can be non unique:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [194]:
# To check if it is unique:
obj.index.is_unique

False

In [195]:
# This will return a Series because od duplicate in the index
obj["a"]

a    0
a    1
dtype: int64

In [196]:
# And when it is not a duplicate the data selection will return scalar
obj["c"]

4

## Summarizing and Computing Descriptive Statistics

In [204]:
# Pandas objects are equiped with the ser od common mathematical and statistical methods. Most of them fall in the category of reduction and summary statistics.
# Compared with similar methods in numpy  arrays pandas objects handle missing values by default.
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], 
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [203]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [202]:
df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [206]:
# Some aggregations, like mean, require at least one non-NA value to yield a value result, so here we have:
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [207]:
# Some methods like idxmin and idxmax return indirect statistics, like the index value where the min or max values are attained
df.idxmax(), df.idxmin()

(one    b
 two    d
 dtype: object,
 one    d
 two    b
 dtype: object)

In [208]:
# Other methods are accumulations
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [210]:
# Some methods are neither reductions nor accumulations. One of them is describe which gives multiple summary statistics
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [211]:
# On nonnumeric data, describe produces alternative summary statistics:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## Correlation and Covariance

In [212]:
# Some summary statistics like corelation and covariance are computed form pair of arguments.
price = pd.read_pickle("../data/book_pfda/yahoo_price.pkl")
volume = pd.read_pickle("../data/book_pfda/yahoo_volume.pkl")

In [213]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [215]:
# Series corr
returns["MSFT"].corr(returns["IBM"])

0.49976361144151144

In [216]:
# Seried cov
returns["MSFT"].cov(returns["IBM"])

8.870655479703546e-05

In [217]:
# Data frame corr
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [219]:
# Data frame cov
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [221]:
# corrwith can be used to calculate corr of data frames columns or rows with another Series
returns.corrwith(returns["IBM"])

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [222]:
# Passing a data frame to corrwith computes corr of matching collumn names
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

##  Unique values, Value count and Membership

In [228]:
# Another class of methods extract information about values in the Series
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [232]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [233]:
uniques.sort()
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [234]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [235]:
# value_counts is also available as a top level pandas method that can be used on numpy arrays or other Python sequences
pd.value_counts(obj.to_numpy(), sort=False)

  pd.value_counts(obj.to_numpy(), sort=False)


c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [237]:
# isin performs a vectorized set memenership check and can be usefull in filtering dataset down to a subset of values in a Series or a column in DataFrame
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [239]:
mask = obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [240]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [243]:
# Index.get_indexer returns array of indexes of possible non distinct values into anorher array of distinict values
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
unique_vals = pd.Series(["c", "b", "a"])
indices = pd.Index(unique_vals).get_indexer(to_match)
indices

array([0, 2, 1, 1, 0, 2])

In [244]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4],
                     "Qu2": [2, 3, 1, 2, 3],
                     "Qu3": [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [246]:
# We can compute value counts for a single column like this
data["Qu1"].value_counts().sort_index()

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [254]:
# To compute this for all columns pass value_counts to DataFrame apply method
data.apply(pd.value_counts).fillna(0)

  data.apply(pd.value_counts).fillna(0)


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [257]:
# The DataFrame.value_counts considers each row as a tuple and counts number of occurrencies of a distinct tupple
data = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [0, 0, 1, 0, 0]})
data.value_counts()

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64