In [1]:
import pandas as pd
import numpy as np

## Reindexing

 An important method on pandas objects is reindex, which means to create a new
 object with the values rearranged to align with the new index. 

In [2]:
obj = pd.Series([1, 2, 3, 4], index = ['d', 'b', 'a', 'c'])

In [3]:
obj

d    1
b    2
a    3
c    4
dtype: int64

In [4]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a    3.0
b    2.0
c    4.0
d    1.0
e    NaN
dtype: float64

In [5]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [6]:
obj3.reindex(np.arange(6))

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [7]:
obj3.reindex(np.arange(6), method = 'ffill') # which forward-fills the values:

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [7]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    index=['a', 'b', 'c'],
                    columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [8]:
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [9]:
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'], method = 'ffill')
frame2

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8
d,6,7,8


 The columns can be reindexed with the columns keyword:

In [10]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


Another way to reindex a particular axis is to pass the new axis labels as a positional
 argument and then specify the axis to reindex with the axis keyword:

In [11]:
frame.reindex(states, axis = 'columns')

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


![reindexing](Assets/reindex_methods.png)

 We can also reindex by using the loc operator

In [12]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [13]:
frame.loc[['a', 'b', 'c'], ['California', 'Ohio']]

Unnamed: 0,California,Ohio
a,2,0
b,5,3
c,8,6


## Dropping Entries from an Axis

In [14]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])

In [15]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [16]:
obj['c'] = 3

In [18]:
obj

a    0.0
b    1.0
c    3.0
d    3.0
e    4.0
dtype: float64

In [19]:
obj.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [20]:
obj.drop(['d', 'e'])

a    0.0
b    1.0
c    3.0
dtype: float64

With DataFrame, index values can be deleted from either axis. To illustrate this, we
 first create an example DataFrame:

In [17]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),                    
                    index=["Ohio", "Colorado", "Utah", "New York"],
                     columns=["one", "two", "three", "four"])

In [18]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [23]:
data.drop(index = ['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


To drop labels from the columns, instead use the columns keyword

In [19]:
data.drop(columns=['two'])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [20]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [26]:
data.drop('Utah', axis=0)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,12,13,14,15


In [21]:
data.drop(['two', 'three'], axis = 'columns')

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


## Indexing, Selection, and Filtering

In [22]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])

In [23]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [24]:
obj['a']

0.0

In [25]:
obj[2:]

c    2.0
d    3.0
dtype: float64

In [26]:
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [27]:
obj[['b', 'c']]

b    1.0
c    2.0
dtype: float64

In [28]:
obj[[1, 3]]

  obj[[1, 3]]


b    1.0
d    3.0
dtype: float64

In [29]:
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [30]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

 the preferred way to select index values is
 with the special loc operator:

In [31]:
obj.loc[['d', 'a']]

d    3.0
a    0.0
dtype: float64

In [32]:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(obj1)
print(obj2)

2    1
0    2
1    3
dtype: int64
a    1
b    2
c    3
dtype: int64


In [33]:
obj1.iloc[[0, 1, 2]]

2    1
0    2
1    3
dtype: int64

In [34]:
obj2.iloc[[0, 1, 2]]

a    1
b    2
c    3
dtype: int64

In [35]:
obj2.loc['b':]

b    2
c    3
dtype: int64

In [36]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),                   
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"])

In [37]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [38]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [39]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [40]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


Another use case is indexing with a Boolean DataFrame, such as one produced by
 a scalar comparison.

In [41]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
data[data<5] = 0

In [43]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Selection on DataFrame with loc and iloc

In [44]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [45]:
data.loc['Utah']

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [46]:
data.loc[['Ohio','Utah'], ['two', 'four']]

Unnamed: 0,two,four
Ohio,0,0
Utah,9,11


In [47]:
data.loc['Utah', ['one', 'three']]

one       8
three    10
Name: Utah, dtype: int32

In [48]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
data.iloc[1:, :2]

Unnamed: 0,one,two
Colorado,0,5
Utah,8,9
New York,12,13


In [50]:
data.iloc[[1, 3], [1, 2]]

Unnamed: 0,two,three
Colorado,5,6
New York,13,14


Both indexing functions work with slices in addition to single labels or lists of labels

In [57]:
data.loc[:'Utah', 'three']

Ohio         0
Colorado     6
Utah        10
Name: three, dtype: int32

In [51]:
data.iloc[:, :2][data.three > 5]

Unnamed: 0,one,two
Colorado,0,5
Utah,8,9
New York,12,13


In [52]:
data.loc[data['three'] >= 2, ['one', 'three']]

Unnamed: 0,one,three
Colorado,0,6
Utah,8,10
New York,12,14


![Index](Assets/index_operations.png)


In [53]:
ser = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser[-1]

  ser[-1]


2.0

In [54]:
ser.iloc[-1]

2.0

In [55]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [56]:
data.loc[:, 'one'] = 1
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [64]:
data.iloc[2] = 5
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,5,5,5,5
New York,1,13,14,15


In [65]:
data.loc[data.three == 5, 'three'] = 6
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,5,5,6,5
New York,1,13,14,15


## Arithmetic and Data Alignment

In [57]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])

In [58]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],            
index=["a", "c", "e", "f", "g"])

In [59]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [60]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [61]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [62]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list("bcd"),                  
index=["Ohio", "Texas", "Colorado"])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),                  
index=["Utah", "Ohio", "Texas", "Oregon"])
print(df1)
print(df2)
print(df1+df2)

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


 Arithmetic methods with fill values

In [63]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                  columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20,).reshape((4, 5)),
                  columns = list('abcde'))
print(df1)
print(df2)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19


In [64]:
df2.loc[1, 'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0,1.0,2,3,4
1,5,,7,8,9
2,10,11.0,12,13,14
3,15,16.0,17,18,19


In [74]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


 Using the add method on df1, I pass df2 and an argument to fill_value, which
 substitutes the passed value for any missing values in the operation:

In [75]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [76]:
1 / df2

Unnamed: 0,a,b,c,d,e
0,inf,1.0,0.5,0.333333,0.25
1,0.2,,0.142857,0.125,0.111111
2,0.1,0.090909,0.083333,0.076923,0.071429
3,0.066667,0.0625,0.058824,0.055556,0.052632


![methods](Assets\arithmetic_methods.png)

## Function Application and Mapping

In [65]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,1.697273,-0.282507,-0.772451
Ohio,1.575396,-1.199649,-0.824856
Texas,-0.971155,1.471094,-0.962064
Oregon,-0.381745,0.603376,-1.491394


In [66]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.697273,0.282507,0.772451
Ohio,1.575396,1.199649,0.824856
Texas,0.971155,1.471094,0.962064
Oregon,0.381745,0.603376,1.491394


Another frequent operation is applying a function on one-dimensional arrays to each
 column or row. DataFrame’s apply method does exactly this:
 
 Here the function f, which computes the difference between the maximum and
 minimum of a Series, is invoked once on each column in frame. The result is a Series
 having the columns of frame as its index.

In [67]:
def f1(x):
    return x.max() - x.min()
frame.apply(f1)

b    2.668428
d    2.670743
e    0.718943
dtype: float64

In [68]:
frame.apply(f1, axis=1)

Utah      2.469724
Ohio      2.775045
Texas     2.442249
Oregon    2.094769
dtype: float64

In [69]:
def f2(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f2)

Unnamed: 0,b,d,e
min,-0.971155,-1.199649,-1.491394
max,1.697273,1.471094,-0.772451


## Sorting and Ranking

In [70]:
obj = pd.Series(np.arange(4.), index=list('dbac'))
obj

d    0.0
b    1.0
a    2.0
c    3.0
dtype: float64

In [71]:
obj.sort_index()

a    2.0
b    1.0
c    3.0
d    0.0
dtype: float64

In [72]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index = ['three', 'one'],
                    columns=list('dabc'))
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [73]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [74]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [75]:
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


To sort a Series by its values, use its sort_values method:

In [76]:
obj = pd.Series([3, 1, 4, 2])

In [77]:
obj.sort_values()

1    1
3    2
0    3
2    4
dtype: int64

In [78]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

 Missing values can be sorted to the start instead by using the na_position option

In [91]:
obj.sort_values(na_position='first')

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [92]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})

In [93]:
frame.sort_values('a')

Unnamed: 0,b,a
0,4,0
2,-3,0
1,7,1
3,2,1


In [94]:
frame.sort_values(['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


##  Axis Indexes with Duplicate Labels

In [95]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])

In [96]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [97]:
obj.index.is_unique

False

In [98]:
obj['a']

a    0
a    1
dtype: int32

## Summarizing and Computing Descriptive Statistics

In [99]:
df = pd.DataFrame([
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75,-1.3]
], index=list('abcd'),
columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [100]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [101]:
df.sum(axis=0)#or 'indez\x'

one    9.25
two   -5.80
dtype: float64

In [102]:
df.sum(axis=1)# or 'columns'

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [103]:
df.mean(axis=1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

![Reduction](Assets\reduction_methods.png)

Some methods, like idxmin and idxmax, return indirect statistics, like the index value
 where the minimum or maximum values are attained:

In [104]:
df.idxmax()

one    b
two    d
dtype: object

In [105]:
df.idxmin()

one    d
two    b
dtype: object

In [106]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [107]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


![stats](Assets\stats.png)

In [108]:
df.count()

one    3
two    2
dtype: int64

In [109]:
df.quantile()

one    1.4
two   -2.9
Name: 0.5, dtype: float64

## Correlation and Covariance

In [110]:
df.corr()

Unnamed: 0,one,two
one,1.0,-1.0
two,-1.0,1.0


## Unique Values, Value Counts, and Membership

In [111]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [112]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [113]:
uniques = obj.nunique()
uniques

4

In [114]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [115]:
pd.value_counts(obj.to_numpy(), sort=False)

  pd.value_counts(obj.to_numpy(), sort=False)


c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [116]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [117]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [118]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [119]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4],                     
                    "Qu2": [2, 3, 1, 2, 3],  
                    "Qu3": [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [120]:
data['Qu1'].value_counts().sort_index()

Qu1
1    1
3    2
4    2
Name: count, dtype: int64