# Function Application & Mapping in Pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
frame = pd.DataFrame(np.random.randn(4,3),    # Numpy random function randn is used to create 4 x 3 ndarray
                    columns=list('bcd'),      # column labels are passed as 'b', 'c' & 'd'
                    index=['Utah', 'Othio', 'Texas', 'Oregon'])  # custom row-indexes are passed

print(frame)

               b         c         d
Utah    0.525488  1.280144 -0.354225
Othio  -0.741145 -0.108834 -2.101504
Texas   0.473022  0.460204  0.652968
Oregon  0.689122 -0.003189 -0.372184


In [3]:
# This numpy function converts negative values into positive

print(np.abs(frame))

               b         c         d
Utah    0.525488  1.280144  0.354225
Othio   0.741145  0.108834  2.101504
Texas   0.473022  0.460204  0.652968
Oregon  0.689122  0.003189  0.372184


## Displaying Minimun, Maximum Values from a DataFrame

In [4]:
# DataFrame original state

print(frame)

               b         c         d
Utah    0.525488  1.280144 -0.354225
Othio  -0.741145 -0.108834 -2.101504
Texas   0.473022  0.460204  0.652968
Oregon  0.689122 -0.003189 -0.372184


In [5]:
# This will return minimum value of colum 'c'

print(frame['c'].min())

-0.10883377381189312


In [6]:
# This will return maximum value of colum 'c'

print(frame['c'].max())

1.2801443740765666


In [7]:
# This will return difference of Maximum and Minimum Values of DataFrame i.e. frame

print(frame['c'].max() - frame['c'].min())

1.3889781478884597


In [8]:
# lambda function to store in variable 'x', the difference of min() & Mix() of each column of frame

f = lambda x: x.max() - x.min()
df = frame.apply(f)
print(df, type(df))

b    1.430267
c    1.388978
d    2.754471
dtype: float64 <class 'pandas.core.series.Series'>


In [9]:
# Now, function 'f' will apply on each row-index of frame, and will find diff of max and min value of each row.

df = frame.apply(f, axis=1)
print(df)

Utah      1.634369
Othio     1.992670
Texas     0.192764
Oregon    1.061306
dtype: float64


In [10]:
# A function min_max is defined to create a new Pandas Series having two index-rows 
# with min and max values of taxee columns

def min_max(x):
    minimum = x.min()    # this will derive minimum value from each column
    maximum = x.max()    # this will derive maximum value from each column
    return pd.Series( [minimum, maximum ], index=['min', 'max'] )  # this will return new series of min and max values

df = frame.apply(min_max)       # function call stored in variable 'df'
print(df, type(df))

            b         c         d
min -0.741145 -0.108834 -2.101504
max  0.689122  1.280144  0.652968 <class 'pandas.core.frame.DataFrame'>


In [11]:
# Another way of above quoted function

def min_max(x):    
    return pd.Series( [ x.min(), x.max() ], index=['min', 'max'] )  # this will return new series of min and max values

df = frame.apply(min_max)       # function call stored in variable 'df'
print(df, type(df))

            b         c         d
min -0.741145 -0.108834 -2.101504
max  0.689122  1.280144  0.652968 <class 'pandas.core.frame.DataFrame'>


# Sorting and Ranking

In [12]:
# A new Pandas DataFrame 'frame2' is created

frame2 = pd.DataFrame(np.arange(8).reshape((2,4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])

print(frame2)

       d  a  b  c
three  0  1  2  3
one    4  5  6  7


## Sorting by order of 'Index'

In [13]:
print(frame2.sort_index(axis=1, ascending=False))

       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [14]:

print(frame2.sort_index(axis=1, ascending=False))

       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [15]:
# This will again the DataFrame and columns will be sorted in its original shape 
# but row index will be sorted in reverse order

print(frame2.sort_index())

       d  a  b  c
one    4  5  6  7
three  0  1  2  3


## Sorting by order of 'Values'

In [16]:
frame = pd.DataFrame(np.random.randn(4,3),    # Numpy random function randn is used to create 4 x 3 ndarray
                    columns=list('bcd'),      # column labels are passed as 'b', 'c' & 'd'
                    index=['Utah', 'Othio', 'Texas', 'Oregon'])  # custom row-indexes are passed

print(frame)

               b         c         d
Utah   -1.772203 -0.331749 -0.898691
Othio   2.001659  1.332388 -0.851815
Texas   0.879690  0.032887  0.292377
Oregon -0.247259 -0.503726 -0.533180


In [17]:
# This will sort by values of column-'b'

print(frame.sort_values(by='b'))

               b         c         d
Utah   -1.772203 -0.331749 -0.898691
Oregon -0.247259 -0.503726 -0.533180
Texas   0.879690  0.032887  0.292377
Othio   2.001659  1.332388 -0.851815


In [18]:


print(frame.rank(ascending=False, method='max'))

          b    c    d
Utah    4.0  3.0  4.0
Othio   1.0  1.0  3.0
Texas   2.0  2.0  1.0
Oregon  3.0  4.0  2.0


In [19]:
print(frame.rank(ascending=True, method='min'))

          b    c    d
Utah    1.0  2.0  1.0
Othio   4.0  4.0  2.0
Texas   3.0  3.0  4.0
Oregon  2.0  1.0  3.0


In [20]:
print(frame.rank(axis='columns'))

          b    c    d
Utah    1.0  3.0  2.0
Othio   3.0  2.0  1.0
Texas   3.0  1.0  2.0
Oregon  3.0  2.0  1.0


# Summarizing & Computing the Descriptive Statistics

In [21]:
# Another DataFrame 'df' is created using Numpy 'np.nan' value

df = pd.DataFrame([
    [1.4, np.nan],[7.1, -4.5],
    [np.nan, np.nan], [0.75, -1.3]],
    index = ['a', 'b', 'c', 'd'], columns=['one', 'two'])

print(df)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


In [22]:
# Summing up both columns using 'sum()'

df.sum()

one    9.25
two   -5.80
dtype: float64

In [23]:
# Similar to above, that is 'axis=rows'
# Summing up both columns using 'sum()'

df.sum(axis='rows')

one    9.25
two   -5.80
dtype: float64

In [24]:
# Similar to both of above, that is 'axis=0' or 'axis=rows'
# Summing up both columns using 'sum()'

df.sum(axis = 0)

one    9.25
two   -5.80
dtype: float64

In [25]:
# row-wise sum of both columns is performed

df.sum(axis = 'columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [26]:
# This is similar to 'axis = columns'
# row-wise sum of both columns is performed

df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [27]:
print(df)
print('* * * * * * * * * *')

# This will calculate mean (average) of each row
# 'skipna = False' will not skip the NaN values and will not perform mean where found
x = df.mean(axis='columns', skipna = False)

print(x)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
* * * * * * * * * *
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


In [28]:
# This will calculate row-wise mean
# 'skipna = True' will skip the 'NaN' values resultantly will calculate mean even found 'NaN' values in that row

y = df.mean(axis='columns', skipna = True)
print(y)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64


In [29]:
# This will also calculate 'Mean'
# Eliminating 'skipna' is similar to 'skipna = True'

y = df.mean(axis='columns')
print(y)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64


In [30]:
# Another DataFrame 'df' is created using Numpy 'np.nan' value

df2 = pd.DataFrame([
    [1.4, 1.4, 1.5, np.nan, ],[7.1, -4.5, 1.5, 1.4],
    [1.4, np.nan, 0.5, np.nan ], [0.75, -1.3, 1.3, np.nan]],
    index = ['a', 'b', 'c', 'd'], columns=['one', 'two', 'three', 'four'])

print(df2)

    one  two  three  four
a  1.40  1.4    1.5   NaN
b  7.10 -4.5    1.5   1.4
c  1.40  NaN    0.5   NaN
d  0.75 -1.3    1.3   NaN


In [31]:
# This will return unique values of desired columns

print(df2['one'].unique(), df2['two'].unique())

[1.4  7.1  0.75] [ 1.4 -4.5  nan -1.3]


In [32]:
# this will return count of each value in desired column of DataFrame i.e. 'df'

print(df2['one'].value_counts())

1.40    2
0.75    1
7.10    1
Name: one, dtype: int64


## Selection of data using 'loc' & 'iloc' attribute

- In the 'loc' method:
    - First, specify row-label
    - Then specify column-names
    - remember! multiple row-labels and column-names require 'array notation'
    - For example, print(data_df.loc['colorado', ['two', 'three']])

- In the 'iloc' method:
    - 'iloc' represents 'index-location'
    - Here, similar to 'loc' method row-index is specified firt
    - Then column-index is specified

In [33]:
# A 4 x 4 ndarray is created using Numpy arange function and stored in Pandas DataFrame i.e. 'data_df'

data_df = pd.DataFrame(np.arange(16).reshape(4,4),
                     index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns = ['one', 'two', 'three', 'four'])

print(data_df)                     

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [34]:
# 'loc' attribute is used to display all values of columns 'two', 'four' & 'three' of row-index 'Colorado'
#  in 'loc' method, row-labels and column-names are used

print(data_df.loc['Colorado', ['two', 'four', 'three']])

two      5
four     7
three    6
Name: Colorado, dtype: int32


In [35]:
# 'loc' attribute is used to select all values of columns 'two', 'four' & 'three' of row-index 'Colorado' & 'Ohio'
#  in 'loc' method, row-labels and column-names are used

print(data_df.loc[['Colorado', 'Ohio'], ['two', 'four', 'three']])

          two  four  three
Colorado    5     7      6
Ohio        1     3      2


In [36]:
# Similar selection is perform but with 'iloc' attribute
# Here, instead of index-labels and column-names, row-index and column-index are specified to perform the selection
# In Numpy, this style of selection is called 'fancy-indexing'

print(data_df.iloc[[1, 0], [1, 3, 2]])

          two  four  three
Colorado    5     7      6
Ohio        1     3      2


In [37]:
# 'iloc' attribute is used to select data using index-number of row-lables and then column-indexes

data_df.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [38]:
# 'iloc' attribute is used to display data of two rows using index-number of row-lables and then column-indexes
# this will return data of rows 'third till end i.e. four' and columns-index are ordered 3, 0 and in the last 1

data_df.iloc[2:, [3, 0, 1]]

Unnamed: 0,four,one,two
Utah,11,8,9
New York,15,12,13


In [39]:
# in this example, 'iloc' attribute will display all rows and all columns of DataFrame i.e. 'data_df'

print(data_df.iloc[:])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [40]:
# This will display all rows and only first three columns
# whose index numbers are 0, 1, & 2 i.e. columns One, two and three
# It clarifies, if we need all rows but some columns therefore, this type of selection should be specified i.e.[:,:3]
# In the case [:,:3] means, all row are required but only column index-0 to 2 are required.

print(data_df.iloc[ :, :3 ])

          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


In [41]:
# This will display data of only two rows that are at index 0 & 1 and it will display all columns

print(data_df.iloc[:2])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7


In [42]:
# This will return boolean value to display whether values of column-label 'three' is greater than 5 or otherwise
# row-index 'Ohio' has value 2 in column-label 'three', so it will display 'False' and all other will be 'True'

print(data_df.three > 5)

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool


# Arithmatic & Data Alignment

In [43]:
# created a 3 x 3 Pandas DataFrame i.e 'df1'

df1 = pd.DataFrame(np.arange(9).reshape(3,3),
                  columns = list('bcd'),
                  index=['Ohio', 'Texas', 'Colorado'])
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [44]:
#  Created another 4 x 3 shaped DataFrame i.e. 'df2'

df2 = pd.DataFrame(np.arange(12).reshape(4,3),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [45]:
# Addition of both DataFrames
# It will add two DataFrames and action of addition will be performed upon 'index matching' 
# Otherwise, non-matching indexes will display the value as 'NaN'

print(df1)
print('* * * * * * * * * * * * * * * *')
print(df2)
print('* * * * * * * * * * * * * * * *')
df3 = df1 + df2
print(df3)

          b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
* * * * * * * * * * * * * * * *
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
* * * * * * * * * * * * * * * *
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


## Arithmatic Methods with Fill Values

In [46]:
df3 = pd.DataFrame(np.arange(12).reshape(3,4),
                  columns = list('abcd'))

df4 = pd.DataFrame(np.arange(20).reshape(4,5),
                  columns = list('abcde'))
print(df3)
print('* * * * * * * * * * * * * * * *')
print(df4)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
* * * * * * * * * * * * * * * *
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19


In [47]:
# Changing value of 'df3' row-index 1 at Column-'b' i.e. 5, with 'NaN'

print(df3)     # this will display the original DataFrame, that was initially created with no 'NaN' value
print('* * * * * * * * * * *')
print(df3.loc[1, 'b'])  # this will disply the value of Row-Index '1' at Column-Label 'b' i.e. 5
print('* * * * * * * * * * *')
df3.loc[1, 'b'] = np.nan   # this will replace the above-quoted value 5 with Numpy 'np.nan'
print(df3)      # this will display the modified DataFrame

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
* * * * * * * * * * *
5
* * * * * * * * * * *
   a    b   c   d
0  0  1.0   2   3
1  4  NaN   6   7
2  8  9.0  10  11


In [48]:
# This is just to display both DataFrames

print("This is 'df3'")
print(df3)
print('* * * * * * * * * * *')
print()
print("This is 'df4'")
print(df4)

This is 'df3'
   a    b   c   d
0  0  1.0   2   3
1  4  NaN   6   7
2  8  9.0  10  11
* * * * * * * * * * *

This is 'df4'
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19


In [49]:
# Addition of two DataFrames without 'Fill Values' Attribute
# This will display 'NaN' where index mis-matches

print("Direct Addition without 'Fill_Value'")
df5 = df3 + df4
print(df5)

Direct Addition without 'Fill_Value'
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [50]:
# Addition of two DataFrames using 'add()' method instead of simple '+' method
# This will fill 'NaN' values with 'zero' and will perform addition

print("Addition using a method with replacing 'NaN' with 0")
df6 = df3.add(df4, fill_value=0)
print(df6)

Addition using a method with replacing 'NaN' with 0
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   6.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


# Operations between Pandas DataFrame and Pandas Series

In [51]:
# Created a new DataFrame of sixe 4 x 3

dFrame = pd.DataFrame(np.arange(12).reshape(4,3),
                     columns = list('bde'),
                     index = ['Otah', 'Ohio', 'Texas', 'Oregon'])

dFrame

Unnamed: 0,b,d,e
Otah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [52]:
# Created a new Pandas Series 'mySeries' using Pandas 'iloc' method

mySeries = dFrame.iloc[0]
mySeries

b    0
d    1
e    2
Name: Otah, dtype: int32

In [53]:
# Created another Series using 'Direct Entry' method

mySeries2 = pd.Series([1, 2, 3], index=list('bde'))
mySeries2

b    1
d    2
e    3
dtype: int64

In [54]:
# Displayed index and values separately

print(mySeries2.index, mySeries2.values)

Index(['b', 'd', 'e'], dtype='object') [1 2 3]


In [55]:
# Arithmatic Operation between DataFrame & Series
# This will substract each element of Series i.e. [1 2 3] from each index of DataFram 'dFrame'

print(" * * * DataFrame Before Subtraction * * * ")
print(dFrame)
print()
print(" * * * Series * * * ")
print(mySeries.index, mySeries2.values)
print()

# DataFrame columns names will be matched with series index it's row-wise broadcasting operation
print(" * * * DataFrame after Substracting Series * * * ")
print(dFrame - mySeries2)

 * * * DataFrame Before Subtraction * * * 
        b   d   e
Otah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11

 * * * Series * * * 
Index(['b', 'd', 'e'], dtype='object') [1 2 3]

 * * * DataFrame after Substracting Series * * * 
        b  d  e
Otah   -1 -1 -1
Ohio    2  2  2
Texas   5  5  5
Oregon  8  8  8


# Deleting Data (Row or Column from DataFrame)

In [56]:
data_df = pd.DataFrame(np.arange(16).reshape(4,4),
                      index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                      columns = ['one', 'two', 'three', 'four'])
data_df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


***Note:- DataFrame '.drop()' method would return a new DataFrame without having deleted rows. So, store it into a DataFrame 
variable or use 'inplace = True' while droping 'rows'  or 'columns'***

In [57]:
# Here, it will drop column 'two', which is situated at axis=1

data_df = data_df.drop('two', axis = 1)
print(data_df)

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


In [58]:
data_df.drop('three', axis = 1, inplace = True)
print(data_df)

          one  four
Ohio        0     3
Colorado    4     7
Utah        8    11
New York   12    15


In [59]:
# Here, two rows having index-names 'Ohio' & 'Colorado' have been dropped

data_df.drop(['Ohio', 'Colorado'], inplace = True)
print(data_df)

          one  four
Utah        8    11
New York   12    15


# Indexing, Selection & Filtering

In [60]:
data_df = pd.DataFrame(np.arange(16).reshape(4,4),
                      index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                      columns = ['one', 'two', 'three', 'four'])
data_df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [61]:
# It will select column labels 'one' & 'three'

print(data_df[['one', 'three']])

          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [62]:
# It will select rows with index number '2 till end'

print(data_df[2:])

          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15


In [63]:
# It will display 'True' or 'False' where values of Column-Label 'three' are greater than '5'

df2 = data_df['three'] > 5
print(df2)

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool


In [64]:
#  This will select and display all data of DataFrame where column 'three' has value greater than '5'

print(data_df[data_df['three'] > 5])

          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


## Selection by Dictionery Style or Attribute Style

In [65]:
# A 4 x 4 ndarray is created using Numpy arange function and stored in Pandas DataFrame i.e. 'data_df'

data_df = pd.DataFrame(np.arange(16).reshape(4,4),
                     index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns = ['one', 'two', 'three', 'four'])

print(data_df)                     

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [66]:
# this command will select all records of column-label 'one'.
# this style of selection is called at python dictionery style.

print(data_df['one'])

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int32


In [67]:
# There is another option provided by Pandas to perform data selection
# This is called 'Attribute' Style selection
# In this method, sytax will be ====> DataFram_Name.Column_Lable

print(data_df.one)

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int32


In [68]:
# We can also perform further deep level selection within a column
# this will select rows having 'index-2 onwards' of column-lable 'one'

print(data_df.one[2:])

Utah         8
New York    12
Name: one, dtype: int32


In [69]:
# This is Python's Dictionery style procedure to do selection within a selection
# This will perform the similar selection as done by ===> print(data_df.one[2:])

print(data_df['one'][2:])

Utah         8
New York    12
Name: one, dtype: int32


In [70]:
# This will perform conditional filtering to select all row-indexes and column-labels where value of column 'three' is > 5

print(data_df[data_df['three'] > 5])

          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [71]:
# This will select boolean value of column 'three' whether it's > 5 or not?
# It will return 'False' in case of row-index 'Ohio' and 'True' against all other row-indexes.

print(data_df.three > 5)

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool


In [72]:
# This will select the values of column-label 'three' having > 5
# column-label 'three' is used as attribute style, so it will only selelct greater than '5' values of only column-'three'

print(data_df.three[data_df['three'] > 5])

Colorado     6
Utah        10
New York    14
Name: three, dtype: int32


# DataFrame Re-index Method with 'ffill'

- Re-index is a 'method' of Pandas DataFrame
    - Re-index has a parameter which is also called as 'method'
    - This parameter has a value which is called 'ffill'
    - 'ffill' will replace the 'NaN' value with the value of it's previous index

In [73]:
# An object created using Pandas Series with only three indexes

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 3, 6])
print(obj3)

0      blue
3    purple
6    yellow
dtype: object


In [74]:
# this will add indexes upto 9 but values against added indexes will not be updated, so will be displayed as 'NaN'
# Python range function is used to increase indexes of already created variable i.e. obj3

obj3 = obj3.reindex(range(9))
print(obj3)

0      blue
1       NaN
2       NaN
3    purple
4       NaN
5       NaN
6    yellow
7       NaN
8       NaN
dtype: object


In [75]:
# Now, index of variable 'obj3' are increased upto '9'
# 'method' attribute is used to fill the index with previous index-value

obj3 = obj3.reindex(range(9), method='ffill')
print(obj3)

0      blue
1       NaN
2       NaN
3    purple
4       NaN
5       NaN
6    yellow
7       NaN
8       NaN
dtype: object


In [76]:
# A new DataFrame i.e. 'states' is created

states = pd.DataFrame(np.arange(9).reshape(3,3),
                     index = ['a', 'c', 'd'],
                     columns = ['Ohio', 'Texas', 'California'])

print(states)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8


In [77]:
# Here, DataFrame i.e. 'states' has been 'reindexed' but without 'method=ffill', so row at index-b will be added
# without any value, so values be displayed as 'NaN'

states = states.reindex(['a', 'b', 'c', 'd'])
print(states)

   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


In [78]:
# Here, DataFrame i.e. 'states' is 'reindexed' with added 'column' i.e. 'e'
# 'method = ffill' will fill the indexes of added 'column-e' with the values of previous index i.e. index-'d'

states = states.reindex(['a', 'c', 'd', 'e'], method='ffill')
print(states)

   Ohio  Texas  California
a   0.0    1.0         2.0
c   3.0    4.0         5.0
d   6.0    7.0         8.0
e   6.0    7.0         8.0


# Changing Column Label (Name) using 'reindex' method

In [79]:
# DataFrame is again created

states = pd.DataFrame(np.arange(9).reshape(3,3),
                     index = ['a', 'c', 'd'],
                     columns = ['Ohio', 'Texas', 'California'])

print(states)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8


In [80]:
# Re-indexing of DataFrame in another way

states_names = ['Texas', 'Utah', 'California', 'Ohio']   # Created an Array of 'state_names'
print("* * * DataFame 'states' before 'reindexing'")
print(states)
print()
states = states.reindex(columns = states_names)
print("* * * DataFame 'states' after 'reindexing' i.e. adding a column-Utah")
print(states)

* * * DataFame 'states' before 'reindexing'
   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8

* * * DataFame 'states' after 'reindexing' i.e. adding a column-Utah
   Texas  Utah  California  Ohio
a      1   NaN           2     0
c      4   NaN           5     3
d      7   NaN           8     6


#### At this point, practice of Lesson-2 Videos of PIAIC Student Portal (Quarter-2) are completed