In [1]:
import numpy as np
import pandas as pd

### Series is 1D and DataFrames are 2D objects
- But why?
- And what exactly is index?

In [2]:
# Can we have multiple index? Let's see
index_val = [('cse',2019),('cse',2020),('cse',2021),('cse',2022),('ece',2019),('ece',2020),('ece',2021),('ece',2022)]
a = pd.Series([1,2,3,4,5,6,7,8],index=index_val)
a

(cse, 2019)    1
(cse, 2020)    2
(cse, 2021)    3
(cse, 2022)    4
(ece, 2019)    5
(ece, 2020)    6
(ece, 2021)    7
(ece, 2022)    8
dtype: int64

In [3]:
a[('cse',2022)]

4

In [4]:
# The problem?
a['cse']
# Both the index are not independent from each other

KeyError: 'cse'

In [5]:
# The solution -> multiindex Series (also known as Hierarchial indexing)
# multiple index levels within a single index

In [6]:
# How to create multiindex object
# 1. pd.MultiIndex.from_tuples()
index_val = [('cse',2019),('cse',2020),('cse',2021),('cse',2022),('ece',2019),('ece',2020),('ece',2021),('ece',2022)]
multi_index=pd.MultiIndex.from_tuples(index_val)

In [7]:
multi_index

MultiIndex([('cse', 2019),
            ('cse', 2020),
            ('cse', 2021),
            ('cse', 2022),
            ('ece', 2019),
            ('ece', 2020),
            ('ece', 2021),
            ('ece', 2022)],
           )

In [8]:
multi_index.levels

FrozenList([['cse', 'ece'], [2019, 2020, 2021, 2022]])

In [9]:
multi_index.levels[0]

Index(['cse', 'ece'], dtype='object')

In [10]:
multi_index.levels[1]

Index([2019, 2020, 2021, 2022], dtype='int64')

In [11]:
# 2. pd.MultiIndex.from_product()
pro_index_val = pd.MultiIndex.from_product([['cse','ece'],[2019,2020,2021,2022]])

In [12]:
pro_index_val

MultiIndex([('cse', 2019),
            ('cse', 2020),
            ('cse', 2021),
            ('cse', 2022),
            ('ece', 2019),
            ('ece', 2020),
            ('ece', 2021),
            ('ece', 2022)],
           )

In [13]:
# Creating a series with multiindex
s = pd.Series([1,2,3,4,5,6,7,8],index=multi_index)

In [14]:
s

cse  2019    1
     2020    2
     2021    3
     2022    4
ece  2019    5
     2020    6
     2021    7
     2022    8
dtype: int64

In [15]:
# How to fetch from such a series
s[('cse',2021)]

3

In [16]:
s['cse']

2019    1
2020    2
2021    3
2022    4
dtype: int64

In [17]:
# Here we are using 2D representation for multiindexing (as to access a value we need two things to provide -> branch and year) so why not use DataFrame

#### Unstack

In [18]:
temp = s.unstack() # neeche waala (from cse/ece) index becomes column (2019,2020,2021,2022)

In [19]:
temp

Unnamed: 0,2019,2020,2021,2022
cse,1,2,3,4
ece,5,6,7,8


#### Stack

In [20]:
temp.stack()

cse  2019    1
     2020    2
     2021    3
     2022    4
ece  2019    5
     2020    6
     2021    7
     2022    8
dtype: int64

##### multiindex -> convert higher dimension data to lower dimension data

In [21]:
# multiindex dataframe
branch_df1 = pd.DataFrame(
    [
        [1,3],
        [2,4],
        [5,7],
        [7,8],
        [9,10],
        [11,12],
        [12,13],
        [15,16]
    ],
    index = multi_index,
    columns = ['avg_package','students']
)
branch_df1 # This is 3D as we need to provide 3 things to get one data like if you want value from student column you need to pass branch, year, students

Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,3
cse,2020,2,4
cse,2021,5,7
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,12,13
ece,2022,15,16


In [22]:
branch_df1.loc['cse']

Unnamed: 0,avg_package,students
2019,1,3
2020,2,4
2021,5,7
2022,7,8


In [23]:
branch_df1.loc['ece']

Unnamed: 0,avg_package,students
2019,9,10
2020,11,12
2021,12,13
2022,15,16


In [24]:
branch_df1['avg_package']

cse  2019     1
     2020     2
     2021     5
     2022     7
ece  2019     9
     2020    11
     2021    12
     2022    15
Name: avg_package, dtype: int64

In [25]:
branch_df1['students']

cse  2019     3
     2020     4
     2021     7
     2022     8
ece  2019    10
     2020    12
     2021    13
     2022    16
Name: students, dtype: int64

In [26]:
# multiindex df from column perspective
branch_df2 = pd.DataFrame(
    [
        [1,2,0,0],
        [3,4,0,0],
        [5,6,0,0],
        [7,8,9,0]
    ],
    index = [2019,2020,2021,2022],
    columns = pd.MultiIndex.from_product([['Delhi','Mumbai'],['Avg_Package','Students']])
)
branch_df2 # column-wise hierarchical and is still 3D

Unnamed: 0_level_0,Delhi,Delhi,Mumbai,Mumbai
Unnamed: 0_level_1,Avg_Package,Students,Avg_Package,Students
2019,1,2,0,0
2020,3,4,0,0
2021,5,6,0,0
2022,7,8,9,0


In [27]:
branch_df1

Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,3
cse,2020,2,4
cse,2021,5,7
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,12,13
ece,2022,15,16


In [28]:
branch_df2['Delhi']

Unnamed: 0,Avg_Package,Students
2019,1,2
2020,3,4
2021,5,6
2022,7,8


In [29]:
branch_df2['Mumbai']

Unnamed: 0,Avg_Package,Students
2019,0,0
2020,0,0
2021,0,0
2022,9,0


In [30]:
branch_df2['Mumbai']['Avg_Package']

2019    0
2020    0
2021    0
2022    9
Name: Avg_Package, dtype: int64

In [31]:
branch_df2.loc[2019]

Delhi   Avg_Package    1
        Students       2
Mumbai  Avg_Package    0
        Students       0
Name: 2019, dtype: int64

In [32]:
# MultiIndex df in terms of both cols and index
branch_df3=pd.DataFrame([
        [1,2,0,0],
        [3,4,0,0],
        [5,6,0,0],
        [7,8,9,0],
        [9,10,0,0],
        [11,12,0,0],
        [13,14,0,0],
        [15,16,9,0]
    ],
    index = pd.MultiIndex.from_product([['cse','ece'],[2019,2020,2021,2022]]),
    columns = pd.MultiIndex.from_product([['Delhi','Varanasi'],['Avg_Package','Students']])
)
branch_df3 # This is a 4D DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


#### Stacking and Unstacking

In [33]:
branch_df1 # 3D data represented in 2D

Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,3
cse,2020,2,4
cse,2021,5,7
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,12,13
ece,2022,15,16


In [34]:
branch_df1.unstack() # andar waale index ko column mein convert kar dega
# column mein multiindexing

Unnamed: 0_level_0,avg_package,avg_package,avg_package,avg_package,students,students,students,students
Unnamed: 0_level_1,2019,2020,2021,2022,2019,2020,2021,2022
cse,1,2,5,7,3,4,7,8
ece,9,11,12,15,10,12,13,16


In [35]:
branch_df1.unstack().unstack()

avg_package  2019  cse     1
                   ece     9
             2020  cse     2
                   ece    11
             2021  cse     5
                   ece    12
             2022  cse     7
                   ece    15
students     2019  cse     3
                   ece    10
             2020  cse     4
                   ece    12
             2021  cse     7
                   ece    13
             2022  cse     8
                   ece    16
dtype: int64

In [36]:
type(branch_df1.unstack().unstack())

pandas.core.series.Series

In [37]:
branch_df1.unstack().stack() # stack andar waala column -> row mein convert

  branch_df1.unstack().stack() # stack andar waala column -> row mein convert


Unnamed: 0,Unnamed: 1,avg_package,students
cse,2019,1,3
cse,2020,2,4
cse,2021,5,7
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,12,13
ece,2022,15,16


In [38]:
branch_df1.unstack().stack().stack()

  branch_df1.unstack().stack().stack()


cse  2019  avg_package     1
           students        3
     2020  avg_package     2
           students        4
     2021  avg_package     5
           students        7
     2022  avg_package     7
           students        8
ece  2019  avg_package     9
           students       10
     2020  avg_package    11
           students       12
     2021  avg_package    12
           students       13
     2022  avg_package    15
           students       16
dtype: int64

In [39]:
branch_df1.unstack().unstack()

avg_package  2019  cse     1
                   ece     9
             2020  cse     2
                   ece    11
             2021  cse     5
                   ece    12
             2022  cse     7
                   ece    15
students     2019  cse     3
                   ece    10
             2020  cse     4
                   ece    12
             2021  cse     7
                   ece    13
             2022  cse     8
                   ece    16
dtype: int64

In [40]:
branch_df1.unstack().unstack().unstack()

Unnamed: 0,Unnamed: 1,cse,ece
avg_package,2019,1,9
avg_package,2020,2,11
avg_package,2021,5,12
avg_package,2022,7,15
students,2019,3,10
students,2020,4,12
students,2021,7,13
students,2022,8,16


### Unstack: Row (andar waale) -> Column
### Stack : Column (andar waale) -> Row

In [41]:
branch_df1.unstack().unstack().unstack().unstack()

Unnamed: 0_level_0,cse,cse,cse,cse,ece,ece,ece,ece
Unnamed: 0_level_1,2019,2020,2021,2022,2019,2020,2021,2022
avg_package,1,2,5,7,9,11,12,15
students,3,4,7,8,10,12,13,16


In [42]:
branch_df2

Unnamed: 0_level_0,Delhi,Delhi,Mumbai,Mumbai
Unnamed: 0_level_1,Avg_Package,Students,Avg_Package,Students
2019,1,2,0,0
2020,3,4,0,0
2021,5,6,0,0
2022,7,8,9,0


In [43]:
branch_df2.stack()

  branch_df2.stack()


Unnamed: 0,Unnamed: 1,Delhi,Mumbai
2019,Avg_Package,1,0
2019,Students,2,0
2020,Avg_Package,3,0
2020,Students,4,0
2021,Avg_Package,5,0
2021,Students,6,0
2022,Avg_Package,7,9
2022,Students,8,0


In [44]:
branch_df2.unstack()

Delhi   Avg_Package  2019    1
                     2020    3
                     2021    5
                     2022    7
        Students     2019    2
                     2020    4
                     2021    6
                     2022    8
Mumbai  Avg_Package  2019    0
                     2020    0
                     2021    0
                     2022    9
        Students     2019    0
                     2020    0
                     2021    0
                     2022    0
dtype: int64

In [45]:
branch_df2

Unnamed: 0_level_0,Delhi,Delhi,Mumbai,Mumbai
Unnamed: 0_level_1,Avg_Package,Students,Avg_Package,Students
2019,1,2,0,0
2020,3,4,0,0
2021,5,6,0,0
2022,7,8,9,0


In [46]:
branch_df2.stack().stack()

  branch_df2.stack().stack()


2019  Avg_Package  Delhi     1
                   Mumbai    0
      Students     Delhi     2
                   Mumbai    0
2020  Avg_Package  Delhi     3
                   Mumbai    0
      Students     Delhi     4
                   Mumbai    0
2021  Avg_Package  Delhi     5
                   Mumbai    0
      Students     Delhi     6
                   Mumbai    0
2022  Avg_Package  Delhi     7
                   Mumbai    9
      Students     Delhi     8
                   Mumbai    0
dtype: int64

In [47]:
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [48]:
branch_df3.unstack()

Unnamed: 0_level_0,Delhi,Delhi,Delhi,Delhi,Delhi,Delhi,Delhi,Delhi,Varanasi,Varanasi,Varanasi,Varanasi,Varanasi,Varanasi,Varanasi,Varanasi
Unnamed: 0_level_1,Avg_Package,Avg_Package,Avg_Package,Avg_Package,Students,Students,Students,Students,Avg_Package,Avg_Package,Avg_Package,Avg_Package,Students,Students,Students,Students
Unnamed: 0_level_2,2019,2020,2021,2022,2019,2020,2021,2022,2019,2020,2021,2022,2019,2020,2021,2022
cse,1,3,5,7,2,4,6,8,0,0,0,9,0,0,0,0
ece,9,11,13,15,10,12,14,16,0,0,0,9,0,0,0,0


In [49]:
branch_df3.unstack().unstack() # 4D Series

Delhi     Avg_Package  2019  cse     1
                             ece     9
                       2020  cse     3
                             ece    11
                       2021  cse     5
                             ece    13
                       2022  cse     7
                             ece    15
          Students     2019  cse     2
                             ece    10
                       2020  cse     4
                             ece    12
                       2021  cse     6
                             ece    14
                       2022  cse     8
                             ece    16
Varanasi  Avg_Package  2019  cse     0
                             ece     0
                       2020  cse     0
                             ece     0
                       2021  cse     0
                             ece     0
                       2022  cse     9
                             ece     9
          Students     2019  cse     0
                         

In [50]:
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [51]:
branch_df3.stack()

  branch_df3.stack()


Unnamed: 0,Unnamed: 1,Unnamed: 2,Delhi,Varanasi
cse,2019,Avg_Package,1,0
cse,2019,Students,2,0
cse,2020,Avg_Package,3,0
cse,2020,Students,4,0
cse,2021,Avg_Package,5,0
cse,2021,Students,6,0
cse,2022,Avg_Package,7,9
cse,2022,Students,8,0
ece,2019,Avg_Package,9,0
ece,2019,Students,10,0


In [52]:
branch_df3.stack().stack()

  branch_df3.stack().stack()


cse  2019  Avg_Package  Delhi        1
                        Varanasi     0
           Students     Delhi        2
                        Varanasi     0
     2020  Avg_Package  Delhi        3
                        Varanasi     0
           Students     Delhi        4
                        Varanasi     0
     2021  Avg_Package  Delhi        5
                        Varanasi     0
           Students     Delhi        6
                        Varanasi     0
     2022  Avg_Package  Delhi        7
                        Varanasi     9
           Students     Delhi        8
                        Varanasi     0
ece  2019  Avg_Package  Delhi        9
                        Varanasi     0
           Students     Delhi       10
                        Varanasi     0
     2020  Avg_Package  Delhi       11
                        Varanasi     0
           Students     Delhi       12
                        Varanasi     0
     2021  Avg_Package  Delhi       13
                        V

In [53]:
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [54]:
branch_df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0


In [55]:
branch_df3.shape

(8, 4)

In [56]:
branch_df3.size

32

In [57]:
branch_df3.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8 entries, ('cse', 2019) to ('ece', 2022)
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   (Delhi, Avg_Package)     8 non-null      int64
 1   (Delhi, Students)        8 non-null      int64
 2   (Varanasi, Avg_Package)  8 non-null      int64
 3   (Varanasi, Students)     8 non-null      int64
dtypes: int64(4)
memory usage: 632.0+ bytes


In [58]:
branch_df3.unstack().info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, cse to ece
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   (Delhi, Avg_Package, 2019)     2 non-null      int64
 1   (Delhi, Avg_Package, 2020)     2 non-null      int64
 2   (Delhi, Avg_Package, 2021)     2 non-null      int64
 3   (Delhi, Avg_Package, 2022)     2 non-null      int64
 4   (Delhi, Students, 2019)        2 non-null      int64
 5   (Delhi, Students, 2020)        2 non-null      int64
 6   (Delhi, Students, 2021)        2 non-null      int64
 7   (Delhi, Students, 2022)        2 non-null      int64
 8   (Varanasi, Avg_Package, 2019)  2 non-null      int64
 9   (Varanasi, Avg_Package, 2020)  2 non-null      int64
 10  (Varanasi, Avg_Package, 2021)  2 non-null      int64
 11  (Varanasi, Avg_Package, 2022)  2 non-null      int64
 12  (Varanasi, Students, 2019)     2 non-null      int64
 13  (Varanasi, Students, 2020

In [59]:
branch_df3.duplicated()

cse  2019    False
     2020    False
     2021    False
     2022    False
ece  2019    False
     2020    False
     2021    False
     2022    False
dtype: bool

In [60]:
branch_df3.isnull()

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,False,False,False,False
cse,2020,False,False,False,False
cse,2021,False,False,False,False
cse,2022,False,False,False,False
ece,2019,False,False,False,False
ece,2020,False,False,False,False
ece,2021,False,False,False,False
ece,2022,False,False,False,False


In [62]:
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [63]:
branch_df3.loc[('cse',2022)]

Delhi     Avg_Package    7
          Students       8
Varanasi  Avg_Package    9
          Students       0
Name: (cse, 2022), dtype: int64

In [67]:
branch_df3.loc[('cse',2019):('ece',2020):2] # start:jahan_tk+1:jump

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2021,5,6,0,0
ece,2019,9,10,0,0


In [68]:
branch_df3.iloc[0]

Delhi     Avg_Package    1
          Students       2
Varanasi  Avg_Package    0
          Students       0
Name: (cse, 2019), dtype: int64

In [69]:
branch_df3.iloc[1]

Delhi     Avg_Package    3
          Students       4
Varanasi  Avg_Package    0
          Students       0
Name: (cse, 2020), dtype: int64

In [70]:
branch_df3.iloc[0:5:2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2021,5,6,0,0
ece,2019,9,10,0,0


In [71]:
# Extracting cols
branch_df3['Delhi']

Unnamed: 0,Unnamed: 1,Avg_Package,Students
cse,2019,1,2
cse,2020,3,4
cse,2021,5,6
cse,2022,7,8
ece,2019,9,10
ece,2020,11,12
ece,2021,13,14
ece,2022,15,16


In [72]:
branch_df3['Delhi']['Students']

cse  2019     2
     2020     4
     2021     6
     2022     8
ece  2019    10
     2020    12
     2021    14
     2022    16
Name: Students, dtype: int64

In [73]:
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [76]:
# Delhi ka Students and Varanasi ka Avg_Package
branch_df3.iloc[:,1:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Students,Avg_Package
cse,2019,2,0
cse,2020,4,0
cse,2021,6,0
cse,2022,8,9
ece,2019,10,0
ece,2020,12,0
ece,2021,14,0
ece,2022,16,9


In [79]:
# cse and ece ka first row and Delhi ka Students and Varanasi ka Avg_Package
branch_df3.iloc[[0,4],[1,2]] # .iloc[[rows],[columns]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Students,Avg_Package
cse,2019,2,0
ece,2019,10,0


In [80]:
# sort index
# both -> descending -> diff order
# based on one level
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [82]:
branch_df3.sort_index(ascending=False) # dono level par sorting hoga

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
ece,2022,15,16,9,0
ece,2021,13,14,0,0
ece,2020,11,12,0,0
ece,2019,9,10,0,0
cse,2022,7,8,9,0
cse,2021,5,6,0,0
cse,2020,3,4,0,0
cse,2019,1,2,0,0


In [83]:
branch_df3.sort_index(ascending=[False,True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0


In [85]:
branch_df3.sort_index(level=1, ascending=False) # level 0 = branch, level 1 = year

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
ece,2022,15,16,9,0
cse,2022,7,8,9,0
ece,2021,13,14,0,0
cse,2021,5,6,0,0
ece,2020,11,12,0,0
cse,2020,3,4,0,0
ece,2019,9,10,0,0
cse,2019,1,2,0,0


In [86]:
branch_df3.sort_index(level=0, ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
ece,2022,15,16,9,0
ece,2021,13,14,0,0
ece,2020,11,12,0,0
ece,2019,9,10,0,0
cse,2022,7,8,9,0
cse,2021,5,6,0,0
cse,2020,3,4,0,0
cse,2019,1,2,0,0


#### Transpose

In [88]:
branch_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


In [89]:
branch_df3.transpose()

Unnamed: 0_level_0,Unnamed: 1_level_0,cse,cse,cse,cse,ece,ece,ece,ece
Unnamed: 0_level_1,Unnamed: 1_level_1,2019,2020,2021,2022,2019,2020,2021,2022
Delhi,Avg_Package,1,3,5,7,9,11,13,15
Delhi,Students,2,4,6,8,10,12,14,16
Varanasi,Avg_Package,0,0,0,9,0,0,0,9
Varanasi,Students,0,0,0,0,0,0,0,0


#### Swaplevel

In [90]:
branch_df3.swaplevel()

Unnamed: 0_level_0,Unnamed: 1_level_0,Delhi,Delhi,Varanasi,Varanasi
Unnamed: 0_level_1,Unnamed: 1_level_1,Avg_Package,Students,Avg_Package,Students
2019,cse,1,2,0,0
2020,cse,3,4,0,0
2021,cse,5,6,0,0
2022,cse,7,8,9,0
2019,ece,9,10,0,0
2020,ece,11,12,0,0
2021,ece,13,14,0,0
2022,ece,15,16,9,0


In [91]:
branch_df3.swaplevel(axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg_Package,Students,Avg_Package,Students
Unnamed: 0_level_1,Unnamed: 1_level_1,Delhi,Delhi,Varanasi,Varanasi
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,9,0
ece,2019,9,10,0,0
ece,2020,11,12,0,0
ece,2021,13,14,0,0
ece,2022,15,16,9,0


#### Long vs Wide data

![LW.png](LW.png)

##### - melt -> wide to long
##### - pivot -> long to wide

In [97]:
pd.DataFrame({'cse':[120]})

Unnamed: 0,cse
0,120


In [100]:
pd.DataFrame({'cse':[120]}).melt() # column became row

Unnamed: 0,variable,value
0,cse,120


In [104]:
pd.DataFrame({'cse':[120],'ece':[100],'mech':[50]}) # Wide Data format

Unnamed: 0,cse,ece,mech
0,120,100,50


In [105]:
pd.DataFrame({'cse':[120],'ece':[100],'mech':[50]}).melt()

Unnamed: 0,variable,value
0,cse,120
1,ece,100
2,mech,50


In [106]:
pd.DataFrame({'cse':[120],'ece':[100],'mech':[50]}).melt(var_name='branch',value_name='num_students')

Unnamed: 0,branch,num_students
0,cse,120
1,ece,100
2,mech,50


In [108]:
pd.DataFrame({
    'branch':['cse','ece','mech'],
    '2020':[300,400,20],
    '2021':[900,300,20],
    '2022':[2000,350,30]
}) # Wide data format

Unnamed: 0,branch,2020,2021,2022
0,cse,300,900,2000
1,ece,400,300,350
2,mech,20,20,30


In [110]:
# long data format
pd.DataFrame({
    'branch':['cse','ece','mech'],
    '2020':[300,400,20],
    '2021':[900,300,20],
    '2022':[2000,350,30]
}).melt()  # column became row

Unnamed: 0,variable,value
0,branch,cse
1,branch,ece
2,branch,mech
3,2020,300
4,2020,400
5,2020,20
6,2021,900
7,2021,300
8,2021,20
9,2022,2000


In [112]:
# But don't want all columns to become row
pd.DataFrame({
    'branch':['cse','ece','mech'],
    '2020':[300,400,20],
    '2021':[900,300,20],
    '2022':[2000,350,30]
}).melt(id_vars=['branch']) # id_vars=<jisko row nahi banana>

Unnamed: 0,branch,variable,value
0,cse,2020,300
1,ece,2020,400
2,mech,2020,20
3,cse,2021,900
4,ece,2021,300
5,mech,2021,20
6,cse,2022,2000
7,ece,2022,350
8,mech,2022,30


In [113]:
pd.DataFrame({
    'branch':['cse','ece','mech'],
    '2020':[300,400,20],
    '2021':[900,300,20],
    '2022':[2000,350,30]
}).melt(id_vars=['branch'],var_name='year',value_name='students')

Unnamed: 0,branch,year,students
0,cse,2020,300
1,ece,2020,400
2,mech,2020,20
3,cse,2021,900
4,ece,2021,300
5,mech,2021,20
6,cse,2022,2000
7,ece,2022,350
8,mech,2022,30


In [117]:
death=pd.read_csv('time_series_covid19_deaths_global.csv')

In [118]:
confirm=pd.read_csv('time_series_covid19_confirmed_global.csv')

In [119]:
death.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/24/22,12/25/22,12/26/22,12/27/22,12/28/22,12/29/22,12/30/22,12/31/22,1/1/23,1/2/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7845,7846,7846,7846,7846,7847,7847,7849,7849,7849
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3595,3595,3595,3595,3595,3595,3595,3595,3595,3595
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,6881,6881,6881,6881,6881,6881,6881,6881,6881,6881
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,165,165,165,165,165,165,165,165,165,165
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1928,1928,1928,1930,1930,1930,1930,1930,1930,1930


In [121]:
death.shape # wide format as each country has only one row

(289, 1081)

In [123]:
confirm.head() # this is also in wide format

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/24/22,12/25/22,12/26/22,12/27/22,12/28/22,12/29/22,12/30/22,12/31/22,1/1/23,1/2/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,207310,207399,207438,207460,207493,207511,207550,207559,207616,207627
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,333749,333749,333751,333751,333776,333776,333806,333806,333811,333812
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271194,271198,271198,271202,271208,271217,271223,271228,271229,271229
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47686,47686,47686,47686,47751,47751,47751,47751,47751,47751
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,104973,104973,104973,105095,105095,105095,105095,105095,105095,105095


In [125]:
# country -> date -> confirm -> death : country will repeat as on each date there will be some data

In [127]:
death.head(2)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/24/22,12/25/22,12/26/22,12/27/22,12/28/22,12/29/22,12/30/22,12/31/22,1/1/23,1/2/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7845,7846,7846,7846,7846,7847,7847,7849,7849,7849
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3595,3595,3595,3595,3595,3595,3595,3595,3595,3595


In [131]:
death = death.melt(id_vars=['Province/State','Country/Region','Lat','Long'],var_name='date',value_name='num_deaths')

In [132]:
death.melt(id_vars=['Province/State','Country/Region','Lat','Long']).shape

(622506, 6)

In [133]:
confirm = confirm.melt(id_vars=['Province/State','Country/Region','Lat','Long'],var_name='date',value_name='num_cases')

In [134]:
confirm.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,num_cases
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [135]:
death.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,num_deaths
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [136]:
confirm.merge(death, how='inner',on=['Province/State','Country/Region','Lat','Long','date'])

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,num_cases,num_deaths
0,,Afghanistan,33.939110,67.709953,1/22/20,0,0
1,,Albania,41.153300,20.168300,1/22/20,0,0
2,,Algeria,28.033900,1.659600,1/22/20,0,0
3,,Andorra,42.506300,1.521800,1/22/20,0,0
4,,Angola,-11.202700,17.873900,1/22/20,0,0
...,...,...,...,...,...,...,...
311248,,West Bank and Gaza,31.952200,35.233200,1/2/23,703228,5708
311249,,Winter Olympics 2022,39.904200,116.407400,1/2/23,535,0
311250,,Yemen,15.552727,48.516388,1/2/23,11945,2159
311251,,Zambia,-13.133897,27.849332,1/2/23,334661,4024


In [137]:
confirm.merge(death, how='inner',on=['Province/State','Country/Region','Lat','Long','date'])[['Country/Region','date','num_deaths','num_cases']]

Unnamed: 0,Country/Region,date,num_deaths,num_cases
0,Afghanistan,1/22/20,0,0
1,Albania,1/22/20,0,0
2,Algeria,1/22/20,0,0
3,Andorra,1/22/20,0,0
4,Angola,1/22/20,0,0
...,...,...,...,...
311248,West Bank and Gaza,1/2/23,5708,703228
311249,Winter Olympics 2022,1/2/23,0,535
311250,Yemen,1/2/23,2159,11945
311251,Zambia,1/2/23,4024,334661
