In [3]:
import pandas as pd
from typing import List

In [2]:
# Load the participants table from an Excel workbook in the current working directory.
# The result is only displayed here; it is not assigned to a variable.
pd.read_excel('./participants.xlsx')

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


In [4]:
# Build the participants table by hand: one inner list per person,
# in the order name, age, country, score, continent.
data: List[list] = [
    ['Mark', 55, 'Italy', 4.5, 'Europe'],
    ['John', 33, 'USA', 6.7, 'America'],
    ['Tim', 41, 'USA', 3.9, 'America'],
    ['Jenny', 12, 'Germany', 9.0, 'Europe'],
]

# Rows are labelled with explicit ids instead of the default 0..n-1 positions.
df: pd.DataFrame = pd.DataFrame(
    data,
    index=[1001, 1000, 1002, 1003],
    columns=['name', 'age', 'country', 'score', 'continent'],
)

df

Unnamed: 0,name,age,country,score,continent
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [5]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 1001 to 1003
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       4 non-null      object 
 1   age        4 non-null      int64  
 2   country    4 non-null      object 
 3   score      4 non-null      float64
 4   continent  4 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 192.0+ bytes


# Index

In [6]:
# The row labels of the frame (the values passed as `index=` when `df` was built).
df.index

Index([1001, 1000, 1002, 1003], dtype='int64')

In [7]:
# Give the index a name so its meaning is visible whenever the frame is displayed.
# The name is set on `df`'s own index, so every later cell sees it too.
df.index.name = 'user_id'
df

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


## "reset_index" turns the index into a column, replacing the index with the default index.

In [8]:
# Returns a new frame with `user_id` as an ordinary column and a fresh 0..n-1 index;
# `df` itself keeps `user_id` as its index.
df.reset_index()

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


## "reset_index" turns "user_id" into a regular column and "set_index" turns the column "name" into the index

In [9]:
# Two chained steps, each returning a new frame: first demote `user_id` to a column,
# then promote `name` to be the index. `df` is left untouched.
df.reset_index().set_index("name")

Unnamed: 0_level_0,user_id,age,country,score,continent
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mark,1001,55,Italy,4.5,Europe
John,1000,33,USA,6.7,America
Tim,1002,41,USA,3.9,America
Jenny,1003,12,Germany,9.0,Europe


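## "reindex" conforms the DataFrame to a new list of index labels: rows are selected and ordered by label, and labels missing from the original index produce rows filled with NaN (existing index values are not renamed)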

In [10]:
# Conform the frame to the requested labels: labels that exist keep their rows (in the
# requested order), 1002 is dropped because it is not listed, and 999 has no matching
# row so it is filled with NaN. `age` is displayed as float because an int64 column
# cannot hold NaN.
df.reindex([999, 1000, 1001, 1003])

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
999,,,,,
1000,John,33.0,USA,6.7,America
1001,Mark,55.0,Italy,4.5,Europe
1003,Jenny,12.0,Germany,9.0,Europe


### sorting index

In [12]:
# Order the rows by index label (ascending by default); the result is a new frame.
df.sort_index()

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,John,33,USA,6.7,America
1001,Mark,55,Italy,4.5,Europe
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [13]:
# `sort_index` returned a sorted copy, so `df` still has its original row order
# (1001, 1000, 1002, 1003).
df

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


### sorting rows by multiple columns

In [14]:
# Sort by `continent` first; rows that share a continent are then ordered by `age`.
df.sort_values(['continent', 'age'])

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe
1001,Mark,55,Italy,4.5,Europe


### sorting rows by single column

In [15]:
# Sort by `continent` only; a single column can be passed as a plain string.
df.sort_values(by='continent')

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1001,Mark,55,Italy,4.5,Europe
1003,Jenny,12,Germany,9.0,Europe


# Column

In [17]:
# Get the columns of a DataFrame
df.columns

Index(['name', 'age', 'country', 'score', 'continent'], dtype='object')

## Give a name to the set of columns

In [18]:
# Name the columns axis; the name is shown in the header row of the displayed frame.
# It is set on `df` itself and therefore persists in later cells.
df.columns.name = 'properties'
df

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


### rename columns

In [19]:
# Map old column labels to new ones; columns not mentioned keep their names.
# The result is a renamed copy, so `df` still uses 'name' and 'age'.
df.rename(columns={'name': 'First Name', 'age': 'Age'})

properties,First Name,Age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


## Delete particular columns and rows

In [20]:
# Remove columns and rows in one call: `columns=` takes column labels and `index=`
# takes row labels. The result is a new frame; `df` is not modified.
df.drop(columns=['name', 'country'], index=[1000, 1003])

properties,age,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,55,4.5,Europe
1002,41,3.9,America


## Transposition of the DataFrame

In [21]:
# `.T` swaps the axes: the user ids become the columns and the properties become the rows.
df.T

user_id,1001,1000,1002,1003
properties,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
name,Mark,John,Tim,Jenny
age,55,33,41,12
country,Italy,USA,USA,Germany
score,4.5,6.7,3.9,9.0
continent,Europe,America,America,Europe


In [22]:
# `transpose()` is the method form of `.T` and produces the same result.
df.transpose()

user_id,1001,1000,1002,1003
properties,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
name,Mark,John,Tim,Jenny
age,55,33,41,12
country,Italy,USA,USA,Germany
score,4.5,6.7,3.9,9.0
continent,Europe,America,America,Europe


## Reorder the columns

In [23]:
# `:` keeps every row; the list names the columns in the order they should appear.
df.loc[:, ['continent', 'country', 'name', 'age', 'score']]

properties,continent,country,name,age,score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Europe,Italy,Mark,55,4.5
1000,America,USA,John,33,6.7
1002,America,USA,Tim,41,3.9
1003,Europe,Germany,Jenny,12,9.0


## Selecting data by label
#### df.loc[row_selection, column_selection]

#### Label-based slicing with df.loc includes the upper bound (unlike Python's positional slicing, which excludes it)

In [24]:
# One row label plus one column label selects a single scalar value.
df.loc[1001, 'name']

'Mark'

In [25]:
# A list of row labels with one column label returns a Series indexed by those labels.
df.loc[[1001, 1002], 'age']

user_id
1001    55
1002    41
Name: age, dtype: int64

In [27]:
# The slice runs up to and including label 1002 and follows the index's stored order
# (1001, 1000, 1002), not numeric order; 1003 is excluded because it comes after 1002.
df.loc[:1002, ['name', 'country']]

properties,name,country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,Mark,Italy
1000,John,USA
1002,Tim,USA
