# Pandas Introduction

In [1]:
# Import Pandas library
import pandas as pd

## Basic dataframe manipulation

In [2]:
# Build a DataFrame by passing a dictionary to the DataFrame constructor:
# every key becomes a column label and every list holds that column's values.
df = pd.DataFrame(
    data={
        'A': [1, 2, 3, 4, 5],
        'B': [10, 20, 30, 40, 50],
        'C': [100, 200, 300, 400, 500],
    }
)

df

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [3]:
# Append a new column 'D' holding, for every row, the total of columns A, B and C
df['D'] = df['A'].add(df['B']).add(df['C'])
df

Unnamed: 0,A,B,C,D
0,1,10,100,111
1,2,20,200,222
2,3,30,300,333
3,4,40,400,444
4,5,50,500,555


In [4]:
# Replace the default integer index with consecutive daily dates.
# periods is taken from len(df) instead of being hard-coded, so the new index
# always has exactly one entry per row and the assignment cannot fail on a
# length mismatch if the number of rows changes.
df.index = pd.date_range('2022-10-01', periods=len(df), freq='D')
df

Unnamed: 0,A,B,C,D
2022-10-01,1,10,100,111
2022-10-02,2,20,200,222
2022-10-03,3,30,300,333
2022-10-04,4,40,400,444
2022-10-05,5,50,500,555


## Basic operations and built-in functions

In [5]:
# Different ways to retrieve the mean of the dataframe

# Mean of every column. The reduction runs down the rows (axis=0): all the rows
# of a column are collapsed into a single value, leaving one mean per column.
# These spellings are equivalent:
#   df.mean()
#   df.mean(axis=0)
#   df.mean(axis='index')
# pandas also accepts axis='rows', but only 0 / 'index' are listed in the
# DataFrame.mean documentation, so the documented spelling is used here.
print("The mean of the dataframe is: ", df.mean(axis='index'))


# Mean of every row. The reduction runs across the columns (axis=1): all the
# columns of a row are collapsed into a single value, leaving one mean per row,
# labelled by the index.
# Equivalent spelling: df.mean(axis='columns')
print("The mean of the columns is: ", df.mean(axis=1))

# Rule of thumb: axis names the direction that gets collapsed. axis=0 / 'index'
# collapses the rows and yields one result per column; axis=1 / 'columns'
# collapses the columns and yields one result per row.

The mean of the dataframe is:  A      3.0
B     30.0
C    300.0
D    333.0
dtype: float64
The mean of the columns is:  2022-10-01     55.5
2022-10-02    111.0
2022-10-03    166.5
2022-10-04    222.0
2022-10-05    277.5
Freq: D, dtype: float64


In [6]:
# The axis parameter works the same way for every other built-in reduction,
# e.g. the largest value of each row:
df.max(axis='columns')

2022-10-01    111
2022-10-02    222
2022-10-03    333
2022-10-04    444
2022-10-05    555
Freq: D, dtype: int64

## Slice the dataframe, indexing and selecting data

In [7]:
# Let's see the different ways to slice a dataframe.
# Only the last expression of a cell is rendered automatically, so every slice
# is passed to display() (provided by the Jupyter/IPython kernel) explicitly;
# otherwise the first four results would be computed and silently discarded.

# First 3 rows
display(df[:3])

# Last 3 rows
display(df[-3:])

# First 3 columns
display(df.iloc[:, :3])

# Last 3 columns
display(df.iloc[:, -3:])

# First 3 rows and first 3 columns
display(df.iloc[:3, :3])

Unnamed: 0,A,B,C
2022-10-01,1,10,100
2022-10-02,2,20,200
2022-10-03,3,30,300


## Extra functionality

In [8]:
# Color the cells with the highest value in each column
def highlight_max(s):
    """Return one CSS string per element of ``s``, marking the maxima.

    Elements equal to ``s.max()`` get a yellow background; every other
    element gets an empty style string.
    """
    peak = s.max()
    return ['background-color: yellow' if value == peak else '' for value in s]

# axis=0 (the default) hands each column to highlight_max
df.style.apply(highlight_max, axis=0)


Unnamed: 0,A,B,C,D
2022-10-01 00:00:00,1,10,100,111
2022-10-02 00:00:00,2,20,200,222
2022-10-03 00:00:00,3,30,300,333
2022-10-04 00:00:00,4,40,400,444
2022-10-05 00:00:00,5,50,500,555


In [9]:
# Color the cells with the highest value in each row.
# highlight_max is already defined in the previous cell, so it is reused here
# instead of being defined a second time; axis=1 hands each row (rather than
# each column) to it.
df.style.apply(highlight_max, axis=1)


Unnamed: 0,A,B,C,D
2022-10-01 00:00:00,1,10,100,111
2022-10-02 00:00:00,2,20,200,222
2022-10-03 00:00:00,3,30,300,333
2022-10-04 00:00:00,4,40,400,444
2022-10-05 00:00:00,5,50,500,555


# A first example

In [10]:
import pandas as pd

In [11]:
# Load the iris dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='datasets/iris.csv')

In [12]:
# Peek at the first 5 rows
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
# Peek at the last 5 rows
df.tail(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [14]:
# Summarize the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [16]:
# Drop the Id column.
# Rebinding df to the returned frame (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError because 'Id' has already been removed.
df = df.drop(columns='Id', errors='ignore')

In [17]:
# Average every measurement within each species
df.groupby(by='Species').mean()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
Iris-virginica,6.588,2.974,5.552,2.026


In [18]:
# Per-species means, with the largest mean of each column highlighted
(
    df.groupby(by='Species')
    .mean()
    .style
    .apply(highlight_max, axis=0)
)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
Iris-virginica,6.588,2.974,5.552,2.026


In [19]:
# Mark the cells whose value is smaller than the mean of the Series they belong to
def highlight_mean(s):
    """Return one CSS string per element of ``s``, marking below-mean values.

    Elements strictly smaller than ``s.mean()`` get an orange background;
    every other element gets an empty style string.
    """
    threshold = s.mean()
    return ['background-color: orange' if value < threshold else '' for value in s]

# Apply highlight_mean column by column to the per-species means
(
    df.groupby(by='Species')
    .mean()
    .style
    .apply(highlight_mean, axis=0)
)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
Iris-virginica,6.588,2.974,5.552,2.026
