# Pandas Fundamentals: DataFrames

This notebook contains examples of fundamental pandas DataFrame operations: storing and loading data, filtering rows, transforming frames, and basic analysis.



In [1]:
# Install required dependencies (uncomment to run).
# Prefer the %pip magic over !pip so the install targets the active kernel's environment.
# pyarrow is presumably needed for the Parquet example below — confirm against your pandas install.
# %pip install pandas pyarrow

In [2]:
import pandas as pd
from pandas.testing import assert_frame_equal

# Sample data shared by every example in this notebook
person_dict = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'San Francisco', 'Seattle'],
)

# Three yearly observations (1998-2000) for each of the three cities;
# note that Year is stored as strings, not integers
animals_dict = dict(
    City=['New York', 'San Francisco', 'Seattle'] * 3,
    Pigeons=[
        250000, 300000, 350000,
        350000, 400000, 550000,
        310000, 410000, 510000,
    ],
    Rats=[
        1250000, 1300000, 1350000,
        2350000, 2400000, 2550000,
        3310000, 3410000, 3510000,
    ],
    Year=['1998'] * 3 + ['1999'] * 3 + ['2000'] * 3,
)

# Base name (without extension) of the files written by the store/load examples
file_prefix = '01_dataframes'

## 1. Store and Load

---

### 1.1. Parquet

In [3]:
# Round-trip the people table through a Parquet file
persons_df = pd.DataFrame(data=person_dict)
parquet_file = f'{file_prefix}.parquet'
persons_df.to_parquet(path=parquet_file)

# Load the file back so it can be compared with the original frame
parquet_df = pd.read_parquet(path=parquet_file)
parquet_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [4]:
# The frame read from Parquet must match the frame that was written
# (raises AssertionError otherwise)
assert_frame_equal(left=persons_df, right=parquet_df)

---

### 1.2. JSON

In [5]:
json_file = f'{file_prefix}.json'

# Write: orient='records' stores the table as a list with one JSON object per row
persons_df = pd.DataFrame(person_dict)
persons_df.to_json(json_file, orient='records')

# Read: state the same orient that was used for writing, so the round trip
# does not depend on read_json's default orient
json_df = pd.read_json(json_file, orient='records')
json_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [6]:
# The frame read from JSON must match the frame that was written
# (raises AssertionError otherwise)
assert_frame_equal(left=persons_df, right=json_df)

---

### 1.3. CSV

In [7]:
# Round-trip the people table through a CSV file
persons_df = pd.DataFrame(data=person_dict)
csv_file = f'{file_prefix}.csv'
# index=False keeps the row labels out of the file
persons_df.to_csv(path_or_buf=csv_file, index=False)

# Load the file back so it can be compared with the original frame
csv_df = pd.read_csv(filepath_or_buffer=csv_file)
csv_df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [8]:
# The frame read from CSV must match the frame that was written
# (raises AssertionError otherwise)
assert_frame_equal(left=persons_df, right=csv_df)

## 2. Filter Row Data

---

### 2.1. *iloc* property

In [9]:
# Build the full people table; the iloc examples that follow slice it by position
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [10]:
# Positional slice: skip the first row and keep every remaining row, all columns
us_west_df = persons_df.iloc[1:, :]
us_west_df

Unnamed: 0,Name,Age,City
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [11]:
# Re-create the people table so this example starts from the unmodified data
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [12]:
# Keep the rows before and after position 1, then renumber the index from 0
us_north_df = pd.concat(
    objs=[persons_df.iloc[:1], persons_df.iloc[2:]],
    ignore_index=True,
)
us_north_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Charlie,35,Seattle


In [13]:
# Re-create the people table so this example starts from the unmodified data
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [14]:
# Rows at positions 0-1 and columns at positions 1-2 (Age and City); Name is left out
young_anonymous_df = persons_df.iloc[slice(None, 2), slice(1, 3)]
young_anonymous_df

Unnamed: 0,Age,City
0,25,New York
1,30,San Francisco


---

### 2.2. Conditionals

In [15]:
# Fresh copy of the people table for the conditional-filter example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [16]:
# Boolean mask: keep every row whose City differs from 'San Francisco'
# (the original row labels are preserved)
not_silicon_valley_df = persons_df.loc[persons_df['City'].ne('San Francisco')]
not_silicon_valley_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Seattle


## 3. Transformations

---

### 3.1. *drop* method

In [17]:
# Fresh copy of the people table for the drop example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [18]:
# drop returns a new frame without the Name column; persons_df itself is unchanged
anonymous_df = persons_df.drop(columns='Name')
anonymous_df

Unnamed: 0,Age,City
0,25,New York
1,30,San Francisco
2,35,Seattle


In [19]:
# Build the animals table (one row per city and year)
animals_df = pd.DataFrame(data=animals_dict)
animals_df

Unnamed: 0,City,Pigeons,Rats,Year
0,New York,250000,1250000,1998
1,San Francisco,300000,1300000,1998
2,Seattle,350000,1350000,1998
3,New York,350000,2350000,1999
4,San Francisco,400000,2400000,1999
5,Seattle,550000,2550000,1999
6,New York,310000,3310000,2000
7,San Francisco,410000,3410000,2000
8,Seattle,510000,3510000,2000


In [20]:
# Several columns can be dropped at once by passing a list of labels
city_df = animals_df.drop(['Rats', 'Pigeons'], axis='columns')
city_df

Unnamed: 0,City,Year
0,New York,1998
1,San Francisco,1998
2,Seattle,1998
3,New York,1999
4,San Francisco,1999
5,Seattle,1999
6,New York,2000
7,San Francisco,2000
8,Seattle,2000


---
### 3.2. Append columns

In [21]:
# Fresh copy of the people table for the append-column example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [22]:
# Single-column frame holding one occupation per person
occupation_df = pd.DataFrame(data={'Occupation': ['Lawyer', 'Engineer', 'Musician']})

# Assigning a Series as a column matches rows by index label (0, 1, 2 in both frames)
persons_df['Occupation'] = occupation_df.loc[:, 'Occupation']
persons_df

Unnamed: 0,Name,Age,City,Occupation
0,Alice,25,New York,Lawyer
1,Bob,30,San Francisco,Engineer
2,Charlie,35,Seattle,Musician


---
### 3.3. *apply* method

In [23]:
# Fresh copy of the people table for the apply example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [24]:
def age_group(age):
    """Return 'Young' for an age below 30, otherwise 'Adult'."""
    if age < 30:
        return 'Young'
    return 'Adult'


# apply calls age_group once for each value in the Age column
persons_df['Age Group'] = persons_df['Age'].apply(age_group)
persons_df

Unnamed: 0,Name,Age,City,Age Group
0,Alice,25,New York,Young
1,Bob,30,San Francisco,Adult
2,Charlie,35,Seattle,Adult


---
### 3.4. Re-arrange columns

In [25]:
# Fresh copy of the people table for the column re-ordering example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [26]:
# Selecting columns with a list of labels returns them in the listed order
city_df = persons_df.loc[:, ['City', 'Name', 'Age']]
city_df

Unnamed: 0,City,Name,Age
0,New York,Alice,25
1,San Francisco,Bob,30
2,Seattle,Charlie,35


---
### 3.5. *groupby* method


In [27]:
# Fresh copy of the animals table for the groupby example
animals_df = pd.DataFrame(data=animals_dict)
animals_df

Unnamed: 0,City,Pigeons,Rats,Year
0,New York,250000,1250000,1998
1,San Francisco,300000,1300000,1998
2,Seattle,350000,1350000,1998
3,New York,350000,2350000,1999
4,San Francisco,400000,2400000,1999
5,Seattle,550000,2550000,1999
6,New York,310000,3310000,2000
7,San Francisco,410000,3410000,2000
8,Seattle,510000,3510000,2000


In [28]:
# Year holds strings, so it is removed before the per-city totals are computed;
# the result is indexed by City
count_df = (
    animals_df
    .drop(columns='Year')
    .groupby(by='City')
    .sum()
)
count_df

Unnamed: 0_level_0,Pigeons,Rats
City,Unnamed: 1_level_1,Unnamed: 2_level_1
New York,910000,6910000
San Francisco,1110000,7110000
Seattle,1410000,7410000


---
### 3.6. *merge* method

In [29]:
# Left side of the merge: people with only Name and City
persons_df = pd.DataFrame(data=person_dict).drop(columns='Age')
persons_df

Unnamed: 0,Name,City
0,Alice,New York
1,Bob,San Francisco
2,Charlie,Seattle


In [30]:
# Right side of the merge: pigeon and rat totals per city, indexed by City
animals_df = (
    pd.DataFrame(data=animals_dict)
    .drop(columns='Year')
    .groupby(by='City')
    .sum()
)
animals_df

Unnamed: 0_level_0,Pigeons,Rats
City,Unnamed: 1_level_1,Unnamed: 2_level_1
New York,910000,6910000
San Francisco,1110000,7110000
Seattle,1410000,7410000


In [31]:
# Inner join on City: a regular column in persons_df and the index name in animals_df
aggregated_df = persons_df.merge(animals_df, how='inner', on='City')
# Put City first for display
aggregated_df = aggregated_df.loc[:, ['City', 'Name', 'Pigeons', 'Rats']]
aggregated_df

Unnamed: 0,City,Name,Pigeons,Rats
0,New York,Alice,910000,6910000
1,San Francisco,Bob,1110000,7110000
2,Seattle,Charlie,1410000,7410000


---

### 3.7. *fillna* method

In [32]:
# Fresh copy of the people table for the fillna example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [33]:
# Add a Subscriber column with no values yet (all None).
# The length comes from the frame itself rather than from person_dict, so the
# assignment cannot fail with a length mismatch if persons_df has been filtered.
persons_df['Subscriber'] = [None] * len(persons_df)
persons_df

Unnamed: 0,Name,Age,City,Subscriber
0,Alice,25,New York,
1,Bob,30,San Francisco,
2,Charlie,35,Seattle,


In [34]:
# Replace the missing Subscriber values with False.
# A dict restricts the fill to that one column (a bare scalar would also overwrite
# missing values in every other column), and the result is assigned to the name
# instead of mutating with inplace=True.
# astype makes the boolean dtype explicit rather than relying on pandas' implicit
# downcasting of object columns.
persons_df = (
    persons_df
    .fillna({'Subscriber': False})
    .astype({'Subscriber': bool})
)
persons_df

Unnamed: 0,Name,Age,City,Subscriber
0,Alice,25,New York,False
1,Bob,30,San Francisco,False
2,Charlie,35,Seattle,False


---

### 3.8. *sort_values* method

In [35]:
# Fresh copy of the people table for the sort_values example
persons_df = pd.DataFrame(data=person_dict)
persons_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [36]:
# Oldest first; each row keeps its original index label
sorted_df = persons_df.sort_values('Age', ascending=False)
sorted_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Seattle
1,Bob,30,San Francisco
0,Alice,25,New York


## 4. Analysis Operations

---

### 4.1. *describe* method

In [37]:
# Fresh copy of the animals table for the describe example
animals_df = pd.DataFrame(data=animals_dict)
animals_df

Unnamed: 0,City,Pigeons,Rats,Year
0,New York,250000,1250000,1998
1,San Francisco,300000,1300000,1998
2,Seattle,350000,1350000,1998
3,New York,350000,2350000,1999
4,San Francisco,400000,2400000,1999
5,Seattle,550000,2550000,1999
6,New York,310000,3310000,2000
7,San Francisco,410000,3410000,2000
8,Seattle,510000,3510000,2000


In [38]:
# Summary statistics for the numeric columns, with the default quartiles spelled out
animals_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Pigeons,Rats
count,9.0,9.0
mean,381111.111111,2381111.0
std,98163.695484,917679.7
min,250000.0,1250000.0
25%,310000.0,1350000.0
50%,350000.0,2400000.0
75%,410000.0,3310000.0
max,550000.0,3510000.0


---

### 4.2. *pivot_table* method

In [39]:
# Fresh copy of the animals table for the pivot_table examples
animals_df = pd.DataFrame(data=animals_dict)
animals_df

Unnamed: 0,City,Pigeons,Rats,Year
0,New York,250000,1250000,1998
1,San Francisco,300000,1300000,1998
2,Seattle,350000,1350000,1998
3,New York,350000,2350000,1999
4,San Francisco,400000,2400000,1999
5,Seattle,550000,2550000,1999
6,New York,310000,3310000,2000
7,San Francisco,410000,3410000,2000
8,Seattle,510000,3510000,2000


In [40]:
# Keep only the rat counts alongside City and Year
rats_df = animals_df.drop(columns='Pigeons')
rats_df

Unnamed: 0,City,Rats,Year
0,New York,1250000,1998
1,San Francisco,1300000,1998
2,Seattle,1350000,1998
3,New York,2350000,1999
4,San Francisco,2400000,1999
5,Seattle,2550000,1999
6,New York,3310000,2000
7,San Francisco,3410000,2000
8,Seattle,3510000,2000


In [41]:
# Total rats per city, summed over all years
rats_city_aggregation_df = pd.pivot_table(
    rats_df,
    values='Rats',
    index='City',
    aggfunc='sum',
)
rats_city_aggregation_df

Unnamed: 0_level_0,Rats
City,Unnamed: 1_level_1
New York,6910000
San Francisco,7110000
Seattle,7410000


In [42]:
# Total rats per year, summed over all cities
rats_year_aggregation_df = pd.pivot_table(
    rats_df,
    values='Rats',
    index='Year',
    aggfunc='sum',
)
rats_year_aggregation_df

Unnamed: 0_level_0,Rats
Year,Unnamed: 1_level_1
1998,3900000
1999,7300000
2000,10230000


In [43]:
# Total rats and pigeons per city, summed over all years
animals_city_aggregation_df = pd.pivot_table(
    animals_df,
    values=['Rats', 'Pigeons'],
    index='City',
    aggfunc='sum',
)
animals_city_aggregation_df

Unnamed: 0_level_0,Pigeons,Rats
City,Unnamed: 1_level_1,Unnamed: 2_level_1
New York,910000,6910000
San Francisco,1110000,7110000
Seattle,1410000,7410000


---

### 4.3. *value_counts* method

In [44]:
# Fresh copy of the animals table for the value_counts example
animals_df = pd.DataFrame(data=animals_dict)
animals_df

Unnamed: 0,City,Pigeons,Rats,Year
0,New York,250000,1250000,1998
1,San Francisco,300000,1300000,1998
2,Seattle,350000,1350000,1998
3,New York,350000,2350000,1999
4,San Francisco,400000,2400000,1999
5,Seattle,550000,2550000,1999
6,New York,310000,3310000,2000
7,San Francisco,410000,3410000,2000
8,Seattle,510000,3510000,2000


In [45]:
# Number of rows recorded for each distinct city
city_counts = animals_df.loc[:, 'City'].value_counts()
city_counts

City
New York         3
San Francisco    3
Seattle          3
Name: count, dtype: int64