# Pandas Fundamentals: DataFrames

This notebook contains examples of fundamental pandas DataFrame operations, such as storing and loading data, filtering rows, and transforming columns.



In [1]:
# Install required dependencies (pyarrow is listed for the Parquet example below).
# To install them into the kernel's environment, remove the leading "# " from the next line.
# %pip install pandas pyarrow

In [2]:
import pandas as pd
from pandas.testing import assert_frame_equal

# Sample data reused by the examples below
person_dict = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'San Francisco', 'Seattle'],
}

# Yearly animal counts: the same three cities repeated for 1998, 1999 and 2000
animals_dict = {
    'City': ['New York', 'San Francisco', 'Seattle'] * 3,
    'Pigeons': [
        250000, 300000, 350000,  # 1998
        350000, 400000, 550000,  # 1999
        310000, 410000, 510000,  # 2000
    ],
    'Rats': [
        1250000, 1300000, 1350000,  # 1998
        2350000, 2400000, 2550000,  # 1999
        3310000, 3410000, 3510000,  # 2000
    ],
    'Year': ['1998'] * 3 + ['1999'] * 3 + ['2000'] * 3,
}

# Common stem for the files written by the store/load examples
file_prefix = '01_dataframes'

## 1. Store and Load

---

### 1.1. Parquet

In [3]:
# Destination of the Parquet round trip
parquet_file = f'{file_prefix}.parquet'

# Build the sample frame and serialize it to disk
persons_df = pd.DataFrame(data=person_dict)
persons_df.to_parquet(parquet_file)

# Load the file back; the trailing expression renders the loaded frame
parquet_df = pd.read_parquet(parquet_file)
parquet_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [4]:
# The frame read back from Parquet must match the frame that was written
assert_frame_equal(left=persons_df, right=parquet_df)

---

### 1.2. JSON

In [5]:
json_file = f'{file_prefix}.json'

# Write the frame as a JSON list with one object per row
persons_df = pd.DataFrame(person_dict)
persons_df.to_json(json_file, orient='records')

# Read it back with the same orient that was used for writing, so the round
# trip does not rely on read_json's default orient
json_df = pd.read_json(json_file, orient='records')
json_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [6]:
# The frame read back from JSON must match the frame that was written
assert_frame_equal(left=persons_df, right=json_df)

---

### 1.3. CSV

In [7]:
# Destination of the CSV round trip
csv_file = f'{file_prefix}.csv'

# Build the sample frame and write it without the row index column
persons_df = pd.DataFrame(data=person_dict)
persons_df.to_csv(csv_file, index=False)

# Load the file back; the trailing expression renders the loaded frame
csv_df = pd.read_csv(csv_file)
csv_df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [8]:
# The frame read back from CSV must match the frame that was written
assert_frame_equal(left=persons_df, right=csv_df)

## 2. Filter Row Data

---

### 2.1. *iloc* property

In [9]:
# Positional row slice: skip the first row and keep every row after it
persons_df = pd.DataFrame(data=person_dict)
us_west_df = persons_df.iloc[1:, :]
us_west_df

Unnamed: 0,Name,Age,City
1,Bob,30,San Francisco
2,Charlie,35,Seattle


In [10]:
persons_df = pd.DataFrame(data=person_dict)
# Join the rows before and after position 1, then renumber the index from 0
us_north_df = pd.concat(
    objs=[persons_df.iloc[:1], persons_df.iloc[2:]],
    ignore_index=True,
)
us_north_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Charlie,35,Seattle


In [11]:
persons_df = pd.DataFrame(data=person_dict)
# Positional selection: rows 0-1 and columns 1-2 (end positions are exclusive)
young_anonymous_df = persons_df.iloc[0:2, 1:3]
young_anonymous_df

Unnamed: 0,Age,City
0,25,New York
1,30,San Francisco


---

### 2.2. Conditionals

In [12]:
persons_df = pd.DataFrame(data=person_dict)
# Keep only the rows whose City differs from 'San Francisco' (boolean mask)
outside_silicon_valley_df = persons_df.loc[persons_df['City'].ne('San Francisco')]
outside_silicon_valley_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Seattle


## 3. Column Transformations

---

### 3.1. Drop columns

In [13]:
persons_df = pd.DataFrame(data=person_dict)
# drop() returns a new frame without the 'Name' column; persons_df is unchanged
anonymous_df = persons_df.drop(columns='Name')
anonymous_df

Unnamed: 0,Age,City
0,25,New York
1,30,San Francisco
2,35,Seattle


In [14]:
animals_df = pd.DataFrame(data=animals_dict)
# Remove both count columns at once, leaving only City and Year
city_df = animals_df.drop(['Rats', 'Pigeons'], axis='columns')
city_df

Unnamed: 0,City,Year
0,New York,1998
1,San Francisco,1998
2,Seattle,1998
3,New York,1999
4,San Francisco,1999
5,Seattle,1999
6,New York,2000
7,San Francisco,2000
8,Seattle,2000


---
### 3.2. Append columns

In [15]:
persons_df = pd.DataFrame(data=person_dict)
occupation_df = pd.DataFrame(data={'Occupation': ['Lawyer', 'Engineer', 'Musician']})
# Append the column; its values are matched to persons_df by row index
persons_df = persons_df.assign(Occupation=occupation_df['Occupation'])
persons_df

Unnamed: 0,Name,Age,City,Occupation
0,Alice,25,New York,Lawyer
1,Bob,30,San Francisco,Engineer
2,Charlie,35,Seattle,Musician


---
### 3.3. *apply* method

In [16]:
def age_group(age):
    """Return 'Young' for ages below 30 and 'Adult' otherwise."""
    if age < 30:
        return 'Young'
    return 'Adult'


persons_df = pd.DataFrame(data=person_dict)
# apply() calls age_group once per value of the 'Age' column
persons_df['Age Group'] = persons_df['Age'].apply(age_group)
persons_df

Unnamed: 0,Name,Age,City,Age Group
0,Alice,25,New York,Young
1,Bob,30,San Francisco,Adult
2,Charlie,35,Seattle,Adult


---
### 3.4. Re-arrange columns

In [17]:
persons_df = pd.DataFrame(data=person_dict)
# Select all rows with the columns listed in the desired display order
city_df = persons_df.loc[:, ['City', 'Name', 'Age']]
city_df

Unnamed: 0,City,Name,Age
0,New York,Alice,25
1,San Francisco,Bob,30
2,Seattle,Charlie,35
