<a href="https://colab.research.google.com/github/bradleyboehmke/uc-bana-4080/blob/main/example-notebooks/09_manipulating_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter: Manipulating Data
This notebook accompanies the BANA 4080 textbook chapter on manipulating data using pandas.

## Loading the Ames Housing Data

In [None]:
import pandas as pd

# Location of the raw Ames housing CSV in the course's GitHub repository.
data_url = 'https://raw.githubusercontent.com/bradleyboehmke/uc-bana-4080/refs/heads/main/data/ames_raw.csv'
# Read the CSV directly from the URL into a DataFrame (needs network access).
ames = pd.read_csv(data_url)
# Preview the first rows to confirm the load worked.
ames.head()

## Renaming Columns
Standardizing column names using `.rename()` and `.str` string methods.

In [None]:
# Rename individual columns by passing an explicit old-name -> new-name mapping.
# Columns not listed in the mapping keep their current names.
ames = ames.rename(
    columns={
        'MS SubClass': 'ms_subclass',
        'MS Zoning': 'ms_zoning',
    }
)
ames.head()

In [None]:
# Standardize every column name in one pass:
# lower-case it, then swap each space for an underscore.
ames.columns = (
    ames.columns
    .str.lower()
    .str.replace(' ', '_')
)
ames.head()

## Performing Calculations with Columns
Creating new columns using scalar and vector operations.

In [None]:
# Dividing a column by a scalar applies the division to every row; the result
# is stored as a new column.
ames['sale_price_k'] = ames['saleprice'] / 1000
ames.head()

In [None]:
# Subtract the scalar 12 from every sale price; nothing is stored, the first rows are just displayed.
(ames['saleprice'] - 12).head()

In [None]:
# Multiply every sale price by the scalar 10 and display the first rows.
(ames['saleprice'] * 10).head()

In [None]:
# Raise every sale price to the power of 2 and display the first rows.
(ames['saleprice'] ** 2).head()

## Removing Columns

In [None]:
# Remove columns that are no longer needed. `errors='ignore'` skips labels that
# are already absent, so re-running this cell after the columns have been
# dropped does not raise a KeyError.
ames = ames.drop(columns=['order', 'sale_price_k'], errors='ignore')
ames.head()

## Calculating with Multiple Columns

In [None]:
# Arithmetic between two columns is element-wise: each row's `saleprice` is
# divided by that same row's `gr_liv_area`.
ames['price_per_sqft'] = ames['saleprice'] / ames['gr_liv_area']
ames.head()

In [None]:
# A deliberately meaningless calculation that mixes columns and scalars.
# Normal operator precedence applies row by row: the parenthesised sum is
# computed first, multiplied by `gr_liv_area`, then `lot_area` is added and 50
# is subtracted.
ames['nonsense'] = (ames['yr_sold'] + 12) * ames['gr_liv_area'] + ames['lot_area'] - 50
ames.head()

## Working with Non-Numeric (String) Columns

In [None]:
# `+` concatenates strings element-wise, producing one sentence per row from the
# literal text and the two string columns.
'Home in ' + ames['neighborhood'] + ' neighborhood sold under ' + ames['sale_condition'] + ' condition'

In [None]:
# The `.str` accessor exposes string methods on a column; here, the number of
# characters in each neighborhood value.
ames['neighborhood'].str.len()

In [None]:
# `.str` methods can be chained: lower-case each value, then replace the
# substring 'tchd' with 'tached' (so a value like 'attchd' becomes 'attached').
# The result is only displayed; the column itself is not modified.
ames['garage_type'].str.lower().str.replace('tchd', 'tached')

## Replacing Values Using a Mapping

In [None]:
# Month number (1-12) -> three-letter month abbreviation.
# `enumerate(..., start=1)` pairs each abbreviation with its month number.
value_mapping = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))

In [None]:
# Swap each month number for its abbreviation using the `value_mapping` dict
# defined in the previous cell. Values that are not keys of the mapping are left
# unchanged, so re-running this cell has no further effect.
ames['mo_sold'] = ames['mo_sold'].replace(value_mapping)
ames['mo_sold'].head()

## Handling Missing Values

In [None]:
# `isnull()` flags each missing cell as True; summing the flags per column gives
# the number of missing values in every column.
ames.isnull().sum()

In [None]:
# Print a per-column summary (non-null count and dtype) as another way to spot
# columns with missing data.
ames.info()

In [None]:
# Boolean Series indexed by column name: True where the column contains at
# least one missing value.
missing = ames.isna().any()
# Use that Series as a column mask to keep only the flagged columns.
ames.loc[:, missing]

## Visualizing Missing Data

In [None]:
import seaborn as sns
# Apply seaborn's default theme and make every subsequent figure 12x8 inches.
# `set_theme` is seaborn's preferred entry point; `sns.set` is only an alias for it.
sns.set_theme(rc={'figure.figsize': (12, 8)})

In [None]:
# Restrict the frame to the columns flagged in `missing` (computed in the
# "Handling Missing Values" section above).
ames_missing = ames.loc[:, missing]
# Each heatmap cell is one value of the frame, coloured by whether it is missing.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(
    ames_missing.isna(),
    cmap='viridis',
    cbar=False,
);

## Imputing Missing Values

In [None]:
import numpy as np

# Small demo frame with deliberate gaps, written column by column:
# A has a single observed value, B has one gap, C is entirely missing and
# D is complete.
df = pd.DataFrame({
    'A': [np.nan, 3, np.nan, np.nan],
    'B': [2, 4, np.nan, 3],
    'C': [np.nan, np.nan, np.nan, np.nan],
    'D': [0, 1, 5, 4],
})
df

In [None]:
# Replace every missing value with the constant 0. A new frame is returned;
# `df` itself is left unchanged.
df.fillna(0)

In [None]:
# Fill each column's gaps with that column's own mean. Column C has no observed
# values, so its mean is NaN and the column stays missing.
df.fillna(df.mean())

In [None]:
# Backward fill: each gap takes the next observed value further down the same
# column. Gaps with no later observed value remain NaN.
df.bfill()

In [None]:
# Forward fill: each gap takes the most recent observed value above it in the
# same column. Gaps with no earlier observed value remain NaN.
df.ffill()

## Applying Custom Functions

In [None]:
def is_luxury_home(x):
    """Label a sale price.

    Returns 'Luxury' when ``x`` is strictly greater than 500000 and
    'Non-luxury' otherwise (a price of exactly 500000 is 'Non-luxury').
    """
    if x > 500000:
        return 'Luxury'
    return 'Non-luxury'

# `apply` calls the function once per sale price and collects the returned
# labels into a new Series.
ames['saleprice'].apply(is_luxury_home)

In [None]:
# Same labelling rule written inline as an anonymous (lambda) function, which
# avoids defining a named helper for a one-off rule.
ames['saleprice'].apply(lambda x: 'Luxury' if x > 500000 else 'Non-luxury')

In [None]:
def is_luxury_home(x, price=500000):
    """Label a sale price relative to a configurable threshold.

    Parameters
    ----------
    x : number
        The sale price to classify.
    price : number, default 500000
        The luxury threshold. This definition replaces the earlier
        single-argument ``is_luxury_home``; the default equals that version's
        hard-coded threshold, so calls that pass only ``x`` (for example
        re-running the earlier ``.apply(is_luxury_home)`` cell) keep working
        and give the same labels.

    Returns
    -------
    str
        'Luxury' when ``x`` is strictly greater than ``price``, otherwise
        'Non-luxury'.
    """
    return 'Luxury' if x > price else 'Non-luxury'

# Keyword arguments that `apply` does not use itself are forwarded to the
# function, so `price=200000` sets the threshold for every element.
ames['saleprice'].apply(is_luxury_home, price=200000)

In [None]:
def convert_to_sq_meters(x):
    """Convert an area to square meters.

    Multiplies ``x`` by 0.092903, the number of square meters in one square
    foot, so ``x`` is expected to be an area in square feet.
    """
    sq_meters_per_sq_foot = 0.092903
    return x * sq_meters_per_sq_foot

# `DataFrame.map` applies the function to every individual cell of the selected
# columns and returns a new frame; `ames` is not modified.
# NOTE: `DataFrame.map` was added in pandas 2.1; on older versions the
# equivalent method is `DataFrame.applymap`.
# Assumes these three area columns are recorded in square feet -- TODO confirm
# against the data dictionary.
ames[['gr_liv_area', 'garage_area', 'lot_area']].map(convert_to_sq_meters)