# Level 6: Data Transformation

Once your data is clean, the next step is often to transform it. This can involve adding or removing columns, applying functions to your data, or reshaping it for analysis.

In [1]:
import pandas as pd
import numpy as np

# Sample roster used throughout this level: one list per column,
# with position i in every list describing the same student.
data = {
    'Name':   ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'Age':    [25, 30, 35, 40, 22, 45],
    'City':   ['NY', 'LA', 'Chicago', 'NY', 'LA', 'Chicago'],
    'Score1': [88, 92, 78, 85, 95, 62],
    'Score2': [91, 89, 82, 88, 98, 71],
}

# Each dictionary key becomes a column; rows get the default 0..5 index.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,City,Score1,Score2
0,Alice,25,NY,88,91
1,Bob,30,LA,92,89
2,Charlie,35,Chicago,78,82
3,David,40,NY,85,88
4,Eva,22,LA,95,98
5,Frank,45,Chicago,62,71


## 6.1 Adding & Removing Columns

### Adding a New Column

In [2]:
# Add a column with a constant value:
# assigning a single scalar fills every row of the new column with it.
df['Country'] = 'USA'
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country
0,Alice,25,NY,88,91,USA
1,Bob,30,LA,92,89,USA
2,Charlie,35,Chicago,78,82,USA
3,David,40,NY,85,88,USA
4,Eva,22,LA,95,98,USA
5,Frank,45,Chicago,62,71,USA


In [3]:
# Add a column based on existing columns.
# The arithmetic runs element-wise, so each row gets the mean of its own two scores.
df['AvgScore'] = (df['Score1'] + df['Score2']) / 2
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore
0,Alice,25,NY,88,91,USA,89.5
1,Bob,30,LA,92,89,USA,90.5
2,Charlie,35,Chicago,78,82,USA,80.0
3,David,40,NY,85,88,USA,86.5
4,Eva,22,LA,95,98,USA,96.5
5,Frank,45,Chicago,62,71,USA,66.5


### Removing a Column (`.drop()`)

In [4]:
# Drop a single column.
# axis='columns' (the named spelling of axis=1) says the label refers to a column, not a row.
# With the default inplace=False, drop() returns a new DataFrame and `df` itself is unchanged.
df.drop(labels='Country', axis='columns')

Unnamed: 0,Name,Age,City,Score1,Score2,AvgScore
0,Alice,25,NY,88,91,89.5
1,Bob,30,LA,92,89,90.5
2,Charlie,35,Chicago,78,82,80.0
3,David,40,NY,85,88,86.5
4,Eva,22,LA,95,98,96.5
5,Frank,45,Chicago,62,71,66.5


In [5]:
# Drop multiple columns by passing a list of labels to `columns=`.
# The result is stored under a new name (df_dropped); `df` is not reassigned here.
df_dropped = df.drop(columns=['Score1', 'Score2'])
df_dropped

Unnamed: 0,Name,Age,City,Country,AvgScore
0,Alice,25,NY,USA,89.5
1,Bob,30,LA,USA,90.5
2,Charlie,35,Chicago,USA,80.0
3,David,40,NY,USA,86.5
4,Eva,22,LA,USA,96.5
5,Frank,45,Chicago,USA,66.5


### Renaming Columns (`.rename()`)

In [6]:
# Rename columns with an {old name: new name} mapping.
# The renamed frame is only displayed, not assigned back, so `df` keeps
# its original 'Name' and 'AvgScore' columns for the cells that follow.
df.rename(
    columns={
        'Name': 'FullName',
        'AvgScore': 'AverageScore',
    },
)

Unnamed: 0,FullName,Age,City,Score1,Score2,Country,AverageScore
0,Alice,25,NY,88,91,USA,89.5
1,Bob,30,LA,92,89,USA,90.5
2,Charlie,35,Chicago,78,82,USA,80.0
3,David,40,NY,85,88,USA,86.5
4,Eva,22,LA,95,98,USA,96.5
5,Frank,45,Chicago,62,71,USA,66.5


## 6.2 Applying Functions

### `.map()` (Element-wise for Series)
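Applies a function — or a dictionary/Series lookup — to each element of a Series. With a dictionary, any value that has no matching key becomes `NaN`.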

In [7]:
# Translate the short city codes into full city names with a lookup dictionary.
city_map = {
    'NY': 'New York',
    'LA': 'Los Angeles',
    'Chicago': 'Chicago',
}

# Each value in 'City' is looked up in city_map; the results form the new column.
df['CityFull'] = df['City'].map(city_map)
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull
0,Alice,25,NY,88,91,USA,89.5,New York
1,Bob,30,LA,92,89,USA,90.5,Los Angeles
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago
3,David,40,NY,85,88,USA,86.5,New York
4,Eva,22,LA,95,98,USA,96.5,Los Angeles
5,Frank,45,Chicago,62,71,USA,66.5,Chicago


### `.apply()` (Row/Column-wise)
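Applies a function along an axis of a DataFrame: with `axis=0` (the default) the function receives each column, and with `axis=1` it receives each row. On a single Series, `.apply()` calls the function on each value.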

In [8]:
# Apply to a column (Series): the lambda is called once per value in 'Age',
# and the returned values are stored in a new column.
df['Age_plus_5'] = df['Age'].apply(lambda x: x + 5)
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5
0,Alice,25,NY,88,91,USA,89.5,New York,30
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40
3,David,40,NY,85,88,USA,86.5,New York,45
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50


In [9]:
# Apply along columns (axis=0, default):
# np.mean is evaluated once per selected column, giving one value per column.
df[['Score1', 'Score2']].apply(np.mean)

Score1    83.333333
Score2    86.500000
dtype: float64

In [10]:
# Apply along rows (axis=1)
# Let's find how far apart the two scores are for each student.
# Note: this is the signed difference Score2 - Score1, so it is negative
# whenever Score2 is the lower of the two (it is not an absolute range).
def score_range(row):
    """Return Score2 minus Score1 for one row (a signed difference)."""
    second, first = row['Score2'], row['Score1']
    return second - first

# axis=1 hands each row to score_range; the per-row results become a new column.
df['ScoreRange'] = df.apply(score_range, axis=1)
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange
0,Alice,25,NY,88,91,USA,89.5,New York,30,3
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4
3,David,40,NY,85,88,USA,86.5,New York,45,3
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9


### `.applymap()` (Element-wise for DataFrame) - Deprecated
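`DataFrame.applymap()` applies a function to every element of a DataFrame. It has been deprecated since pandas 2.1 in favor of the equivalent `DataFrame.map()`, which is what the example below uses.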

In [11]:
# Let's format all numeric scores as strings with 2 decimal places.
# Work on a two-column selection so that only the score columns are formatted.
numeric_scores = df[['Score1', 'Score2']]
# The lambda is called on every individual cell; the result is displayed, not stored.
numeric_scores.map(lambda x: f'{x:.2f}')

Unnamed: 0,Score1,Score2
0,88.00,91.00
1,92.00,89.00
2,78.00,82.00
3,85.00,88.00
4,95.00,98.00
5,62.00,71.00


## 6.3 Binning & Discretization

Binning is the process of turning continuous numerical data into discrete categories.

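### `pd.cut()` (Bins by value range)
Assigns each value to a bin defined by explicit edges (or by a requested number of equal-width intervals).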

In [12]:
# Explicit bin edges. With right=True every interval is closed on its upper
# edge, i.e. (20, 30], (30, 40], (40, 50].
age_bins = [20, 30, 40, 50]
age_labels = ['20-30', '31-40', '41-50']

# include_lowest=True additionally closes the first interval on the left, so an
# age of exactly 20 falls into '20-30' (as that label promises) instead of
# silently becoming NaN. Results for ages above 20 are unaffected.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange,AgeGroup
0,Alice,25,NY,88,91,USA,89.5,New York,30,3,20-30
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3,20-30
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4,31-40
3,David,40,NY,85,88,USA,86.5,New York,45,3,31-40
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3,20-30
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9,41-50


### `pd.qcut()` (Quantile-based bins)
Divides the data into bins that each hold (approximately) the same number of observations, using quantiles of the data as the bin edges.

In [13]:
# Divide scores into 3 quantile groups (e.g., low, medium, high).
# q=3 requests three bins; the labels name them from the lowest to the highest AvgScore.
df['ScoreQuantile'] = pd.qcut(df['AvgScore'], q=3, labels=['Low', 'Medium', 'High'])
df

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange,AgeGroup,ScoreQuantile
0,Alice,25,NY,88,91,USA,89.5,New York,30,3,20-30,Medium
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3,20-30,High
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4,31-40,Low
3,David,40,NY,85,88,USA,86.5,New York,45,3,31-40,Medium
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3,20-30,High
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9,41-50,Low


## 6.4 Sorting & Ranking

### `.sort_values()`

In [14]:
# Sort by a single column (youngest first in the result below).
# The sorted frame is displayed but not assigned back to `df`.
df.sort_values(by='Age')

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange,AgeGroup,ScoreQuantile
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3,20-30,High
0,Alice,25,NY,88,91,USA,89.5,New York,30,3,20-30,Medium
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3,20-30,High
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4,31-40,Low
3,David,40,NY,85,88,USA,86.5,New York,45,3,31-40,Medium
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9,41-50,Low


In [15]:
# Sort by multiple columns, with different orders:
# City from A to Z first, then within each city the oldest person first.
# The two lists line up position by position (City -> True, Age -> False).
df.sort_values(
    by=['City', 'Age'],
    ascending=[True, False],
)

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange,AgeGroup,ScoreQuantile
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9,41-50,Low
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4,31-40,Low
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3,20-30,High
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3,20-30,High
3,David,40,NY,85,88,USA,86.5,New York,45,3,31-40,Medium
0,Alice,25,NY,88,91,USA,89.5,New York,30,3,20-30,Medium


### `.sort_index()`

In [16]:
# Make 'Name' the index (kept as a separate frame, df_indexed),
# then sort by that index in descending order (Frank ... Alice).
df_indexed = df.set_index('Name')
df_indexed.sort_index(ascending=False)

Unnamed: 0_level_0,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange,AgeGroup,ScoreQuantile
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9,41-50,Low
Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3,20-30,High
David,40,NY,85,88,USA,86.5,New York,45,3,31-40,Medium
Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4,31-40,Low
Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3,20-30,High
Alice,25,NY,88,91,USA,89.5,New York,30,3,20-30,Medium


### `.rank()`

In [17]:
# Assign ranks based on average score.
# ascending=False gives rank 1 to the highest AvgScore.
# NOTE(review): method='average' should only matter when values tie (tied rows
# share the mean of their ranks); there are no tied AvgScore values in this data.
df['ScoreRank'] = df['AvgScore'].rank(method='average', ascending=False)
# Display the rows ordered from best rank to worst (result is not assigned).
df.sort_values(by='ScoreRank')

Unnamed: 0,Name,Age,City,Score1,Score2,Country,AvgScore,CityFull,Age_plus_5,ScoreRange,AgeGroup,ScoreQuantile,ScoreRank
4,Eva,22,LA,95,98,USA,96.5,Los Angeles,27,3,20-30,High,1.0
1,Bob,30,LA,92,89,USA,90.5,Los Angeles,35,-3,20-30,High,2.0
0,Alice,25,NY,88,91,USA,89.5,New York,30,3,20-30,Medium,3.0
3,David,40,NY,85,88,USA,86.5,New York,45,3,31-40,Medium,4.0
2,Charlie,35,Chicago,78,82,USA,80.0,Chicago,40,4,31-40,Low,5.0
5,Frank,45,Chicago,62,71,USA,66.5,Chicago,50,9,41-50,Low,6.0
