### Intro to DataFrames

In [None]:
import pandas as pd

# Build a small 3x3 demo frame column by column; the rows are labelled x, y, z.
df = pd.DataFrame(
    {'A': [1, 4, 7], 'B': [2, 5, 8], 'C': [3, 6, 9]},
    index=['x', 'y', 'z'],
)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

### Loading DataFrames from Files


In [None]:
# One reader per file format (CSV, Parquet, Excel); each call returns a DataFrame.
# Note that coffee.csv is read from ./warmup-data while the other files come from ./data.
coffee = pd.read_csv("./warmup-data/coffee.csv")

results = pd.read_parquet("./data/results.parquet")

olympics = pd.read_excel("./data/olympics-data.xlsx")

bios = pd.read_csv("./data/bios.csv")


### Accessing data

In [None]:
# Show 10 randomly chosen rows; no random_state is passed, so the selection changes between runs.
coffee.sample(10)

In [None]:
# Label-based selection: a .loc slice includes both endpoints, so 0:6 covers labels 0 through 6.
coffee.loc[0:6,["Day","Units Sold"]]

In [None]:
coffee.iloc[0:6,[0,2]] # Position-based only: rows 0-5 (slice end is exclusive), columns at positions 0 and 2

In [None]:
coffee.loc[1:4,"Unit Solds"] = 10

coffee.head()

In [None]:
coffee.at[0,"Unit Solds"]=100

In [None]:
coffee.sort_values("Unit Solds",ascending=True)

coffee.head()

In [None]:
# Visit every row as an (index label, row Series) pair and print both,
# followed by blank lines that separate consecutive rows.
for idx, row in coffee.iterrows():
    print(idx, row, "\n", sep="\n")

### Filtering data

In [None]:
# Boolean mask for the rows and an explicit column list, in a single .loc call.
bios.loc[bios["height_cm"].gt(215), ["name", "height_cm"]]

In [None]:
# Chained-bracket form: filter the rows with a boolean mask, then pick the columns.
bios[bios["height_cm"] > 215][["name", "height_cm"]]

In [None]:
# Combine two conditions element-wise with &; a row is kept only if both hold.
bios[bios["height_cm"].gt(215) & bios["born_country"].eq("USA")]

In [None]:
# Case-insensitive substring match on the name. na=False treats a missing name
# as "no match", so the mask is purely boolean and can be used for indexing.
bios[bios["name"].str.contains("Hristo", case=False, na=False)]

In [None]:
# Five tallest overall: nlargest returns the rows holding the largest height_cm values.
bios.nlargest(n=5, columns='height_cm').loc[:, ['name', 'height_cm', 'born_country']]


In [None]:
# Tallest person in each country: transform('max') broadcasts every country's
# maximum height back onto its rows, and rows equal to that maximum are kept.
bios.loc[
    bios['height_cm'] == bios.groupby('born_country')['height_cm'].transform('max'),
    ['name', 'born_country', 'height_cm'],
]

In [None]:
# Filter by a set of countries. born_country stores three-letter codes
# (e.g. 'USA', 'GBR'), so codes are listed here rather than full country names.
bios[bios['born_country'].isin(['USA', 'GER', 'BUL'])][['name', 'born_country', 'height_cm']]  # only selected countries


In [None]:
# Exclude certain countries. born_country stores three-letter codes, so Canada is 'CAN'.
bios[~bios['born_country'].isin({'USA', 'CAN'})][['name', 'born_country', 'height_cm']].head(10)  # not USA/Canada


In [None]:
# Names that start with A or H (case-insensitive). A character class is used
# rather than a capturing group, because str.contains warns when the pattern
# contains match groups.
bios[bios['name'].str.contains(r'^[AH]', case=False, na=False)][['name']].head(10)  # anchors at start


In [None]:
# Every row whose name occurs more than once (keep=False flags all occurrences), ordered by name.
bios.loc[bios['name'].duplicated(keep=False)].sort_values(by='name').loc[:, ['name', 'born_country', 'height_cm']]


In [None]:
# Data-quality check: list names that contain at least one digit (missing names count as no match).
bios.loc[bios['name'].str.contains(r'\d', na=False), ['name']]

In [None]:
# Shortest person in each country: order by country and ascending height,
# then keep only the first row encountered for every country.
(
    bios
    .sort_values(by=['born_country', 'height_cm'])
    .drop_duplicates(subset='born_country', keep='first')
    .loc[:, ['name', 'born_country', 'height_cm']]
)


In [None]:
# Express the row filter as a query string, then keep three columns.
bios.query('born_country== "USA" and height_cm > 200').loc[:, ['name', 'born_country', 'height_cm']]

### Adding / Removing Columns

In [None]:
# Assigning a scalar broadcasts it to every row, creating a constant column.
coffee['price'] = 5

import numpy as np

# Conditional column: 3.99 where the coffee type is Espresso, 4.99 everywhere else.
coffee['new_price'] = np.where(coffee['Coffee Type'].eq('Espresso'), 3.99, 4.99)

# Remove the constant column again, modifying the frame in place.
coffee.drop(columns='price', inplace=True)

coffee.head()

In [None]:
# Plain assignment binds a second name to the same DataFrame object,
# so a change made through either name is visible through both.
coffe_new = coffee

# .copy() builds an independent DataFrame; modifying it leaves `coffee` unchanged.
coffe_new = coffee.copy()

In [None]:
# Preview the first five rows of coffee.
coffee.head()

In [None]:
# Revenue per row: units sold multiplied element-wise by the unit price.
coffee['revenue'] = coffee['Units Sold'].mul(coffee['new_price'])

In [None]:
# Rename the new_price column to price, modifying coffee in place.
coffee.rename(columns={'new_price':'price'}, inplace=True)

In [None]:
# Preview the first five rows of bios.
bios.head()

In [None]:
# Text before the first space in the name.
bios['first_name'] = bios['name'].str.split(' ').str.get(0)

# Parse born_date into datetime values.
# NOTE(review): the column name 'born_data_time' looks like a typo for
# 'born_date_time'; it is left unchanged in case other cells refer to it.
bios['born_data_time'] = pd.to_datetime(bios['born_date'])

In [None]:
# Bucket heights into labelled ranges:
#   below 165 -> 'Short', 165 up to (not including) 185 -> 'Average', 185 and above -> 'Tall'.
# right=False makes each bin closed on the left, matching the `<` comparisons above.
# pd.cut leaves a missing height as NaN; a plain comparison chain would label it
# 'Tall', because every comparison against NaN is False. The result is a categorical column.
bios['height_category'] = pd.cut(
    bios['height_cm'],
    bins=[float('-inf'), 165, 185, float('inf')],
    right=False,
    labels=['Short', 'Average', 'Tall'],
)

In [None]:
def categorize_athlete(row):
    """Assign a build category from a row's height and weight.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide the keys 'height_cm' and 'weight_kg'.

    Returns
    -------
    str or None
        'Lightweight'  when height < 175 and weight < 70,
        'Middleweight' when height < 185 and weight <= 80,
        'Heavyweight'  for every other complete measurement pair,
        None           when either measurement is missing.
    """
    height, weight = row['height_cm'], row['weight_kg']

    # Comparisons against NaN are always False, so without this guard every row
    # with a missing measurement would fall through and be labelled 'Heavyweight'.
    if pd.isna(height) or pd.isna(weight):
        return None

    if height < 175 and weight < 70:
        return 'Lightweight'
    if height < 185 and weight <= 80:
        return 'Middleweight'
    return 'Heavyweight'

# axis=1 passes each row to categorize_athlete; the returned labels form the new column.
bios['category'] = bios.apply(categorize_athlete, axis=1)

### Merging & Concatenating Data

In [None]:
# Read noc_regions.csv; its NOC and region columns are used when it is merged into bios.
nocs = pd.read_csv('./data/noc_regions.csv')

In [None]:
# Preview the first five rows of nocs.
nocs.head()

In [None]:
# Left-join nocs onto bios (born_country matched against NOC) so every bios row
# is kept, then expose the joined 'region' column as 'born_country_full'.
bios = (
    bios
    .merge(nocs, how='left', left_on='born_country', right_on='NOC')
    .rename(columns={'region': 'born_country_full'})
)

In [None]:
# Independent copies of the USA-born and GBR-born rows.
usa = bios.loc[bios['born_country'] == 'USA'].copy()
gbr = bios.loc[bios['born_country'] == 'GBR'].copy()

# Stack the two subsets vertically; the original row labels are preserved.
new_df = pd.concat([usa, gbr], axis=0)

new_df.head()

### Handling Null Values

In [None]:
# Blank out 'Units Sold' for the rows labelled 0 and 1 so there is missing data to work with.
coffee.loc[[0,1],'Units Sold'] = np.nan

In [None]:
# Render the whole coffee frame using the notebook's rich display.
display(coffee)

In [None]:
# Fill missing 'Units Sold' values with that column's mean. Passing a dict limits
# the fill to the named column; a bare scalar would be written into every NaN in
# the frame, whichever column it sits in.
coffee = coffee.fillna({'Units Sold': coffee['Units Sold'].mean()})

In [None]:
# Drop every row whose 'Units Sold' is missing, modifying coffee in place.
coffee.dropna(subset=['Units Sold'], inplace=True)

In [None]:
# Rows where 'Units Sold' has a value; the frame itself is not modified.
coffee.loc[coffee['Units Sold'].notna()]

### Aggregating Data

In [None]:
# Preview the first five rows of bios before aggregating.
bios.head()

In [None]:
# Frequency of each distinct born_city value, most common first.
bios['born_city'].value_counts()

In [None]:
# Frequency of each born_region value among the USA-born rows.
bios.loc[bios['born_country'] == 'USA', 'born_region'].value_counts()

In [None]:
# Total units sold for each coffee type.
coffee.groupby('Coffee Type')['Units Sold'].sum()

In [None]:
# Per coffee type: total units sold and mean price (one aggregation per column).
coffee.groupby('Coffee Type').agg({'Units Sold': 'sum', 'price': 'mean'})

In [None]:
# Reshape to wide form: one row per Day, one column per Coffee Type, cells hold revenue.
pivot = coffee.pivot(index='Day', columns='Coffee Type', values='revenue')

pivot

In [None]:
# Number of non-null names for each distinct born_date, returned as a two-column frame.
bios.groupby('born_date')['name'].count().reset_index()

In [152]:
# Count, mean, min, max and standard deviation of height per country.
# Passing a list of functions for one column yields two-level column labels.
bios.groupby('born_country').agg({
    'height_cm': ['count', 'mean', 'min', 'max', 'std']
})


Unnamed: 0_level_0,height_cm,height_cm,height_cm,height_cm,height_cm
Unnamed: 0_level_1,count,mean,min,max,std
born_country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AFG,24,170.416667,160.0,183.0,6.580053
AGU,1,186.000000,186.0,186.0,
ALB,38,174.657895,159.0,195.0,10.418996
ALG,141,176.489362,155.0,196.0,7.681720
AND,31,172.096774,160.0,185.0,6.934719
...,...,...,...,...,...
VIN,5,181.200000,173.0,187.0,5.674504
Vienna,3,171.000000,164.0,183.0,10.440307
YEM,4,169.000000,160.0,175.0,6.683313
ZAM,52,173.192308,145.0,191.0,9.277928


In [153]:
# Named aggregation: each keyword becomes an output column, computed by applying
# the given function to the given source column within every country.
bios.groupby('born_country').agg(
    num_people=pd.NamedAgg(column='name', aggfunc='count'),
    avg_height=pd.NamedAgg(column='height_cm', aggfunc='mean'),
    tallest=pd.NamedAgg(column='height_cm', aggfunc='max'),
    shortest=pd.NamedAgg(column='height_cm', aggfunc='min'),
)


Unnamed: 0_level_0,num_people,avg_height,tallest,shortest
born_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFG,25,170.416667,183.0,160.0
AGU,2,186.000000,186.0,186.0
ALB,45,174.657895,195.0,159.0
ALG,177,176.489362,196.0,155.0
AND,37,172.096774,185.0,160.0
...,...,...,...,...
VIN,8,181.200000,187.0,173.0
Vienna,11,171.000000,183.0,164.0
YEM,4,169.000000,175.0,160.0
ZAM,64,173.192308,191.0,145.0


In [154]:
# Ten countries with the highest mean height: average per country,
# order from largest to smallest, keep the first ten entries.
(
    bios
    .groupby('born_country')['height_cm']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)


born_country
Milde       196.500000
Prignitz    194.000000
GIB         191.000000
AGU         186.000000
MNE         185.982456
Cologne     184.500000
TGA         184.500000
SRB         184.397206
BIH         184.272251
STP         184.250000
Name: height_cm, dtype: float64

In [157]:
# Ensure born_country is a column (it may have been moved into the index)
if 'born_country' not in bios.columns:
    bios = bios.reset_index()

# Make sure height_cm is numeric; values that cannot be parsed become NaN
bios['height_cm'] = pd.to_numeric(bios['height_cm'], errors='coerce')

# Row label of the tallest person per country. Rows without a height are removed
# before grouping, so a country with no recorded heights simply does not appear.
# idxmax then never has to produce a label for an all-NaN group, and the labels
# keep their original dtype instead of being cast with astype(int).
idx = bios.dropna(subset=['height_cm']).groupby('born_country')['height_cm'].idxmax()

# Look the winning rows up in the full frame and list them alphabetically by country.
out = bios.loc[idx, ['born_country', 'name', 'height_cm']].sort_values('born_country')
print(out)


       born_country             name  height_cm
116031          AFG  Rohullah Nikpai      183.0
68457           AGU     Keith Connor      186.0
108191          ALB     Donald Suxho      195.0
129661          ALG      Farid Chaal      196.0
118038          AND      Lluís Marín      185.0
...             ...              ...        ...
78733           VIN    Eswort Coombs      187.0
64422        Vienna    Emil Janausch      183.0
118969          YEM   Nabil Al-Garbi      175.0
117184          ZAM      Henry Nwume      191.0
102452          ZIM    Brendan Ashby      205.0

[230 rows x 3 columns]
