### Intro to DataFrames

In [None]:
import pandas as pd

# Small 3x3 demo frame, built column by column: rows are labelled x/y/z, columns A/B/C.
df = pd.DataFrame(
    {
        'A': [1, 4, 7],
        'B': [2, 5, 8],
        'C': [3, 6, 9],
    },
    index=['x', 'y', 'z'],
)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of df
df.describe()

### Loading DataFrames from Files


In [None]:
# Load the datasets used in the cells below; every path is relative to the notebook's working directory.
# NOTE(review): coffee.csv is read from ./warmup-data while the other files come from ./data — confirm this is intentional.
coffee = pd.read_csv("./warmup-data/coffee.csv")

# Parquet input needs a parquet engine (pyarrow or fastparquet) installed alongside pandas.
results = pd.read_parquet("./data/results.parquet")

# .xlsx input needs an Excel engine such as openpyxl installed alongside pandas.
olympics = pd.read_excel("./data/olympics-data.xlsx")

bios = pd.read_csv("./data/bios.csv")


### Accessing data

In [None]:
# Peek at 10 randomly chosen rows; the fixed seed keeps the displayed rows
# identical across Restart Kernel -> Run All.
coffee.sample(10, random_state=42)

In [None]:
# .loc selects by label and its slices include both endpoints: with the default integer
# index produced by read_csv, 0:6 returns the rows labelled 0 through 6 (seven rows).
coffee.loc[0:6,["Day","Units Sold"]]

In [None]:
coffee.iloc[0:6,[0,2]] # .iloc is purely positional: rows 0-5 (end-exclusive) and the columns at positions 0 and 2

In [None]:
# Overwrite "Units Sold" for the rows labelled 1 through 4 (.loc slices are end-inclusive).
# The column label must match an existing column exactly: assigning through .loc with an
# unknown label silently adds a brand-new column instead of updating the intended one.
coffee.loc[1:4, "Units Sold"] = 10

coffee.head()

In [None]:
# .at reads or writes exactly one cell, addressed by row label and column label.
# Here it sets "Units Sold" to 100 for the row labelled 0.
coffee.at[0, "Units Sold"] = 100

In [None]:
# sort_values returns a new, sorted frame and leaves `coffee` itself untouched, and a
# notebook cell only renders its last expression, so display the sorted result directly.
coffee.sort_values("Units Sold", ascending=True).head()

In [None]:
# Walk the frame one row at a time; iterrows yields (index label, row-as-Series) pairs.
for idx, row in coffee.iterrows():
    # A single call prints the label, then the row, then two blank lines.
    print(idx, row, "\n", sep="\n")

### Filtering data

In [None]:
# Rows whose height_cm exceeds 215, restricted to the name and height columns, in one .loc call
bios.loc[bios["height_cm"].gt(215), ["name", "height_cm"]]

In [None]:
# Two-step selection: apply the boolean mask first, then pick the column subset
bios[bios["height_cm"] > 215][["name", "height_cm"]]

In [None]:
# Combine two conditions element-wise with &: height_cm above 215 AND born_country equal to "USA"
bios[bios["height_cm"].gt(215) & bios["born_country"].eq("USA")]

In [None]:
# Case-insensitive substring search on the name column.
# na=False treats missing names as non-matches, so the mask stays strictly boolean
# and can be used for indexing even when some names are NaN.
bios[bios["name"].str.contains("Hristo", case=False, na=False)]

In [None]:
# Top 5 tallest overall: narrow to the columns of interest, then take the five largest height_cm values
bios[['name', 'height_cm', 'born_country']].nlargest(5, 'height_cm')


In [None]:
# Tallest person in each country: transform('max') broadcasts every country's maximum
# height back onto its rows, so the comparison keeps each row that equals its group's
# maximum (ties are all kept).
bios.loc[
    bios['height_cm'] == bios.groupby('born_country')['height_cm'].transform('max'),
    ['name', 'born_country', 'height_cm'],
]

In [None]:
# Keep only rows whose born_country is one of the listed countries
bios.loc[
    bios['born_country'].isin(['USA', 'Germany', 'Bulgaria']),
    ['name', 'born_country', 'height_cm'],
]


In [None]:
# Drop rows born in the listed countries (~ inverts the isin mask), then show the first 10
bios.loc[
    ~bios['born_country'].isin({'USA', 'Canada'}),
    ['name', 'born_country', 'height_cm'],
].head(10)


In [None]:
# Names that start with A or H (case-insensitive)
# A character class is used instead of a parenthesised alternation: a capturing group in
# the pattern makes str.contains emit a "has match groups" UserWarning.
bios[bios['name'].str.contains(r'^[AH]', case=False, na=False)][['name']].head(10)  # ^ anchors at start


In [None]:
# Duplicate names (possible homonyms): keep=False flags every occurrence of a repeated
# name, not just the later ones; sorting by name puts the repeats next to each other.
bios.loc[
    bios['name'].duplicated(keep=False),
    ['name', 'born_country', 'height_cm'],
].sort_values('name')


In [None]:
# Data-quality check: list names containing at least one digit (missing names count as non-matches)
bios.loc[bios['name'].str.contains(r'\d', na=False), ['name']]

In [None]:
# Shortest person in each country: order by country and then ascending height, and keep
# only the first row seen for each country (missing heights sort last by default).
(
    bios
    .sort_values(['born_country', 'height_cm'])
    .drop_duplicates(subset='born_country', keep='first')
    .loc[:, ['name', 'born_country', 'height_cm']]
)
