# Pandas

## Creating DataFrames

In [None]:
import json
import pandas as pd

class Person:
    """Column labels shared by the person DataFrame examples."""

    # Each constant is the literal string used as a dict key / column name.
    NAME = "name"
    AGE = "age"
    CITY = "city"
    SALARY = "salary"

# From a dict of columns: every key is a column label and every list holds
# that column's values, so all lists must have the same length.
data = {
    Person.NAME: ["Alice", "Bob", "Charlie", "David"],
    Person.AGE: [25, 30, 35, 40],
    Person.CITY: ["NYC", "SF", "LA", "NYC"],
    Person.SALARY: [70000, 80000, 90000, 95000],
}
df_from_dict = pd.DataFrame(data)
# Selecting by a single label prints just that one column.
print(df_from_dict[Person.NAME])

# From a list of dictionaries: every dictionary is one row.
records = [
    {"name": "Alice", "age": 25},
    {"name": "Bob", "age": 30},
]
df_from_list = pd.DataFrame(records)
print(df_from_list)

# From JSON text: decode it into a list of dictionaries first, then pass
# that list to the DataFrame constructor exactly like `records` above.
json_data = '[{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]'
json_parsed = json.loads(json_data)
df_from_json = pd.DataFrame(json_parsed)
print(df_from_json)

## Basic operations

In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np

# Flip to True to print the exploratory summaries below.
is_printable = False

# Fixed seed so the randomly generated sample is identical on every run.
np.random.seed(42)

# 50 synthetic people; the three random draws are made in this order
# (age, city, salary), which determines the generated values.
data = {
    'name': [f'Person_{i}' for i in range(1, 51)],
    'age': np.random.randint(18, 65, 50),
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose'], 50),
    'salary': np.random.randint(30000, 120000, 50)
}

basic_df: DataFrame = pd.DataFrame(data)

if is_printable:
    print("First 5 rows:")
    print(basic_df.head())
    print("\nLast 5 rows:")
    print(basic_df.tail())
    print("\nDataFrame info:")
    # info() writes its report itself and returns None, so it is called
    # directly; wrapping it in print() would emit a stray "None".
    basic_df.info()
    print("\nDescriptive statistics:")
    print(basic_df.describe())
    print("\nMean age:")
    print(basic_df['age'].mean())
    print("\nMedian age:")
    print(basic_df['age'].median())
    print("\nManual median calculation:")

# Manual median of the 'age' column. Only that column is sorted (once):
# taking whole rows of the sorted frame would mix strings and numbers, and
# averaging two such rows fails when the concatenated names are divided by 2.
row_count = len(basic_df)
sorted_ages = basic_df['age'].sort_values()
middle = row_count // 2
if row_count % 2 == 0:
    # Even count: the median is the mean of the two middle values.
    median = (sorted_ages.iloc[middle - 1] + sorted_ages.iloc[middle]) / 2
else:
    # Odd count: the median is the single middle value.
    median = sorted_ages.iloc[middle]
print(median)

if is_printable:
    # Row at position 24 of the age-sorted frame (the lower of the two
    # middle rows for this 50-row sample).
    print(basic_df.sort_values('age').iloc[24])
    print("\nStandard deviation of age:")
    print(basic_df['age'].std())


## Pandera - Pandas Type-checking

In [None]:
import pandas as pd
import pandera.pandas as pa
from pandera.typing import Series, DataFrame

# Pandera model describing the expected columns of a user DataFrame.
# Each annotated attribute names one column and its element type; every
# pa.Field() is called with no arguments, so no extra checks are passed here.
class UserSchema(pa.DataFrameModel):
    name: Series[str] = pa.Field()  # column 'name', annotated as str
    age: Series[int] = pa.Field()  # column 'age', annotated as int
    city: Series[str] = pa.Field()  # column 'city', annotated as str

# Build the plain pandas frame first, then pass it through the schema.
# The annotation marks `df` as a frame typed against UserSchema.
raw_users = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 40],
    "city": ["NYC", "SF", "LA"],
})
df: DataFrame[UserSchema] = UserSchema.validate(raw_users)