In [None]:
import numpy as np
import pandas as pd

# Create example objects

In [None]:
index = pd.date_range('11/22/2024', periods=8)
index

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

In [None]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=['A', 'B', 'C'])
df

# `head` and `tail`

In [None]:
# Create Series with many many entries
long_series = pd.Series(np.random.randn(1000))

In [None]:
# View first few rows of Series
long_series.head()

In [None]:
# View last few rows of Series
long_series.tail()

# Attributes and underlying data

In [None]:
# Check shape of Series
s.shape

In [None]:
# Check shape of DataFrame
df.shape

In [None]:
# Check axis labels of Series
s.axes

In [None]:
# Check axis labels of DataFrame
df.axes

In [None]:
# Get data from Series
s.array

In [None]:
# Get data from Series Index
s.index.array

In [None]:
# Convert Series to ndarray
s.to_numpy()

In [None]:
# Convert Series to ndarray
np.asarray(s)

In [None]:
# Convert time series to ndarray, preserving timezones
ts = pd.date_range('2024', periods=2, tz='CET')
ts.to_numpy(dtype=object)

In [None]:
# Convert time series to ndarray, dropping timezones
ts.to_numpy(dtype='datetime64[ns]')

In [None]:
# Get data from DataFrame with a single dtype for all columns as ndarray
df.to_numpy()

- Previously, `Series.values` and `DataFrame.values` were used to extract data from `Series` and `DataFrame` objects
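- It is now preferred to use `.array` (available on `Series` and `Index`) or `.to_numpy()` (available on `Series`, `Index`, and `DataFrame`); note that `DataFrame` has no `.array` attribute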
  - `.values` may return an `ndarray` instead of an `ExtensionArray`, which involves expensive copy operations
  - `.array` does not copy
  - `.to_numpy()` makes it explicit that coercion and copying may occur

# Accelerated operations

- `numexpr` library -- Smart chunking, caching, multiple cores
- `bottleneck` library -- Cython routines, esp fast for arrays with `nan` values

In [None]:
# Enabling `numexpr`
# pd.set_option('compute.use_numexpr', True)

In [None]:
# Enabling `bottleneck`
# pd.set_option('compute.use_bottleneck', True)

# Flexible binary operations

## Matching/broadcasting behavior

In [None]:
# Create a DataFrame
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
    'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])
})

df

In [None]:
# Select row from DataFrame
row = df.iloc[1]
row

In [None]:
# Select column from DataFrame
column = df['two']
column

In [None]:
# Subtract row from DataFrame, aligning operation along columns
df.sub(row, axis='columns')

In [None]:
# Subtract column from DataFrame, aligning operation along index
df.sub(column, axis='index')

In [None]:
# Make DataFrame with MultiIndex
dfmi = df.copy()

dfmi.index = pd.MultiIndex.from_tuples(
    [(1, 'a'), (1, 'b'), (1, 'c'), (2, 'a')], names=['first', 'second']
)

dfmi

In [None]:
# Subtract column from MultiIndex DataFrame, aligning operation along second index
dfmi.sub(column, axis='index', level='second')

In [None]:
# Take floor division and modulo operation on Series
r = pd.Series(np.arange(10))

dividend, remainder = divmod(r, 3)

In [None]:
# Quotient (result of the floor division; held in the variable `dividend`)
dividend

In [None]:
# Remainder
remainder

In [None]:
# Perform element-wise divmod()
r1 = pd.Series(np.arange(10))
r2 = pd.Series([2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

dividend, remainder = divmod(r1, r2)

In [None]:
# Quotient (result of the element-wise floor division; held in the variable `dividend`)
dividend

In [None]:
# Remainder
remainder

## Missing data/operations with fill values

In [None]:
# Notice that DataFrame has missing values at
# ('a', 'three')
# ('d', 'one')

df

In [None]:
# Copy DataFrame and fill in a missing value

df2 = df.copy()
df2.loc['a', 'three'] = 1.0

df2

In [None]:
# Add DataFrame and copy
# Places where either DataFrame has missing values are marked with NaN
df + df2

In [None]:
# Add DataFrame and copy
# Use fill_value to be used when just one DataFrame is missing a value
# Where both DataFrames are missing values, the result will still be NaN
df.add(df2, fill_value=0)

## Flexible comparisons

In [None]:
# Find where DataFrame is greater than copy
# Notice that places where one or both DataFrames have missing values are marked with False
df.gt(df2) 

In [None]:
# Find where copy is not equal to DataFrame
# Notice that places where both DataFrames have missing values are marked with True,
# implying that NaN != NaN
df2.ne(df)

## Boolean reductions

In [None]:
# Find where DataFrame is greater than 0
df > 0

In [None]:
# Find which columns of DataFrame have any value greater than 0
(df > 0).any()

In [None]:
# Find whether any value in DataFrame is greater than 0
(df > 0).any().any()

In [None]:
# Check if DataFrame is empty
df.empty

In [None]:
# Check if newly created DataFrame is empty
pd.DataFrame(columns=list('ABC')).empty

## Comparing if objects are equivalent

In [None]:
# Add DataFrame to itself
df + df

In [None]:
# Multiply DataFrame by 2
df * 2

In [None]:
# Check if DataFrame + DataFrame is equal to DataFrame * 2?
# If DataFrame has any missing values, this operation will yield False for the columns that contain them
(df + df == df * 2).all()

In [None]:
# Check if NaN is equal to NaN
np.nan == np.nan

In [None]:
# Check if DataFrame + DataFrame is equal to DataFrame * 2
# Treat NaNs as equal -- i.e. two DataFrames where all corresponding non-NaN values are equal,
# and all NaN values are in the same locations, are considered equal
(df + df).equals(df * 2)

In [None]:
# Create DataFrames with the same index and column values, but in different orders
df1 = pd.DataFrame({
    'col': ['foo', 0, np.nan]
})
df1

In [None]:
# Create DataFrames with the same index and column values, but in different orders
df2 = pd.DataFrame({
    'col': [np.nan, 0, 'foo'],
}, index=[2, 1, 0])
df2

In [None]:
# Check if DataFrames which have the same column values associated with the same index values,
# but where the indices have different orders,
# are equal
df1.equals(df2)

In [None]:
# Reorder the indices of the second DataFrame to match the first
# Check equality
df1.equals(df2.sort_index())

## Comparing array-like objects

In [None]:
# Find where Series values are equal to some given value
pd.Series(['foo', 'bar', 'baz']) == 'foo'

In [None]:
# Find where Index values are equal to some given value
pd.Index(['foo', 'bar', 'baz']) == 'foo'

In [None]:
# Find where Series and Index values match
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

In [None]:
# Find where Series and ndarray values match
pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])

## Combining overlapping data sets

In [None]:
# Create DataFrame with many missing values
df1 = pd.DataFrame({
    'A': [1.0, np.nan, 3.0, 5.0, np.nan],
    'B': [np.nan, 2.0, 3.0, np.nan, 6.0]
})

df1

In [None]:
# Create DataFrame with fewer missing values
df2 = pd.DataFrame({
    'A': [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
    'B': [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0]
})

df2

In [None]:
# Combine df1 with df2 by filling missing values in df1 with values in df2
df1.combine_first(df2)

## General `DataFrame` combine

In [None]:
# Combine DataFrames using custom rule

def combiner(x, y):
    """Pick values element-wise from ``x``, falling back to ``y`` where ``x`` is missing."""
    x_is_missing = pd.isna(x)
    # np.where takes y at the positions flagged as missing and x everywhere else
    return np.where(x_is_missing, y, x)

df1.combine(df2, combiner)


# Descriptive statistics

In [None]:
# Compute mean of all columns in DataFrame
df.mean()

In [None]:
# Compute mean of all rows in DataFrame
df.mean(axis=1)

In [None]:
# Compute sum of all columns in DataFrame
# Return NaN for any columns with NaN
df.sum(skipna=False)

In [None]:
# Compute sum of all columns in DataFrame, skipping NaN values
df.sum()

In [None]:
# Standardize all columns in DataFrame
# (Standardization -- transforming data so mean is 0 and std dev is 1)

df_standardized = (df - df.mean()) / df.std()
df_standardized

In [None]:
# Mean of standardized DataFrame
df_standardized.mean()

In [None]:
# Standard deviation of standardized DataFrame
df_standardized.std()

Common functions for performing statistical calculations on `Series` and `DataFrame` objects:
- `count` -- Number of non-NaN observations
- `sum` -- Sum of values
- `mean` -- Mean of values
- `median` -- Arithmetic median of values
- `min` -- Minimum value
- `max` -- Maximum value
- `mode` -- Mode of values
- `abs` -- Absolute value
- `prod` -- Product of values
- `std` -- Bessel-corrected sample standard deviation
- `var` -- Unbiased variance
- `sem` -- Standard error of the mean
- `skew` -- Sample skewness (3rd moment)
- `kurt` -- Sample kurtosis (4th moment)
- `quantile` -- Sample quantile (value at %)
- `cumsum` -- Cumulative sum
- `cumprod` -- Cumulative product
- `cummax` -- Cumulative maximum
- `cummin` -- Cumulative minimum

In [None]:
# Count number of unique, non-NaN values in Series
r = pd.Series(np.random.randn(500))
r[20:500] = np.nan
r[10:20] = 5
r.nunique()

## Summarizing data: describe

In [None]:
# Generate Series of random numbers
# Make half of the entries NaN
# Get summary statistics

r = pd.Series(np.random.randn(1000))
r[::2] = np.nan
r.describe()

In [None]:
# Generate DataFrame of random numbers
# Make half of the entries NaN
# Get summary statistics

dfr = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
dfr.iloc[::2] = np.nan
dfr.describe()

In [None]:
# Select percentiles to include in Series summary statistics
r.describe(percentiles=[0.05, 0.25, 0.75, 0.95])

In [None]:
# Get summary statistics for non-numeric Series
r = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
r.describe()

In [None]:
# Get summary statistics of numeric columns of a mixed-type DataFrame
df_mixed = pd.DataFrame({
    'a': ['Yes', 'Yes', 'No', 'No'],
    'b': range(4)
})
df_mixed.describe()

In [None]:
# Use include parameter to summarize 'object' column
df_mixed.describe(include=['object'])

In [None]:
# Use include parameter to summarize 'number' column
df_mixed.describe(include=['number'])

In [None]:
# Use include parameter to summarize 'all' columns
df_mixed.describe(include='all')

## Index of min/max values

In [None]:
# Create Series of random numbers
r = pd.Series(np.random.randn(5))
r

In [None]:
# Get indices of minimum and maximum values in Series
r.idxmin(), r.idxmax()

In [None]:
# Create DataFrame of random numbers
dfr = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
dfr

In [None]:
# Get indices of minimum values in each column
dfr.idxmin()

In [None]:
# Get column labels of maximum values in each row
dfr.idxmax(axis=1)

In [None]:
# Create DataFrame with multiple minimum values
df3 = pd.DataFrame(
    [2, 1, 1, 3, np.nan],
    columns=['A'],
    index=list('edcba')
)
df3

In [None]:
# Get index of first occurrence of minimum value in DataFrame
df3.idxmin()

## Value counts (histogramming) / mode

In [None]:
# Create an ndarray of random integers
data = np.random.randint(0, 7, size=50)
data

In [None]:
# Create a Series from the ndarray
# Use value_counts() to generate a histogram of the values
r = pd.Series(data)
r.value_counts()

In [None]:
# Create a dictionary of data
data = {
    'a': [1, 2, 3, 4],
    'b': ['x', 'x', 'y', 'y']
}
data

In [None]:
# Create a DataFrame from the dictionary
# Use value_counts() to count occurrences of each unique row (combination of column values)
df4 = pd.DataFrame(data)
df4.value_counts()

In [None]:
# Get most frequent value in Series
pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]).mode()

In [None]:
# Get most frequent value in DataFrame
pd.DataFrame({
    'A': np.random.randint(0, 7, size=50),
    'B': np.random.randint(-10, 15, size=50)
}).mode()

## Discretization and quantiling

In [None]:
# Create array of random numbers
data = np.random.randn(20)
data

In [None]:
# Bin numbers by value, into 4 bins
pd.cut(data, bins=4)

In [None]:
# Bin numbers by value, using specified bin edges
pd.cut(data, bins=[-5, -1, 0, 1, 5])

In [None]:
# Bin numbers by quantiles
pd.qcut(data, [0, 0.25, 0.5, 0.75, 1])

In [None]:
# Define bins with infinite values
pd.cut(data, bins=[-np.inf, 0, np.inf])

# Function application

## Tablewise function application

In [None]:
# Create DataFrame
dfp = pd.DataFrame({
    "city_and_code": ["Chicago, IL"]
})

dfp

In [None]:
# Define functions to perform on DataFrame

def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column

    Reads the ``city_and_code`` column, keeps the text before the first
    comma, and returns a new DataFrame carrying that text in a
    ``city_name`` column. The DataFrame passed in is left unmodified, so
    the function can be used with ``DataFrame.pipe`` without altering the
    original data.
    """
    city_name = df['city_and_code'].str.split(',').str.get(0)
    # assign() builds a new DataFrame instead of writing into the caller's object
    return df.assign(city_name=city_name)

def add_country_name(df, country_name=None):
    """
    Chicago -> ChicagoUS for city_and_country column

    Appends ``country_name`` directly (no separator) to every value of the
    ``city_name`` column and returns a new DataFrame holding the result in
    a ``city_and_country`` column. The DataFrame passed in is left
    unmodified.

    NOTE(review): the ``None`` default is kept for interface compatibility;
    callers are presumably expected to always pass a string -- confirm.
    """
    col = 'city_name'
    # assign() builds a new DataFrame instead of writing into the caller's object
    return df.assign(city_and_country=df[col] + country_name)

In [None]:
# Apply functions to DataFrame without method chaining
add_country_name(extract_city_name(dfp.copy()), country_name='US')

In [None]:
# Apply functions to DataFrame with method chaining
dfp.pipe(extract_city_name).pipe(add_country_name, country_name='US')

## Row or column-wise function application

In [None]:
# Take mean of DataFrame columns by passing each column to lambda function
df.apply(lambda x: np.mean(x))

In [None]:
# Take mean of DataFrame rows by passing each row to lambda function
df.apply(lambda x: np.mean(x), axis=1)

In [None]:
# Take difference between max and min values of DataFrame columns
df.apply(lambda x: x.max() - x.min()) 

In [None]:
# Take cumulative sum of DataFrame columns
df.apply(np.cumsum) 

In [None]:
# Exponentiate each column of DataFrame
df.apply(np.exp)

In [None]:
# Create time series DataFrame with random data
tsdf = pd.DataFrame(
    np.random.randn(1000, 3),
    columns=['A', 'B', 'C'],
    index=pd.date_range('11/22/2024', periods=1000)
)

tsdf.head()

In [None]:
# Find index (date) where maximum value occurs for each column
tsdf.apply(lambda x: x.idxmax())

In [None]:
# Create DataFrame of ones
df_udf = pd.DataFrame(np.ones((2, 2)))
df_udf

In [None]:
# Use apply to perform a function that takes arguments
def subtract_and_divide(x, sub, divide=1):
    """Subtract ``sub`` from ``x``, then divide the difference by ``divide``."""
    difference = x - sub
    return difference / divide

df_udf.apply(subtract_and_divide, args=(5,), divide=3)

In [None]:
# Create time series DataFrame with random data
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=['A', 'B', 'C'],
    index=pd.date_range('11/22/2024', periods=10)
)

tsdf

In [None]:
# Set some DataFrame rows to NaN
tsdf.iloc[3:7] = np.nan
tsdf

In [None]:
# Use apply to interpolate missing values
tsdf.apply(pd.Series.interpolate)

## Aggregation API

In [None]:
# Create time series DataFrame with random data
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=['A', 'B', 'C'],
    index=pd.date_range('11/22/2024', periods=10)
)

tsdf

In [None]:
# Set some DataFrame rows to NaN
tsdf.iloc[3:7] = np.nan
tsdf

In [None]:
# Take sum of each column using agg and a lambda function
tsdf.agg(lambda x: x.sum())

In [None]:
# Take sum of each column using agg and keyword
tsdf.agg('sum')

In [None]:
# Take sum of each column using sum
tsdf.sum()

### Aggregating with multiple functions

In [None]:
# Take sum and mean of each column
tsdf.agg(['sum', 'mean'])

### Aggregating with a dict

In [None]:
# Take mean of column A and sum of column B
tsdf.agg({'A': 'mean', 'B': 'sum'})

In [None]:
# Take mean and min of column A and sum of column B
tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})

### Custom describe

In [None]:
# Use partial to define quantile functions
from functools import partial

q_25 = partial(pd.Series.quantile, q=0.25)
q_25.__name__ = '25%'

q_75 = partial(pd.Series.quantile, q=0.75)
q_75.__name__ = '75%'

# Use agg to create a describe function
tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max'])


## Transform API

In [None]:
# Create time series DataFrame with random data
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=['A', 'B', 'C'],
    index=pd.date_range('11/22/2024', periods=10)
)
tsdf

In [None]:
# Set some DataFrame rows to NaN
tsdf.iloc[3:7] = np.nan
tsdf

In [None]:
# Take absolute value of DataFrame using transform and a function argument
tsdf.transform(np.abs)

In [None]:
# Take absolute value of DataFrame using transform and keyword argument
tsdf.transform('abs')

In [None]:
# Take absolute value of DataFrame using transform and lambda function
tsdf.transform(lambda x: x.abs())

### Transform with multiple functions

In [None]:
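# Apply two functions independently to the DataFrame: absolute value, and add 1
# (each function produces its own set of result columns; the functions are not chained)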
tsdf.transform([np.abs, lambda x: x + 1])

### Transforming with a dict

In [None]:
# Take absolute value of column A, add 1 to each value of column B
tsdf.transform({'A': np.abs, 'B': lambda x: x + 1})

In [None]:
# Take absolute value of column A; for column B, separately compute (value + 1) and the square root
tsdf.transform({'A': np.abs, 'B': [lambda x: x + 1, np.sqrt]})

## Applying elementwise functions

In [None]:
# Use map to apply a non-vectorizable function to each value in DataFrame
# The lambda function here returns the length of each value's string representation

dfc = df.copy()

dfc.map(lambda x: len(str(x)))

In [None]:
# Create a Series with string values
sstr = pd.Series(['six', 'seven', 'six', 'seven', 'six'], index=['a', 'b', 'c', 'd', 'e'])
sstr

In [None]:
# Create a Series with string index and numerical values
smap = pd.Series({'six': 6, 'seven': 7}, name='numbers')
smap

In [None]:
# Use map to replace values in Series with values in another Series
sstr.map(smap)

# Reindexing and altering labels
- Reorder the existing data to match a new set of labels
- Insert missing value markers in label locations where no data existed for that label
- Fill data for missing labels using logic, as specified

In [None]:
# Create Series of random data
r = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
r

In [None]:
# Reindex Series
r.reindex(['e', 'b', 'f', 'd'])

In [None]:
# Print DataFrame
df

In [None]:
# Simultaneously reindex index and columns of a DataFrame
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

In [None]:
# Demonstrate that a Series and DataFrame can share an index
r = s.reindex(df.index)
r.index is df.index

In [None]:
# Reindex DataFrame's index
df.reindex(['c', 'f', 'b'], axis='index')

In [None]:
# Reindex DataFrame's columns
df.reindex(['three', 'two', 'one'], axis='columns')

## Reindexing to align with another object

In [None]:
# Create a DataFrame that is a reindexed version of another DataFrame
dfr = df.reindex(index=['a', 'b', 'c'], columns=['one', 'two'])
dfr

In [None]:
# Reindex a DataFrame to match another DataFrame's index and columns
df.reindex_like(dfr)

## Aligning objects with each other with `align`

In [None]:
# Create a Series with random data
r = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
r

In [None]:
# Create two sub-Series
r1 = r[:4]
r1

In [None]:
# Create two sub-Series
r2 = r[1:]
r2

In [None]:
# Align Series with default join (outer)
r1.align(r2)

In [None]:
# Align Series with inner join
r1.align(r2, join='inner')

In [None]:
# Align Series with left join
r1.align(r2, join='left')

In [None]:
# Align DataFrame indices and columns, keeping only labels present in both objects
# (align returns new objects, so no defensive copy of df is needed)
df.align(dfr, join='inner')

In [None]:
# Align DataFrames on specified axis
df.align(dfr, axis=0)

## Filling while reindexing

In [None]:
# Create time series Series with random data
rng = pd.date_range('11/22/2024', periods=8)
ts = pd.Series(np.random.randn(8), index=rng)
ts

In [None]:
# Create subset of time series
ts2 = ts.iloc[[0, 3, 6]]
ts2

In [None]:
# Align Series without filling NaN values
ts.align(ts2)

In [None]:
# Reindex ts2 onto the full date index with forward fill
# Dates missing from ts2 are filled with the last preceding value in ts2
# (reindex is used because the `method` keyword of align() is deprecated)
ts2.reindex(ts.index, method='ffill')

In [None]:
# Reindex ts2 onto the full date index with backward fill
# Dates missing from ts2 are filled with the next following value in ts2
# (reindex is used because the `method` keyword of align() is deprecated)
ts2.reindex(ts.index, method='bfill')

In [None]:
# Reindex ts2 onto the full date index with nearest fill
# Dates missing from ts2 are filled with the value at the nearest date in ts2
# (align() does not accept method='nearest'; reindex() does)
ts2.reindex(ts.index, method='nearest')

## Limits on filling while reindexing

In [None]:
# Reindex ts2 with forward fill, filling at most 1 consecutive missing date
# (reindex is used because the `method`/`limit` keywords of align() are deprecated)
ts2.reindex(ts.index, method='ffill', limit=1)

In [None]:
# Reindex ts2 with forward fill, setting a maximum distance between index and indexer value
# (align() has no `tolerance` argument; reindex() does)
ts2.reindex(ts.index, method='ffill', tolerance='1 day')

## Dropping labels from an axis

In [None]:
# Remove rows from DataFrame by reindexing
df.reindex(df.index.difference(['a', 'd']))


In [None]:
# Remove rows from DataFrame with drop
df.drop(['a', 'd'])

In [None]:
# Remove columns from DataFrame
df.drop(['one'], axis=1)

## Renaming / mapping labels

In [None]:
# Show Series with original row labels
s

In [None]:
# Rename Series row labels
s.rename(str.upper)

In [None]:
# Rename index and column labels of DataFrame
df.rename(
    columns={'one': 'foo', 'two': 'bar'},
    index={'a': 'apple', 'b': 'banana', 'd': 'durian'}
)

In [None]:
# Rename column labels of DataFrame
df.rename({'one': 'foo', 'two': 'bar'}, axis='columns')

In [None]:
# Rename row labels of DataFrame
df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index')

In [None]:
# Rename a Series name
s.rename('foo')

In [None]:
# Create DataFrame with MultiIndex
dfm = pd.DataFrame(
    {
        'x': [1, 2, 3, 4, 5, 6],
        'y': [10, 20, 30, 40, 50, 60]
    },
    index=pd.MultiIndex.from_product(
        [['a', 'b', 'c'],
         [1, 2]],
        names=['letter', 'number']
    )
)
dfm

In [None]:
# Rename higher-level index of DataFrame
dfm.rename_axis(index={'letter': 'abc'})

In [None]:
# Rename lower-level index of DataFrame
dfm.rename_axis(index={'number': 'num'})

In [None]:
# Rename entire MultiIndex of DataFrame
dfm.rename_axis(index=str.upper)

# Iteration

In [None]:
# Create DataFrame
dfi = pd.DataFrame(
    {
        'col1': np.random.randn(3),
        'col2': np.random.randn(3)
    },
    index=['a', 'b', 'c']
)

dfi

In [None]:
# Iterate over DataFrame column labels
for col in dfi:
    print(col)

In [None]:
# Create DataFrame
dfi = pd.DataFrame({
    'a': [1, 2, 3],
    'b': ['a', 'b', 'c']
})

dfi

In [None]:
# Setting DataFrame row values in a loop has no effect
for index, row in dfi.iterrows():
    print("Index:", index)
    print("Row:")
    print(row)
    print()
    row['a'] = 10
    
dfi

## items

In [None]:
# Iterate through (column name, col) pairs using items()
for label, col in dfi.items():
    print("Column Label:", label)
    print("Col:")
    print(col)
    print()

## iterrows

In [None]:
# Iterate over (index, row) pairs using iterrows()
for index, row in dfi.iterrows():
    print("Index:", index)
    print("Row:")
    print(row)
    print()

In [None]:
# Create DataFrame with columns of different data types
df_og = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
df_og.dtypes

In [None]:
# Get row from DataFrame with columns of different data types
# Note that row data types are upcast to the highest common type
row = df_og.iloc[0]
row.dtype

## itertuples 

In [None]:
# Iterate over DataFrame rows as namedtuples
# Rows are not turned into Series
# Row data types are preserved and iteration is faster
for row in dfi.itertuples():
    print(row)

# .dt accessor

In [None]:
# Create a time series Series
ts = pd.Series(pd.date_range('20241122 11:11:11', periods=4))
ts

In [None]:
# Get hours from time series
ts.dt.hour

In [None]:
# Get seconds from time series
ts.dt.second

In [None]:
# Get days of week from time series
ts.dt.dayofweek

In [None]:
# Change time zone of time series
ts.dt.tz_localize('UTC').dt.tz_convert('US/Pacific')

In [None]:
# Change date format of time series
ts.dt.strftime('%Y/%m/%d')

# Vectorized string methods

In [None]:
# Create Series of strings
sstr = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
sstr

In [None]:
# Apply method to convert all strings to lowercase
# Excludes NaN values by default
sstr.str.lower()

# Sorting

In [None]:
# Display original DataFrame
df

## By index

In [None]:
# Create DataFrame with unsorted index and column labels
dfun = df.reindex(index=['a', 'd', 'c', 'b'], columns=['three', 'two', 'one'])
dfun

In [None]:
# Sort rows by index ascending order
dfun.sort_index()

In [None]:
# Sort rows by index descending order
dfun.sort_index(ascending=False)

In [None]:
# Sort columns by label ascending (lexicographical) order
dfun.sort_index(axis=1)

In [None]:
# Sort a single column by index ascending order
dfun['one'].sort_index()

In [None]:
# Create MultiIndex DataFrame
dfmi = pd.DataFrame({
    'a': ['B', 'a', 'C'],
    'b': [1, 2, 3],
    'c': [2, 3, 4]
}).set_index(list('ab'))

dfmi

In [None]:
# Sort rows by first index level
dfmi.sort_index(level='a')

In [None]:
# Sort rows by first index level
# Apply function to labels before sorting
dfmi.sort_index(level='a', key=lambda x: x.str.lower())

## By values

In [None]:
# Create DataFrame 
dfs = pd.DataFrame({
    'one': [2, 1, 1, 1],
    'two': [1, 3, 2, 4],
    'three': [5, 4, 3, 2]
})

dfs

In [None]:
# Sort DataFrame rows by values in column 'two'
dfs.sort_values(by='two')

In [None]:
# Sort DataFrame rows by values in columns 'one' and 'two'
dfs.sort_values(by=['one', 'two'])

In [None]:
# Create Series of string values with missing values
sstr = pd.Series(['A', 'B', np.nan, 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
sstr

In [None]:
# Sort Series by values
sstr.sort_values()

In [None]:
# Sort Series by values, putting NaN values at the beginning
sstr.sort_values(na_position='first')

In [None]:
# Create Series of uppercase and lowercase characters
sstr = pd.Series(['B', 'a', 'C'])
sstr

In [None]:
# Sort Series by values, using the default lexicographical order
sstr.sort_values()

In [None]:
# Sort Series by values, first converting to lowercase
sstr.sort_values(key=lambda x: x.str.lower())

## By indexes and values

In [None]:
# Create MultiIndex DataFrame
idx = pd.MultiIndex.from_tuples([
    ('a', 1),
    ('a', 2),
    ('a', 2),
    ('b', 2),
    ('b', 1),
    ('b', 1)
])

idx.names = ['first', 'second']

dfmi = pd.DataFrame(
    {
        'A': np.arange(6, 0, -1)
    },
    index=idx
)

dfmi

In [None]:
# Sort DataFrame by lower level index, then values in column 'A'
dfmi.sort_values(by=['second', 'A'])

## searchsorted

In [None]:
# Create simple Series
dummy = pd.Series([1, 2, 3])
dummy

In [None]:
# Find indices in Series where elements should be inserted to maintain order
dummy.searchsorted([0, 3])

In [None]:
# Find indices in Series where elements should be inserted to maintain order
dummy.searchsorted([0, 4])

## smallest/largest values

In [None]:
# Create Series of the integers 0-9 in random order
p = pd.Series(np.random.permutation(10))
p

In [None]:
# Sort values of Series
p.sort_values()

In [None]:
# Get 3 smallest values in Series
p.nsmallest(3)

In [None]:
# Get 3 largest values in Series
p.nlargest(3)

In [None]:
# Create DataFrame with simple unsorted data
dfun = pd.DataFrame({
    'a': [-2, -1, 1, 10, 8, 11, -1],
    'b': list('abdceff'),
    'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]
})

dfun

In [None]:
# Get rows with the 3 largest values in column 'a'
dfun.nlargest(3, 'a')

In [None]:
# Get rows with the 5 largest values in columns 'a' and 'c'
dfun.nlargest(5, ['a', 'c'])

In [None]:
# Get rows with 3 smallest values in column 'a'
dfun.nsmallest(3, 'a')

In [None]:
# Get rows with 5 smallest values in columns 'a' and 'c'
dfun.nsmallest(5, ['a', 'c'])

## Sorting by a `MultiIndex` column

In [None]:
# Create MultiIndex column DataFrame
dfs.columns = pd.MultiIndex.from_tuples([
    ('a', 'one'),
    ('a', 'two'),
    ('b', 'three')
])

dfs

In [None]:
# Sort DataFrame by values in column ('a', 'two')
dfs.sort_values(by=('a', 'two'))

# Copying
There are only a few ways to modify a `DataFrame` in place:
- Inserting, deleting, or modifying a column
- Assigning to `index` or `columns` attributes
- Directly modifying values via `values` attribute or advanced indexing

Most pandas methods return new objects

# dtypes

**tz-aware datetime:** `datetime64[ns, <tz>]`

**Categorical:** `category`

**period (time spans):** `period[<freq>]`

**sparse:** `Sparse`, `Sparse[int]`, `Sparse[float]`

**intervals:** `interval`, `Interval`, `Interval[<numpy_dtype>]`, `Interval[datetime64[ns, <tz>]]`, `Interval[timedelta64[<freq>]]`

**nullable integer:** `Int8`, `Int16`, `Int32`, `Int64`, `UInt8`, `UInt16`, `UInt32`, `UInt64`    

**nullable float:** `Float32`, `Float64`

**strings:** `string`

**Boolean (with NA):** `boolean`

In [None]:
# Create DataFrame with many different data types
dftypes = pd.DataFrame({
    'A': np.random.rand(3),
    'B': 1,
    'C': 'foo',
    'D': pd.Timestamp('20241122'),
    'E': pd.Series([1.0] * 3).astype('float32'),
    'F': False,
    'G': pd.Series([1] * 3, dtype='int8')
})

dftypes

In [None]:
# Get data types of DataFrame columns
dftypes.dtypes

In [None]:
# Count columns of each data type in DataFrame
dftypes.dtypes.value_counts()

## Defaults

In [None]:
# Create DataFrame with integers to demonstrate default integer dtype
pd.DataFrame([1, 2], columns=['a']).dtypes

In [None]:
# Create DataFrame with floats to demonstrate default float dtype
pd.DataFrame([1.1, 2.2], columns=['a']).dtypes

## Upcasting

## `astype`

In [None]:
# Show original dtypes of DataFrame
df.dtypes

In [None]:
# Change dtype explicitly using astype
df.astype('float32').dtypes

In [None]:
# Create DataFrame with integer values
dfi = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9]
})

dfi

In [None]:
# Show original dtypes of DataFrame
dfi.dtypes

In [None]:
# Convert some columns of DataFrame to another type
dfi[['a', 'b']] = dfi[['a', 'b']].astype(np.uint8)
dfi

In [None]:
# Convert some columns of DataFrame to another type
dfi = dfi.astype(dtype={'a': np.bool_, 'b': np.float64})
dfi.dtypes

## Object conversion

# Selecting columns based on `dtype`