# Pandas Part 71: Pandas Arrays and Timedelta

This notebook explores pandas arrays in more detail and focuses on the Timedelta class for representing durations.

In [None]:
import pandas as pd
import numpy as np
import datetime

## 1. Pandas Arrays

The `pandas.array()` function creates arrays with specific data types. Pandas can infer the appropriate array type based on the input data, or you can explicitly specify the dtype.

### Array Type Inference

Pandas will infer an extension dtype for sequences of specific scalar types:

In [None]:
# No dtype given: integers mixed with None are inferred as a nullable Int64 array
int_array = pd.array([1, 2, 3, None, 5])
print(
    f"Integer array: {int_array}",
    f"Dtype: {int_array.dtype}",
    sep="\n",
)

In [None]:
# Text values with one missing entry; the dtype is left for pandas to infer
str_array = pd.array(['a', 'b', None, 'd'])
print(f"String array: {str_array}", f"Dtype: {str_array.dtype}", sep="\n")

In [None]:
# True/False values with a missing entry; the dtype is left for pandas to infer
bool_array = pd.array([True, False, None, True])
print(
    f"Boolean array: {bool_array}",
    f"Dtype: {bool_array.dtype}",
    sep="\n",
)

In [None]:
# Standard-library datetimes (plus one missing value) handed to pd.array
dt_array = pd.array(
    [
        datetime.datetime(2023, 1, 1),
        datetime.datetime(2023, 1, 2),
        None,
    ]
)
print(f"Datetime array: {dt_array}", f"Dtype: {dt_array.dtype}", sep="\n")

In [None]:
# Standard-library timedeltas (plus one missing value) handed to pd.array
td_array = pd.array(
    [
        datetime.timedelta(days=1),
        datetime.timedelta(hours=5),
        None,
    ]
)
print(f"Timedelta array: {td_array}", f"Dtype: {td_array.dtype}", sep="\n")

### Explicitly Specifying dtype

In [None]:
# Same values as before, but request the nullable 32-bit integer dtype explicitly
int_array_explicit = pd.array([1, 2, 3, None, 5], dtype='Int32')
print(
    f"Integer array (Int32): {int_array_explicit}",
    f"Dtype: {int_array_explicit.dtype}",
    sep="\n",
)

In [None]:
# Request the 'string' extension dtype explicitly instead of relying on inference
str_array_explicit = pd.array(['a', 'b', None, 'd'], dtype='string')
print(
    f"String array (string): {str_array_explicit}",
    f"Dtype: {str_array_explicit.dtype}",
    sep="\n",
)

### Creating Series and DataFrames from Arrays

In [None]:
# Build a Series directly on top of the integer array created earlier
s = pd.Series(data=int_array)
print("Series from integer array:", s, f"Dtype: {s.dtype}", sep="\n")

In [None]:
# Create a DataFrame from arrays
# Every column of a DataFrame must have the same length. int_array holds five
# values while str_array and bool_array hold four, so passing them unchanged
# raises "ValueError: All arrays must be of the same length". Trim each array
# to the length of the shortest one before building the frame.
n_rows = min(len(int_array), len(str_array), len(bool_array))
df = pd.DataFrame({
    'integers': int_array[:n_rows],
    'strings': str_array[:n_rows],
    'booleans': bool_array[:n_rows]
})
print("DataFrame from arrays:")
print(df)
print("\nDtypes:")
print(df.dtypes)

### Extracting Arrays from Series

In [None]:
# Start from a Series that uses the nullable Int64 dtype
s = pd.Series(data=[1, 2, 3, None, 5], dtype='Int64')

# .array hands back the extension array that backs the Series
arr = s.array
print(f"Array extracted from Series: {arr}", f"Dtype: {arr.dtype}", sep="\n")

## 2. Timedelta in Pandas

Pandas `Timedelta` represents a duration, the difference between two dates or times. It's the pandas equivalent of Python's `datetime.timedelta` and is interchangeable with it in most cases.

### Creating Timedeltas

In [None]:
# Four ways to construct a Timedelta

# 1. Parse a human-readable duration string
td1 = pd.Timedelta('1 days 2 hours 3 minutes 4 seconds')
# 2. Spell out each component as a keyword argument
td2 = pd.Timedelta(days=1, hours=2, minutes=3, seconds=4)
# 3. A plain number interpreted in the given unit (86400 seconds is 1 day)
td3 = pd.Timedelta(86400, unit='s')
# 4. Convert a standard-library datetime.timedelta
td4 = pd.Timedelta(datetime.timedelta(days=1, hours=2))

print(
    f"From string: {td1}",
    f"From kwargs: {td2}",
    f"From integer with unit: {td3}",
    f"From datetime.timedelta: {td4}",
    sep="\n",
)

### Timedelta Properties and Attributes

In [None]:
# Sample duration with non-zero day, minute, microsecond and nanosecond parts
td = pd.Timedelta('1 days 2 min 3 us 42 ns')

# Scalar attributes available directly on the Timedelta
print(
    f"Timedelta: {td}",
    f"Days: {td.days}",
    f"Seconds: {td.seconds}",
    f"Microseconds: {td.microseconds}",
    f"Nanoseconds: {td.nanoseconds}",
    sep="\n",
)

# The same duration broken down field by field as a named tuple
components = td.components
print(
    "\nComponents:",
    f"Days: {components.days}",
    f"Hours: {components.hours}",
    f"Minutes: {components.minutes}",
    f"Seconds: {components.seconds}",
    f"Milliseconds: {components.milliseconds}",
    f"Microseconds: {components.microseconds}",
    f"Nanoseconds: {components.nanoseconds}",
    sep="\n",
)

### NumPy Representation (asm8)

In [None]:
# .asm8 exposes each Timedelta as a NumPy timedelta64 scalar.
# Build the four sample durations first, then show each next to its asm8 value.
td1 = pd.Timedelta('1 days 2 min 3 us 42 ns')
td2 = pd.Timedelta('2 min 3 s')
td3 = pd.Timedelta('3 ms 5 us')
td4 = pd.Timedelta(42, unit='ns')

print(f"Timedelta: {td1}", f"asm8: {td1.asm8}", sep="\n")
print(f"\nTimedelta: {td2}", f"asm8: {td2.asm8}", sep="\n")
print(f"\nTimedelta: {td3}", f"asm8: {td3.asm8}", sep="\n")
print(f"\nTimedelta: {td4}", f"asm8: {td4.asm8}", sep="\n")

### Timedelta Arithmetic

In [None]:
# Two operands reused by every example below: one day and half a day
td1 = pd.Timedelta(days=1)
td2 = pd.Timedelta(hours=12)

print(
    # Adding or subtracting two Timedeltas gives another Timedelta
    f"td1 + td2 = {td1 + td2}",
    f"td1 - td2 = {td1 - td2}",
    # Scaling by a number also gives a Timedelta
    f"td1 * 2 = {td1 * 2}",
    f"td1 / 2 = {td1 / 2}",
    # Dividing one Timedelta by another gives a plain float ratio
    f"td1 / td2 = {td1 / td2}",
    sep="\n",
)

### Timedelta with Datetime

In [None]:
# Reference point in time and the offset that will be applied to it
dt = pd.Timestamp('2023-01-01 12:00:00')
td = pd.Timedelta(days=2, hours=6)

print(
    f"Datetime: {dt}",
    f"Timedelta: {td}",
    # Shifting a Timestamp forward by a Timedelta
    f"dt + td = {dt + td}",
    # Shifting a Timestamp backward by a Timedelta
    f"dt - td = {dt - td}",
    sep="\n",
)

### Timedelta Series and DataFrames

In [None]:
# A Series holding four durations of decreasing size
td_series = pd.Series(
    data=[
        pd.Timedelta(days=1),
        pd.Timedelta(hours=12),
        pd.Timedelta(minutes=30),
        pd.Timedelta(seconds=45),
    ]
)
print("Timedelta Series:", td_series, f"Dtype: {td_series.dtype}", sep="\n")

In [None]:
# Multiply every duration in the Series by a scalar
print("2 * td_series:", 2 * td_series, sep="\n")

# Reduce the Series to a single average duration
print("\ntd_series.mean():", td_series.mean(), sep="\n")

In [None]:
# Two columns of durations, three rows each
df = pd.DataFrame(
    data={
        'td1': [
            pd.Timedelta(days=1),
            pd.Timedelta(hours=12),
            pd.Timedelta(minutes=30),
        ],
        'td2': [
            pd.Timedelta(hours=2),
            pd.Timedelta(minutes=15),
            pd.Timedelta(seconds=45),
        ],
    }
)
print("DataFrame with Timedeltas:", df, sep="\n")

# axis=1 adds td1 and td2 together within each row
print("\nSum across columns:", df.sum(axis=1), sep="\n")

### Timedelta Range

In [None]:
# Bounded range: daily steps from 1 day up to and including 5 days
td_range = pd.timedelta_range('1 day', '5 days', freq='1D')
print("Timedelta range (daily):", td_range, sep="\n")

# Open-ended range: five values starting at 1 hour, spaced 30 minutes apart
td_range2 = pd.timedelta_range('1 hour', periods=5, freq='30min')
print("\nTimedelta range (30 minutes):", td_range2, sep="\n")

### Practical Example: Calculating Time Differences

In [None]:
# Three intervals on the same day, each described by a start and an end timestamp
df = pd.DataFrame(
    data={
        'start_time': pd.to_datetime([
            '2023-01-01 08:00:00',
            '2023-01-01 09:30:00',
            '2023-01-01 13:45:00',
        ]),
        'end_time': pd.to_datetime([
            '2023-01-01 10:15:00',
            '2023-01-01 11:00:00',
            '2023-01-01 17:30:00',
        ]),
    }
)
print("DataFrame with start and end times:", df, sep="\n")

# Subtracting two datetime columns gives a timedelta column
df['duration'] = df['end_time'].sub(df['start_time'])
print("\nWith duration:", df, sep="\n")

# .dt.components splits each duration into fields; copy out the hours and
# minutes fields. These are per-field values, not running totals.
df[['hours', 'minutes']] = df['duration'].dt.components[['hours', 'minutes']]
print("\nWith hours and minutes:", df, sep="\n")