# Pandas Part 73: Interval Methods and String Data Types

This notebook explores more details about the Interval class methods and introduces pandas string data types.

In [None]:
import pandas as pd
import numpy as np

## 1. More on Intervals

Let's explore more features of the Interval class.

### Empty Intervals

An interval is considered empty if it doesn't contain any points.

In [None]:
# A non-degenerate interval contains points, so it is not empty.
iv1 = pd.Interval(0, 1, closed='right')

# Degenerate intervals (left == right) are empty whenever at least one side is open.
iv2 = pd.Interval(0, 0, closed='right')
iv3 = pd.Interval(0, 0, closed='left')
iv4 = pd.Interval(0, 0, closed='neither')

# Closed on both sides, a degenerate interval still contains its single endpoint.
iv5 = pd.Interval(0, 0, closed='both')

# Report every interval together with its is_empty flag, in definition order.
for iv in (iv1, iv2, iv3, iv4, iv5):
    print(f"Interval: {iv}, is_empty: {iv.is_empty}")

### IntervalArray and IntervalIndex with Empty Intervals

In [None]:
# One degenerate open interval (empty) followed by a regular open interval (non-empty).
ivs = [
    pd.Interval(left, right, closed='neither')
    for left, right in [(0, 0), (1, 2)]
]
iv_array = pd.arrays.IntervalArray(ivs)

# is_empty is evaluated per element, giving one boolean for each interval.
print(f"IntervalArray: {iv_array}", f"is_empty: {iv_array.is_empty}", sep="\n")

In [None]:
# Pair an empty interval with a missing value to see how is_empty treats NaN.
ivs = [pd.Interval(0, 0, closed='neither'), np.nan]
iv_index = pd.IntervalIndex(ivs)

# Collect both report lines, then emit them together.
report_lines = [
    f"IntervalIndex: {iv_index}",
    f"is_empty: {iv_index.is_empty}",
]
print("\n".join(report_lines))

### Overlapping Intervals

The `overlaps` method checks whether two Interval objects overlap.

In [None]:
# Three intervals with the default closure: i2 intersects i1, while i3 lies entirely beyond it.
i1, i2, i3 = pd.Interval(0, 2), pd.Interval(1, 3), pd.Interval(4, 5)

# Compare i1 against each of the other intervals in turn.
for label, other in (("i2", i2), ("i3", i3)):
    print(f"i1: {i1}, {label}: {other}, overlaps: {i1.overlaps(other)}")

In [None]:
# Both intervals include the shared endpoint 1, so they overlap.
i4 = pd.Interval(0, 1, closed='both')
i5 = pd.Interval(1, 2, closed='both')
closed_touch = i4.overlaps(i5)

# Here the shared endpoint 1 is excluded by both intervals, so they do not overlap.
i6 = pd.Interval(0, 1, closed='left')
i7 = pd.Interval(1, 2, closed='right')
open_touch = i6.overlaps(i7)

print(f"i4: {i4}, i5: {i5}, overlaps: {closed_touch}")
print(f"i6: {i6}, i7: {i7}, overlaps: {open_touch}")

## 2. String Data Types in Pandas

Pandas provides specialized data types for working with text data. The recommended type is `StringDtype` (with the alias "string").

### Creating StringArray

The `StringArray` is an extension array for string data.

In [None]:
# Requesting dtype="string" makes pd.array build a string extension array;
# the None entry is stored as a missing value.
raw_values = ['This is', 'some text', None, 'data.']
string_array = pd.array(raw_values, dtype="string")

# Report the array along with its Python type and its pandas dtype.
for line in (
    f"StringArray: {string_array}",
    f"Type: {type(string_array)}",
    f"Dtype: {string_array.dtype}",
):
    print(line)

### StringArray vs Object Dtype

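Unlike object dtype arrays, a StringArray only ever holds strings and missing values. Passing non-string values to `pd.array(..., dtype="string")` raises a `ValueError` in pandas 1.0, while later versions convert the values to strings instead.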

In [None]:
# dtype=object keeps every element as the Python object it was given (str, int, float, None).
mixed_values = ['1', 1, 2.5, None]
obj_array = np.array(mixed_values, dtype=object)

print(f"Object array: {obj_array}", f"Dtype: {obj_array.dtype}", sep="\n")

In [None]:
# Build a "string" array from mixed str/int input.
# The outcome depends on the pandas version: pandas 1.0 raises ValueError, while
# later releases convert every non-missing value to str. Report whichever outcome
# occurs so the cell never finishes silently with the result thrown away.
mixed_string_array = None
try:
    mixed_string_array = pd.array(['1', 1], dtype="string")
except ValueError as e:
    print(f"Error: {e}")
else:
    # No exception: show that the integer 1 was stored as the string '1'.
    print(f"No error raised; values were converted to strings: {mixed_string_array}")

### Comparison Operations with StringArray

Comparison operations on a StringArray return a BooleanArray rather than a NumPy boolean array, so missing values propagate as `pd.NA` in the result.

In [None]:
# Build a string array with a missing middle element, then compare it element-wise to "a".
str_arr = pd.array(["a", None, "c"], dtype="string")
result = str_arr == "a"

# Show the input first, then the comparison result with its type and dtype.
print(
    f"StringArray: {str_arr}",
    f"Result of str_arr == 'a': {result}",
    f"Result type: {type(result)}",
    f"Result dtype: {result.dtype}",
    sep="\n",
)

### Creating Series with StringDtype

In [None]:
# Request the string extension dtype explicitly when constructing the Series.
s = pd.Series(data=['This is', 'some text', None, 'data.'], dtype="string")

# Display the Series followed by its dtype.
print(s, f"Dtype: {s.dtype}", sep="\n")

### String Methods with StringDtype

Series backed by a StringArray support the usual string methods.

In [None]:
# Run three .str accessor methods on `s` and print each result under its heading.
for heading, outcome in (
    ("Upper case:", s.str.upper()),
    ("\nLength:", s.str.len()),
    ("\nContains 'text':", s.str.contains('text')),
):
    print(heading)
    print(outcome)

### Converting Between String Types

In [None]:
# Round-trip a Series between object dtype and StringDtype.
# The starting dtype is pinned to "object" explicitly: with default inference the
# result depends on the pandas version and on the `future.infer_string` option,
# which would make the "object dtype" label below inaccurate.
s_obj = pd.Series(['This is', 'some text', None, 'data.'], dtype="object")
print("Series with object dtype:")
print(s_obj)
print(f"Dtype: {s_obj.dtype}")

# object -> StringDtype
s_str = s_obj.astype("string")
print("\nConverted to StringDtype:")
print(s_str)
print(f"Dtype: {s_str.dtype}")

# StringDtype -> object
s_obj_again = s_str.astype("object")
print("\nConverted back to object dtype:")
print(s_obj_again)
print(f"Dtype: {s_obj_again.dtype}")

### Benefits of StringDtype

1. Type safety: Only strings and missing values are allowed
2. Better handling of missing values with pd.NA
3. Potential for future optimizations

In [None]:
# Build a DataFrame holding the same values twice: once with the string extension
# dtype and once with object dtype.
# 'object_col' is pinned to dtype='object' explicitly so that it stays an object
# column even when pandas is configured to infer a string dtype by default.
df = pd.DataFrame({
    'string_col': pd.Series(['a', 'b', None, 'd'], dtype='string'),
    'object_col': pd.Series(['a', 'b', None, 'd'], dtype='object'),
})
print(df)
print("\nDtypes:")
print(df.dtypes)

In [None]:
# Compute the missing-value mask of each column of `df`, then print both masks.
string_mask = df['string_col'].isna()
object_mask = df['object_col'].isna()

print("isna() for string_col:", string_mask, sep="\n")
print("\nisna() for object_col:", object_mask, sep="\n")

### SparseDtype

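The source material also briefly mentions `SparseDtype`, the dtype used for data stored in a `SparseArray`. Sparse storage is useful when a single value (the fill value) is repeated many times, because only the values that differ from it are stored.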

In [None]:
# Integer data with no explicit fill_value: the zeros act as the fill value.
mostly_zeros = [0, 0, 1, 0, 0, 2, 0, 0, 0]
sparse_array = pd.arrays.SparseArray(mostly_zeros)

# Same idea with the roles swapped: 1 is declared as the fill value.
mostly_ones = [1, 1, 0, 1, 1, 0, 1, 1, 1]
sparse_array_custom = pd.arrays.SparseArray(mostly_ones, fill_value=1)

print(f"SparseArray: {sparse_array}")
print(f"Dtype: {sparse_array.dtype}")
print(f"\nSparseArray with fill_value=1: {sparse_array_custom}")
print(f"Dtype: {sparse_array_custom.dtype}")

In [None]:
# Create a Series with sparse values.
# The bare alias dtype="Sparse" means Sparse[float64, nan]: the integers would be
# cast to float and, with NaN as the fill value, none of them would be compressed.
# Spelling out an int64 subtype with fill_value=0 stores only the two non-zero entries.
s_sparse = pd.Series(
    [0, 0, 1, 0, 0, 2, 0, 0, 0],
    dtype=pd.SparseDtype("int64", fill_value=0),
)
print(s_sparse)
print(f"Dtype: {s_sparse.dtype}")

# Memory usage comparison against the same values stored densely.
s_regular = pd.Series([0, 0, 1, 0, 0, 2, 0, 0, 0])
print(f"\nMemory usage of sparse Series: {s_sparse.memory_usage()} bytes")
print(f"Memory usage of regular Series: {s_regular.memory_usage()} bytes")