# Aggregate Operations on Series 

In [2]:
import pandas as pd 
import numpy as np

In [2]:
# Example Series
s = pd.Series([10, 20, 10, 40, 50, 60])

# 1. Aggregation (single function returns scalar, list of functions returns Series)
print("Aggregation with mean:", s.agg('mean'))
print("Aggregation with multiple functions:\n", s.agg(['mean', 'min', 'max']))

# 2. All: Returns True if every value is truthy
print("All values are truthy:", s.all())

# 3. Any: Returns True if at least one value is truthy
print("At least one value is truthy:", s.any())

# 4. Autocorrelation with lag=1
print("Autocorrelation:", s.autocorr(lag=1))

# 5. Correlation with another Series
s2 = pd.Series([10, 20, 30, 40, 50, 60])
print("Pearson correlation with s2:", s.corr(s2))

# 6. Covariance with another Series
print("Covariance with s2:", s.cov(s2))

# 7. Max: Returns maximum value
print("Maximum value:", s.max())

# 8. Min: Returns minimum value
print("Minimum value:", s.min())

# 9. Mean: Returns mean value
print("Mean value:", s.mean())

# 10. Median: Returns median value
print("Median value:", s.median())

# 11. Product: Returns product of values
print("Product of values:", s.prod())

# 12. Quantile: Returns specified quantile
print("50% Quantile:", s.quantile(q=0.5))
print("Quantiles: \n", s.quantile(q=[.1 , .5, .9])) # returns a series 

# 13. Standard Error of Mean (SEM)
print("Standard error of mean:", s.sem())

# 14. Standard Deviation
print("Standard deviation:", s.std())

# 15. Variance
print("Variance:", s.var())

# 16. Skewness
print("Skewness:", s.skew())

# 17. Kurtosis
print("Kurtosis:", s.kurtosis())

# 18. Count of unique items
print("Count of unique items:", s.nunique())

# 19. Count of non-missing items
print("Count of non-missing items:", s.count())

# 20. Size: Number of items in Series
print("Size of Series:", s.size)

# 21. Is Unique: Check if all values are unique
print("All values are unique:", s.is_unique)

# 23. Is Monotonic Increasing
print("Values are monotonic increasing:", s.is_monotonic_increasing)

# 24. Is Monotonic Decreasing
print("Values are monotonic decreasing:", s.is_monotonic_decreasing)

Aggregation with mean: 31.666666666666668
Aggregation with multiple functions:
 mean    31.666667
min     10.000000
max     60.000000
dtype: float64
All values are truthy: True
At least one value is truthy: True
Autocorrelation: 0.7433046224826585
Pearson correlation with s2: 0.9254821475438165
Covariance with s2: 370.0
Maximum value: 60
Minimum value: 10
Mean value: 31.666666666666668
Median value: 30.0
Product of values: 240000000
50% Quantile: 30.0
Quantiles: 
 0.1    10.0
0.5    30.0
0.9    55.0
dtype: float64
Standard error of mean: 8.724168218868268
Standard deviation: 21.36976056643281
Variance: 456.6666666666667
Skewness: 0.23226763043061902
Kurtosis: -2.149821514198946
Count of unique items: 5
Count of non-missing items: 6
Size of Series: 6
All values are unique: False
Values are monotonic increasing: False
Values are monotonic decreasing: False


# Conversion Methods on Series 

In [14]:
# using astype method 
series_1 = pd.Series(data=[12,3,4,5,6,7,8,9])
display(series_1.dtype) # default is int64

# check the range of int64 using numpy 
display(np.iinfo('int64')) #iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)
# clearly this is waste of memory 
display(np.iinfo('int8'))#iinfo(min=-128, max=127, dtype=int8)
# Well! int8 is the best fir for our data 

# convert to int8 
series_1 = series_1.astype('int8')
display(series_1.dtype) # int8

# notics that our data has only positive values and if we know that negative values are not coming then we have uint8
np.iinfo('uint8') # iinfo(min=0, max=255, dtype=uint8)

# convert to uint8 
series_1 = series_1.astype('uint8')
display(series_1.dtype) # uint8

dtype('int64')

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

iinfo(min=-128, max=127, dtype=int8)

dtype('int8')

dtype('uint8')

In [16]:
# we also have float16 and float64
import numpy as np

info = np.finfo(np.float64)
print("Resolution:", info.resolution)
print("Minimum:", info.min)
print("Maximum:", info.max)
print("Data Type:", info.dtype)


Resolution: 1e-15
Minimum: -1.7976931348623157e+308
Maximum: 1.7976931348623157e+308
Data Type: float64


The `finfo` function in NumPy provides information about floating-point data types, specifically their limits and properties. Here’s a breakdown of the parameters in this example:

### Parameters and Properties

1. **`resolution=1e-15`**:
   - This is the **smallest possible difference** between two distinct floating-point numbers of this type (in this case, `float64`) that can still be represented accurately. The resolution here is set to `1e-15`, meaning that numbers smaller than this difference may be indistinguishable or rounded off due to precision limits. For `float64`, this is typically close to machine epsilon, which is approximately `2.22e-16`.

2. **`min=-1.7976931348623157e+308`**:
   - This is the **smallest (most negative)** number that can be represented by a `float64` data type. Any number below this will underflow and be represented as `-inf` (negative infinity). For `float64`, this limit is roughly `-1.7976931348623157 × 10^308`.

3. **`max=1.7976931348623157e+308`**:
   - This is the **largest (most positive)** number that can be represented by a `float64` data type. Any number above this will overflow and be represented as `inf` (positive infinity). The limit for `float64` is approximately `1.7976931348623157 × 10^308`.

4. **`dtype=float64`**:
   - This specifies the **data type** for which the `finfo` information is given, which in this case is `float64`. `float64` refers to a 64-bit floating-point number, which is the standard for representing floating-point numbers in most computing applications.


In [None]:
series_1 = series_1.astype('float16')
print(series_1.dtype)

# float16
np.finfo('float16') # finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

# note: The resolution takes a large hit! 

float16


finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

## Why Conversion matters? - Memory Usage Optimization!

To calculate the memory usage of a Pandas Series, you can use either the `.nbytes` property or the `.memory_usage()` method:

`.nbytes`:

- Shows the memory consumed by the data in the Series alone.
- When working with numeric data, changing data types (e.g., from default integer to Int16) can reduce memory usage significantly.

In [None]:
series_2 = pd.Series(data=[123, 222, 234, 44, 67, 88, 99, 100, 34, 11, 1001])
display(series_2.astype('int64').nbytes) # 88
display(series_2.astype('int16').nbytes)  # 22

88

22

`.memory_usage()`:

- Includes memory consumed by the Series index and, when used with deep=True, counts the memory of objects stored in the Series.
- For object types (like strings), deep=True is needed to include the memory of individual Python objects.

In [29]:
series_3 = pd.Series(data=['banana', 'orange', 'lichi', 'watermelon', 'guava', 'pineapple', 'banana', 'orange', 'lichi', 'watermelon','lichi', 'watermelon', 'guava','banana', 'orange','pineapple', 'banana', 'orange', 'lichi'])

series_3.memory_usage(deep=True)

1341

Memory Optimization with category Data Type:

- Converting object types (like strings) to category can significantly reduce memory usage, as each unique value is stored only once.

In [30]:
series_3.astype('category').memory_usage(deep=True)

706

In [31]:
# Sample DataFrame
df = pd.DataFrame({
    'numeric_str': ['1', '2', '3', None],
    'string_obj': ['apple', 'banana', 'cherry', None],
    'int_col': [1, 2, 3, 4],
    'float_col': [1.1, 2.2, 3.3, None],
})

# Converting data types

# 1. Converting to Python string type
df['numeric_str'] = df['numeric_str'].astype('str')
print("Converted to Python string:\n", df['numeric_str'])

# 2. Converting to Pandas string type (supports pd.NA for missing values)
df['string_obj'] = df['string_obj'].astype('string')
print("\nConverted to Pandas string (supports pd.NA):\n", df['string_obj'])

# 3. Converting to NumPy int64
df['int_col'] = df['int_col'].astype('int64')
print("\nConverted to NumPy int64:\n", df['int_col'])

# 4. Converting to 32-bit signed integer
df['int_col'] = df['int_col'].astype('int32')
print("\nConverted to NumPy int32:\n", df['int_col'])

# 5. Converting to Pandas Int64 (supports pd.NA)
df['int_col_nullable'] = df['numeric_str'].astype('Int64', errors='ignore')
print("\nConverted to Pandas Int64 (supports pd.NA):\n", df['int_col_nullable'])

# 6. Converting to NumPy float64
df['float_col'] = df['float_col'].astype('float64')
print("\nConverted to NumPy float64:\n", df['float_col'])

# 7. Converting to categorical (supports pd.NA)
df['string_obj'] = df['string_obj'].astype('category')
print("\nConverted to categorical:\n", df['string_obj'])

# 8. Converting to datetime (not through astype, use pd.to_datetime)
df['date_str'] = ['2023-01-01', '2023-02-01', None, '2023-04-01']
df['date_col'] = pd.to_datetime(df['date_str'])
print("\nConverted to datetime:\n", df['date_col'])

Converted to Python string:
 0       1
1       2
2       3
3    None
Name: numeric_str, dtype: object

Converted to Pandas string (supports pd.NA):
 0     apple
1    banana
2    cherry
3      <NA>
Name: string_obj, dtype: string

Converted to NumPy int64:
 0    1
1    2
2    3
3    4
Name: int_col, dtype: int64

Converted to NumPy int32:
 0    1
1    2
2    3
3    4
Name: int_col, dtype: int32

Converted to Pandas Int64 (supports pd.NA):
 0       1
1       2
2       3
3    None
Name: int_col_nullable, dtype: object

Converted to NumPy float64:
 0    1.1
1    2.2
2    3.3
3    NaN
Name: float_col, dtype: float64

Converted to categorical:
 0     apple
1    banana
2    cherry
3      <NA>
Name: string_obj, dtype: category
Categories (3, string): [apple, banana, cherry]

Converted to datetime:
 0   2023-01-01
1   2023-02-01
2          NaT
3   2023-04-01
Name: date_col, dtype: datetime64[ns]


In [32]:
# Some other conversions

# Sample Series
s = pd.Series([1, 2, None, 4, '5', 'apple', True])

# 1. Convert types using .convert_dtypes (Pandas 1.x types supporting pd.NA)
s_converted = s.convert_dtypes(
    infer_objects=True,
    convert_string=True,
    convert_integer=True,
    convert_boolean=True,
    convert_floating=True
)
print("Converted to appropriate pandas types (supports NA):\n", s_converted)

# 2. Cast Series to a specific type using .astype
s_int = pd.Series(['1', '2', None, '4']).astype(dtype='Int64', errors='ignore')
print("\nCasted to pandas Int64 with NA support:\n", s_int)

# 3. Convert to datetime using pd.to_datetime
date_series = pd.Series(['2023-01-01', '2023-02-01', None, '2023-04-01'])
date_converted = pd.to_datetime(date_series, errors='raise', dayfirst=False, yearfirst=False)
print("\nConverted to datetime:\n", date_converted)

# 4. Convert Series to NumPy array using .to_numpy
numpy_array = s.to_numpy(dtype=object, copy=True, na_value=np.nan)
print("\nConverted to NumPy array with specified NA handling:\n", numpy_array)

# 5. Convert Series to NumPy array using .values (similar to .to_numpy)
numpy_values = s.values
print("\nConverted to NumPy array using .values:\n", numpy_values)

# 6. Convert Series to DataFrame using .to_frame
s_dataframe = s.to_frame(name='column_name')
print("\nConverted to DataFrame:\n", s_dataframe)

# 7. Define categorical data type using pd.CategoricalDtype
cat_type = pd.CategoricalDtype(categories=['apple', 'banana', 'cherry'], ordered=True)
s_cat = pd.Series(['apple', 'cherry', 'banana', 'apple'], dtype=cat_type)
print("\nSeries with categorical data type:\n", s_cat)

Converted to appropriate pandas types (supports NA):
 0        1
1        2
2     None
3        4
4        5
5    apple
6     True
dtype: object

Casted to pandas Int64 with NA support:
 0       1
1       2
2    <NA>
3       4
dtype: Int64

Converted to datetime:
 0   2023-01-01
1   2023-02-01
2          NaT
3   2023-04-01
dtype: datetime64[ns]

Converted to NumPy array with specified NA handling:
 [1 2 nan 4 '5' 'apple' True]

Converted to NumPy array using .values:
 [1 2 None 4 '5' 'apple' True]

Converted to DataFrame:
   column_name
0           1
1           2
2        None
3           4
4           5
5       apple
6        True

Series with categorical data type:
 0     apple
1    cherry
2    banana
3     apple
dtype: category
Categories (3, object): ['apple' < 'banana' < 'cherry']


# Manipulation Methods on Series

Before starting with manipulation methods in the pandas Series object, let us breifly understand the usage of the lambda function in Python along with map, filter, reduce!

## Lambda functions (in general)

Lambda functions, also known as anonymous functions, are small, single-expression functions defined with the `lambda` keyword in Python. Unlike standard functions created using `def`, lambda functions don’t require a name and are typically used for quick, throwaway functions that are needed for a short period.

### Basic Syntax of Lambda Functions

The syntax for a lambda function is as follows:

```python
lambda arguments: expression
```

- **Arguments**: Lambda functions can have any number of arguments (including zero).
- **Expression**: A single expression that the function evaluates and returns as the result. No explicit `return` statement is needed; the value of the expression is automatically returned.

### Example of a Lambda Function

Here’s a simple example:

```python
add = lambda x, y: x + y
print(add(2, 3))  # Output: 5
```

This creates an anonymous function to add two numbers and assigns it to the variable `add`. The function takes two arguments, `x` and `y`, and returns their sum.

## General syntax for `filter`, `map`, and `reduce` in Python, often used with lambda functions or other callable objects.

### 1. `filter(function, iterable)`

The `filter` function applies a filtering condition to an iterable. It returns only the items for which the condition evaluates to `True`.

```python
filtered_iterable = filter(function, iterable)
```

- **function**: A function that returns `True` or `False` for each item in the iterable.
- **iterable**: The data to filter (e.g., a list or tuple).
- **Returns**: An iterator of elements for which the function returns `True`.

**Example**:

```python
numbers = [1, 2, 3, 4, 5]
even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
print(even_numbers)  # Output: [2, 4]
```

### 2. `map(function, iterable)`

The `map` function applies a function to each item in an iterable and returns a map object with the results.

```python
mapped_iterable = map(function, iterable)
```

- **function**: A function to apply to each element of the iterable.
- **iterable**: The data to transform (e.g., a list or tuple).
- **Returns**: An iterator with the results of applying the function to each element.

**Example**:

```python
numbers = [1, 2, 3, 4]
squared_numbers = list(map(lambda x: x ** 2, numbers))
print(squared_numbers)  # Output: [1, 4, 9, 16]
```

### 3. `reduce(function, iterable, initializer=None)`

The `reduce` function applies a rolling computation to the elements of an iterable, reducing the iterable to a single cumulative result. `reduce` is found in the `functools` module.

```python
from functools import reduce
result = reduce(function, iterable, initializer)
```

- **function**: A function of two arguments, applied cumulatively to reduce the iterable.
- **iterable**: The data to reduce (e.g., a list or tuple).
- **initializer** (optional): A starting value for the reduction; if provided, it’s placed before the first element.
- **Returns**: A single value obtained by reducing the iterable.

**Example**:

```python
numbers = [1, 2, 3, 4]
product = reduce(lambda x, y: x * y, numbers)
print(product)  # Output: 24 (1 * 2 * 3 * 4)
```

### Summary

These functions are useful for data processing:

- **`filter`**: Selects elements based on a condition.
- **`map`**: Transforms each element.
- **`reduce`**: Aggregates all elements into a single result.

### Use Cases for Lambda Functions

Lambda functions are often used in places where short, simple functions are required. Common use cases include:

1. **Sorting with custom keys**:
   ```python
   points = [(1, 2), (3, 1), (5, -1)]
   points.sort(key=lambda x: x[1])
   print(points)  # Sorted by the second item: [(5, -1), (3, 1), (1, 2)]
   ```

2. **Filtering data**:
   ```python
   numbers = [1, 2, 3, 4, 5, 6]
   even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
   print(even_numbers)  # Output: [2, 4, 6]
   ```

3. **Mapping transformations**:
   ```python
   numbers = [1, 2, 3, 4]
   squares = list(map(lambda x: x ** 2, numbers))
   print(squares)  # Output: [1, 4, 9, 16]
   ```

4. **Combining with functions like `reduce`**:
   ```python
   from functools import reduce
   numbers = [1, 2, 3, 4]
   product = reduce(lambda x, y: x * y, numbers)
   print(product)  # Output: 24
   ```

### Comparison with `def` Functions

Lambda functions are syntactically restricted to a single expression, while `def` functions can contain multiple statements, more complex logic, and documentation strings.

**Lambda function**:
```python
double = lambda x: x * 2
```

**def function**:
```python
def double(x):
    return x * 2
```

### Advantages of Lambda Functions

- **Concise and Inline**: Useful for quick, small operations that can be defined in one line.
- **Anonymous**: Lambda functions can be created without names, making them convenient for immediate use and disposal.

### Limitations of Lambda Functions

- **Single Expression**: Lambda functions are limited to a single expression and can't contain statements.
- **Readability**: They can sometimes make the code harder to read, especially if overused or used in complex expressions.

### Using Lambda Functions with Higher-Order Functions

Lambda functions are commonly used with higher-order functions like `map`, `filter`, and `sorted`, as they allow you to create custom behaviors on-the-fly:

```python
# Using map with lambda
nums = [1, 2, 3, 4]
doubled = list(map(lambda x: x * 2, nums))
print(doubled)  # Output: [2, 4, 6, 8]
```

### Nesting Lambda Functions

Lambda functions can also be nested, which can be useful for creating custom key functions for sorting and grouping:

```python
# Nested lambda for sorting
students = [('Alice', 85), ('Bob', 70), ('Charlie', 90)]
sorted_students = sorted(students, key=lambda x: (x[1], x[0]))
print(sorted_students)  # Sorts by score first, then by name
```

### Practical Example: Lambda in Data Processing

Consider a list of dictionary entries, where we want to filter and transform certain data in a pipeline-like manner:

```python
people = [
    {'name': 'Alice', 'age': 28},
    {'name': 'Bob', 'age': 23},
    {'name': 'Charlie', 'age': 25},
]

# Filtering people aged 25 or older, then extracting names
names = list(map(lambda person: person['name'],
                 filter(lambda person: person['age'] >= 25, people)))
print(names)  # Output: ['Alice', 'Charlie']
```

### Conclusion

Lambda functions provide a convenient way to create small, simple functions without the need to formally define them. While useful in many scenarios, they’re best used sparingly in cases where a `def` function might be overkill, such as for short, one-off functions in data processing or as arguments to higher-order functions.