# Aggregate Operations on Series 

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Example Series
s = pd.Series([10, 20, 10, 40, 50, 60])

# 1. Aggregation (single function returns scalar, list of functions returns Series)
print("Aggregation with mean:", s.agg('mean'))
print("Aggregation with multiple functions:\n", s.agg(['mean', 'min', 'max']))

# 2. All: Returns True if every value is truthy
print("All values are truthy:", s.all())

# 3. Any: Returns True if at least one value is truthy
print("At least one value is truthy:", s.any())

# 4. Autocorrelation with lag=1
print("Autocorrelation:", s.autocorr(lag=1))

# 5. Correlation with another Series
s2 = pd.Series([10, 20, 30, 40, 50, 60])
print("Pearson correlation with s2:", s.corr(s2))

# 6. Covariance with another Series
print("Covariance with s2:", s.cov(s2))

# 7. Max: Returns maximum value
print("Maximum value:", s.max())

# 8. Min: Returns minimum value
print("Minimum value:", s.min())

# 9. Mean: Returns mean value
print("Mean value:", s.mean())

# 10. Median: Returns median value
print("Median value:", s.median())

# 11. Product: Returns product of values
print("Product of values:", s.prod())

# 12. Quantile: Returns specified quantile
print("50% Quantile:", s.quantile(q=0.5))
print("Quantiles: \n", s.quantile(q=[.1 , .5, .9])) # returns a series 

# 13. Standard Error of Mean (SEM)
print("Standard error of mean:", s.sem())

# 14. Standard Deviation
print("Standard deviation:", s.std())

# 15. Variance
print("Variance:", s.var())

# 16. Skewness
print("Skewness:", s.skew())

# 17. Kurtosis
print("Kurtosis:", s.kurtosis())

# 18. Count of unique items
print("Count of unique items:", s.nunique())

# 19. Count of non-missing items
print("Count of non-missing items:", s.count())

# 20. Size: Number of items in Series
print("Size of Series:", s.size)

# 21. Is Unique: Check if all values are unique
print("All values are unique:", s.is_unique)

# 23. Is Monotonic Increasing
print("Values are monotonic increasing:", s.is_monotonic_increasing)

# 24. Is Monotonic Decreasing
print("Values are monotonic decreasing:", s.is_monotonic_decreasing)

Aggregation with mean: 31.666666666666668
Aggregation with multiple functions:
 mean    31.666667
min     10.000000
max     60.000000
dtype: float64
All values are truthy: True
At least one value is truthy: True
Autocorrelation: 0.7433046224826585
Pearson correlation with s2: 0.9254821475438165
Covariance with s2: 370.0
Maximum value: 60
Minimum value: 10
Mean value: 31.666666666666668
Median value: 30.0
Product of values: 240000000
50% Quantile: 30.0
Quantiles: 
 0.1    10.0
0.5    30.0
0.9    55.0
dtype: float64
Standard error of mean: 8.724168218868268
Standard deviation: 21.36976056643281
Variance: 456.6666666666667
Skewness: 0.23226763043061902
Kurtosis: -2.149821514198946
Count of unique items: 5
Count of non-missing items: 6
Size of Series: 6
All values are unique: False
Values are monotonic increasing: False
Values are monotonic decreasing: False


# Conversion Methods on Series 

In [14]:
# using astype method 
series_1 = pd.Series(data=[12,3,4,5,6,7,8,9])
display(series_1.dtype) # default is int64

# check the range of int64 using numpy 
display(np.iinfo('int64')) #iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)
# clearly this is waste of memory 
display(np.iinfo('int8'))#iinfo(min=-128, max=127, dtype=int8)
# Well! int8 is the best fir for our data 

# convert to int8 
series_1 = series_1.astype('int8')
display(series_1.dtype) # int8

# notics that our data has only positive values and if we know that negative values are not coming then we have uint8
np.iinfo('uint8') # iinfo(min=0, max=255, dtype=uint8)

# convert to uint8 
series_1 = series_1.astype('uint8')
display(series_1.dtype) # uint8

dtype('int64')

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

iinfo(min=-128, max=127, dtype=int8)

dtype('int8')

dtype('uint8')

In [16]:
# we also have float16 and float64
import numpy as np

info = np.finfo(np.float64)
print("Resolution:", info.resolution)
print("Minimum:", info.min)
print("Maximum:", info.max)
print("Data Type:", info.dtype)


Resolution: 1e-15
Minimum: -1.7976931348623157e+308
Maximum: 1.7976931348623157e+308
Data Type: float64


The `finfo` function in NumPy provides information about floating-point data types, specifically their limits and properties. Here’s a breakdown of the parameters in this example:

### Parameters and Properties

1. **`resolution=1e-15`**:
   - This is the **smallest possible difference** between two distinct floating-point numbers of this type (in this case, `float64`) that can still be represented accurately. The resolution here is set to `1e-15`, meaning that numbers smaller than this difference may be indistinguishable or rounded off due to precision limits. For `float64`, this is typically close to machine epsilon, which is approximately `2.22e-16`.

2. **`min=-1.7976931348623157e+308`**:
   - This is the **smallest (most negative)** number that can be represented by a `float64` data type. Any number below this will underflow and be represented as `-inf` (negative infinity). For `float64`, this limit is roughly `-1.7976931348623157 × 10^308`.

3. **`max=1.7976931348623157e+308`**:
   - This is the **largest (most positive)** number that can be represented by a `float64` data type. Any number above this will overflow and be represented as `inf` (positive infinity). The limit for `float64` is approximately `1.7976931348623157 × 10^308`.

4. **`dtype=float64`**:
   - This specifies the **data type** for which the `finfo` information is given, which in this case is `float64`. `float64` refers to a 64-bit floating-point number, which is the standard for representing floating-point numbers in most computing applications.


In [None]:
series_1 = series_1.astype('float16')
print(series_1.dtype)

# float16
np.finfo('float16') # finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

# note: The resolution takes a large hit! 

float16


finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

## Why Conversion matters? - Memory Usage Optimization!

To calculate the memory usage of a Pandas Series, you can use either the `.nbytes` property or the `.memory_usage()` method:

`.nbytes`:

- Shows the memory consumed by the data in the Series alone.
- When working with numeric data, changing data types (e.g., from default integer to Int16) can reduce memory usage significantly.

In [None]:
series_2 = pd.Series(data=[123, 222, 234, 44, 67, 88, 99, 100, 34, 11, 1001])
display(series_2.astype('int64').nbytes) # 88
display(series_2.astype('int16').nbytes)  # 22

88

22

`.memory_usage()`:

- Includes memory consumed by the Series index and, when used with deep=True, counts the memory of objects stored in the Series.
- For object types (like strings), deep=True is needed to include the memory of individual Python objects.

In [29]:
series_3 = pd.Series(data=['banana', 'orange', 'lichi', 'watermelon', 'guava', 'pineapple', 'banana', 'orange', 'lichi', 'watermelon','lichi', 'watermelon', 'guava','banana', 'orange','pineapple', 'banana', 'orange', 'lichi'])

series_3.memory_usage(deep=True)

1341

Memory Optimization with category Data Type:

- Converting object types (like strings) to category can significantly reduce memory usage, as each unique value is stored only once.

In [30]:
series_3.astype('category').memory_usage(deep=True)

706

In [31]:
# Sample DataFrame
df = pd.DataFrame({
    'numeric_str': ['1', '2', '3', None],
    'string_obj': ['apple', 'banana', 'cherry', None],
    'int_col': [1, 2, 3, 4],
    'float_col': [1.1, 2.2, 3.3, None],
})

# Converting data types

# 1. Converting to Python string type
df['numeric_str'] = df['numeric_str'].astype('str')
print("Converted to Python string:\n", df['numeric_str'])

# 2. Converting to Pandas string type (supports pd.NA for missing values)
df['string_obj'] = df['string_obj'].astype('string')
print("\nConverted to Pandas string (supports pd.NA):\n", df['string_obj'])

# 3. Converting to NumPy int64
df['int_col'] = df['int_col'].astype('int64')
print("\nConverted to NumPy int64:\n", df['int_col'])

# 4. Converting to 32-bit signed integer
df['int_col'] = df['int_col'].astype('int32')
print("\nConverted to NumPy int32:\n", df['int_col'])

# 5. Converting to Pandas Int64 (supports pd.NA)
df['int_col_nullable'] = df['numeric_str'].astype('Int64', errors='ignore')
print("\nConverted to Pandas Int64 (supports pd.NA):\n", df['int_col_nullable'])

# 6. Converting to NumPy float64
df['float_col'] = df['float_col'].astype('float64')
print("\nConverted to NumPy float64:\n", df['float_col'])

# 7. Converting to categorical (supports pd.NA)
df['string_obj'] = df['string_obj'].astype('category')
print("\nConverted to categorical:\n", df['string_obj'])

# 8. Converting to datetime (not through astype, use pd.to_datetime)
df['date_str'] = ['2023-01-01', '2023-02-01', None, '2023-04-01']
df['date_col'] = pd.to_datetime(df['date_str'])
print("\nConverted to datetime:\n", df['date_col'])

Converted to Python string:
 0       1
1       2
2       3
3    None
Name: numeric_str, dtype: object

Converted to Pandas string (supports pd.NA):
 0     apple
1    banana
2    cherry
3      <NA>
Name: string_obj, dtype: string

Converted to NumPy int64:
 0    1
1    2
2    3
3    4
Name: int_col, dtype: int64

Converted to NumPy int32:
 0    1
1    2
2    3
3    4
Name: int_col, dtype: int32

Converted to Pandas Int64 (supports pd.NA):
 0       1
1       2
2       3
3    None
Name: int_col_nullable, dtype: object

Converted to NumPy float64:
 0    1.1
1    2.2
2    3.3
3    NaN
Name: float_col, dtype: float64

Converted to categorical:
 0     apple
1    banana
2    cherry
3      <NA>
Name: string_obj, dtype: category
Categories (3, string): [apple, banana, cherry]

Converted to datetime:
 0   2023-01-01
1   2023-02-01
2          NaT
3   2023-04-01
Name: date_col, dtype: datetime64[ns]


In [32]:
# Some other conversions

# Sample Series
s = pd.Series([1, 2, None, 4, '5', 'apple', True])

# 1. Convert types using .convert_dtypes (Pandas 1.x types supporting pd.NA)
s_converted = s.convert_dtypes(
    infer_objects=True,
    convert_string=True,
    convert_integer=True,
    convert_boolean=True,
    convert_floating=True
)
print("Converted to appropriate pandas types (supports NA):\n", s_converted)

# 2. Cast Series to a specific type using .astype
s_int = pd.Series(['1', '2', None, '4']).astype(dtype='Int64', errors='ignore')
print("\nCasted to pandas Int64 with NA support:\n", s_int)

# 3. Convert to datetime using pd.to_datetime
date_series = pd.Series(['2023-01-01', '2023-02-01', None, '2023-04-01'])
date_converted = pd.to_datetime(date_series, errors='raise', dayfirst=False, yearfirst=False)
print("\nConverted to datetime:\n", date_converted)

# 4. Convert Series to NumPy array using .to_numpy
numpy_array = s.to_numpy(dtype=object, copy=True, na_value=np.nan)
print("\nConverted to NumPy array with specified NA handling:\n", numpy_array)

# 5. Convert Series to NumPy array using .values (similar to .to_numpy)
numpy_values = s.values
print("\nConverted to NumPy array using .values:\n", numpy_values)

# 6. Convert Series to DataFrame using .to_frame
s_dataframe = s.to_frame(name='column_name')
print("\nConverted to DataFrame:\n", s_dataframe)

# 7. Define categorical data type using pd.CategoricalDtype
cat_type = pd.CategoricalDtype(categories=['apple', 'banana', 'cherry'], ordered=True)
s_cat = pd.Series(['apple', 'cherry', 'banana', 'apple'], dtype=cat_type)
print("\nSeries with categorical data type:\n", s_cat)

Converted to appropriate pandas types (supports NA):
 0        1
1        2
2     None
3        4
4        5
5    apple
6     True
dtype: object

Casted to pandas Int64 with NA support:
 0       1
1       2
2    <NA>
3       4
dtype: Int64

Converted to datetime:
 0   2023-01-01
1   2023-02-01
2          NaT
3   2023-04-01
dtype: datetime64[ns]

Converted to NumPy array with specified NA handling:
 [1 2 nan 4 '5' 'apple' True]

Converted to NumPy array using .values:
 [1 2 None 4 '5' 'apple' True]

Converted to DataFrame:
   column_name
0           1
1           2
2        None
3           4
4           5
5       apple
6        True

Series with categorical data type:
 0     apple
1    cherry
2    banana
3     apple
dtype: category
Categories (3, object): ['apple' < 'banana' < 'cherry']


In [2]:
s = pd.Series([3.141, 2.718, 1.414])
print(s.round(1))     # Round to 1 decimal
print(s.clip(2, 3))   # Force values between 2 and 3

0    3.1
1    2.7
2    1.4
dtype: float64
0    3.000
1    2.718
2    2.000
dtype: float64


# Manipulation Methods on Series

Before starting with manipulation methods in the pandas Series object, let us breifly understand the usage of the lambda function in Python along with map, filter, reduce!

## Lambda functions (in general)

Lambda functions, also known as anonymous functions, are small, single-expression functions defined with the `lambda` keyword in Python. Unlike standard functions created using `def`, lambda functions don’t require a name and are typically used for quick, throwaway functions that are needed for a short period.

### Basic Syntax of Lambda Functions

The syntax for a lambda function is as follows:

```python
lambda arguments: expression
```

- **Arguments**: Lambda functions can have any number of arguments (including zero).
- **Expression**: A single expression that the function evaluates and returns as the result. No explicit `return` statement is needed; the value of the expression is automatically returned.

### Example of a Lambda Function

Here’s a simple example:

```python
add = lambda x, y: x + y
print(add(2, 3))  # Output: 5
```

This creates an anonymous function to add two numbers and assigns it to the variable `add`. The function takes two arguments, `x` and `y`, and returns their sum.

## General syntax for `filter`, `map`, and `reduce` in Python, often used with lambda functions or other callable objects.

### 1. `filter(function, iterable)`

The `filter` function applies a filtering condition to an iterable. It returns only the items for which the condition evaluates to `True`.

```python
filtered_iterable = filter(function, iterable)
```

- **function**: A function that returns `True` or `False` for each item in the iterable.
- **iterable**: The data to filter (e.g., a list or tuple).
- **Returns**: An iterator of elements for which the function returns `True`.

**Example**:

```python
numbers = [1, 2, 3, 4, 5]
even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
print(even_numbers)  # Output: [2, 4]
```

### 2. `map(function, iterable)`

The `map` function applies a function to each item in an iterable and returns a map object with the results.

```python
mapped_iterable = map(function, iterable)
```

- **function**: A function to apply to each element of the iterable.
- **iterable**: The data to transform (e.g., a list or tuple).
- **Returns**: An iterator with the results of applying the function to each element.

**Example**:

```python
numbers = [1, 2, 3, 4]
squared_numbers = list(map(lambda x: x ** 2, numbers))
print(squared_numbers)  # Output: [1, 4, 9, 16]
```

### 3. `reduce(function, iterable, initializer=None)`

The `reduce` function applies a rolling computation to the elements of an iterable, reducing the iterable to a single cumulative result. `reduce` is found in the `functools` module.

```python
from functools import reduce
result = reduce(function, iterable, initializer)
```

- **function**: A function of two arguments, applied cumulatively to reduce the iterable.
- **iterable**: The data to reduce (e.g., a list or tuple).
- **initializer** (optional): A starting value for the reduction; if provided, it’s placed before the first element.
- **Returns**: A single value obtained by reducing the iterable.

**Example**:

```python
numbers = [1, 2, 3, 4]
product = reduce(lambda x, y: x * y, numbers)
print(product)  # Output: 24 (1 * 2 * 3 * 4)
```

### Summary

These functions are useful for data processing:

- **`filter`**: Selects elements based on a condition.
- **`map`**: Transforms each element.
- **`reduce`**: Aggregates all elements into a single result.

### Use Cases for Lambda Functions

Lambda functions are often used in places where short, simple functions are required. Common use cases include:

1. **Sorting with custom keys**:
   ```python
   points = [(1, 2), (3, 1), (5, -1)]
   points.sort(key=lambda x: x[1])
   print(points)  # Sorted by the second item: [(5, -1), (3, 1), (1, 2)]
   ```

2. **Filtering data**:
   ```python
   numbers = [1, 2, 3, 4, 5, 6]
   even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
   print(even_numbers)  # Output: [2, 4, 6]
   ```

3. **Mapping transformations**:
   ```python
   numbers = [1, 2, 3, 4]
   squares = list(map(lambda x: x ** 2, numbers))
   print(squares)  # Output: [1, 4, 9, 16]
   ```

4. **Combining with functions like `reduce`**:
   ```python
   from functools import reduce
   numbers = [1, 2, 3, 4]
   product = reduce(lambda x, y: x * y, numbers)
   print(product)  # Output: 24
   ```

### Comparison with `def` Functions

Lambda functions are syntactically restricted to a single expression, while `def` functions can contain multiple statements, more complex logic, and documentation strings.

**Lambda function**:
```python
double = lambda x: x * 2
```

**def function**:
```python
def double(x):
    return x * 2
```

### Advantages of Lambda Functions

- **Concise and Inline**: Useful for quick, small operations that can be defined in one line.
- **Anonymous**: Lambda functions can be created without names, making them convenient for immediate use and disposal.

### Limitations of Lambda Functions

- **Single Expression**: Lambda functions are limited to a single expression and can't contain statements.
- **Readability**: They can sometimes make the code harder to read, especially if overused or used in complex expressions.

### Using Lambda Functions with Higher-Order Functions

Lambda functions are commonly used with higher-order functions like `map`, `filter`, and `sorted`, as they allow you to create custom behaviors on-the-fly:

```python
# Using map with lambda
nums = [1, 2, 3, 4]
doubled = list(map(lambda x: x * 2, nums))
print(doubled)  # Output: [2, 4, 6, 8]
```

### Nesting Lambda Functions

Lambda functions can also be nested, which can be useful for creating custom key functions for sorting and grouping:

```python
# Nested lambda for sorting
students = [('Alice', 85), ('Bob', 70), ('Charlie', 90)]
sorted_students = sorted(students, key=lambda x: (x[1], x[0]))
print(sorted_students)  # Sorts by score first, then by name
```

### Practical Example: Lambda in Data Processing

Consider a list of dictionary entries, where we want to filter and transform certain data in a pipeline-like manner:

```python
people = [
    {'name': 'Alice', 'age': 28},
    {'name': 'Bob', 'age': 23},
    {'name': 'Charlie', 'age': 25},
]

# Filtering people aged 25 or older, then extracting names
names = list(map(lambda person: person['name'],
                 filter(lambda person: person['age'] >= 25, people)))
print(names)  # Output: ['Alice', 'Charlie']
```

### Conclusion

Lambda functions provide a convenient way to create small, simple functions without the need to formally define them. While useful in many scenarios, they’re best used sparingly in cases where a `def` function might be overkill, such as for short, one-off functions in data processing or as arguments to higher-order functions.

In [35]:
df = pd.read_csv('./Data/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [59]:
series_age = df['Age'] # this is a Series object 
display(type(series_age))

# I want to get a list of all those age which are > 45
series_age[series_age.apply(lambda age: age>45)].values

pandas.core.series.Series

array([54. , 58. , 55. , 66. , 49. , 65. , 46. , 59. , 71. , 47. , 70.5,
       54. , 47. , 51. , 55.5, 51. , 61. , 56. , 50. , 58. , 45.5, 51. ,
       59. , 54. , 62. , 50. , 52. , 58. , 63. , 65. , 50. , 54. , 61. ,
       45.5, 60. , 46. , 51. , 50. , 64. , 52. , 49. , 65. , 50. , 48. ,
       47. , 48. , 56. , 50. , 63. , 58. , 55. , 71. , 54. , 54. , 47. ,
       50. , 50. , 64. , 62. , 48. , 62. , 53. , 54. , 47. , 60. , 52. ,
       47. , 49. , 49. , 61. , 57. , 80. , 51. , 48. , 56. , 58. , 50. ,
       47. , 70. , 60. , 60. , 52. , 49. , 48. , 52. , 50. , 48. , 70. ,
       48. , 51. , 48. , 57. , 54. , 46. , 49. , 52. , 62. , 74. , 51. ,
       48. , 47. , 47. , 56. ])

In [None]:
series_age[series_age.gt(45)].values # better way since it is vectorized 

array([54. , 58. , 55. , 66. , 49. , 65. , 46. , 59. , 71. , 47. , 70.5,
       54. , 47. , 51. , 55.5, 51. , 61. , 56. , 50. , 58. , 45.5, 51. ,
       59. , 54. , 62. , 50. , 52. , 58. , 63. , 65. , 50. , 54. , 61. ,
       45.5, 60. , 46. , 51. , 50. , 64. , 52. , 49. , 65. , 50. , 48. ,
       47. , 48. , 56. , 50. , 63. , 58. , 55. , 71. , 54. , 54. , 47. ,
       50. , 50. , 64. , 62. , 48. , 62. , 53. , 54. , 47. , 60. , 52. ,
       47. , 49. , 49. , 61. , 57. , 80. , 51. , 48. , 56. , 58. , 50. ,
       47. , 70. , 60. , 60. , 52. , 49. , 48. , 52. , 50. , 48. , 70. ,
       48. , 51. , 48. , 57. , 54. , 46. , 49. , 52. , 62. , 74. , 51. ,
       48. , 47. , 47. , 56. ])

The method `.apply()` allows you to apply a function element-wise to every value. If you pass in a NumPy function that works on an array, it will broadcast the operation to the series.

Because the `.apply` method typically operates on each individual value in the series, the function is called once for every value. If you have one million values in a series, it will be called one million times. It breaks out of the fast vectorized code paths we can leverage in pandas and puts us back to using slow Python code.

## .where() method
The `.where()` method in pandas is used to conditionally replace values in a DataFrame or Series, similar to applying an "if" condition. If a condition is met, it keeps the original value; otherwise, it replaces it with a specified alternative.

### Syntax
```python
DataFrame.where(cond, other=NaN, inplace=False, axis=None, level=None, errors='raise', try_cast=False)
```

- `cond`: Condition to apply (boolean DataFrame/Series).
- `other`: Value to replace where the condition is False.
- `inplace`: If True, performs the operation in place.
- `axis`: Axis along which to perform the operation.

In [69]:
result = df['Age'].where(df['Age'].gt(45),
                         other=0).values
filtered_result = list(filter(lambda x:x>0, result))
display(filtered_result)

[54.0,
 58.0,
 55.0,
 66.0,
 49.0,
 65.0,
 46.0,
 59.0,
 71.0,
 47.0,
 70.5,
 54.0,
 47.0,
 51.0,
 55.5,
 51.0,
 61.0,
 56.0,
 50.0,
 58.0,
 45.5,
 51.0,
 59.0,
 54.0,
 62.0,
 50.0,
 52.0,
 58.0,
 63.0,
 65.0,
 50.0,
 54.0,
 61.0,
 45.5,
 60.0,
 46.0,
 51.0,
 50.0,
 64.0,
 52.0,
 49.0,
 65.0,
 50.0,
 48.0,
 47.0,
 48.0,
 56.0,
 50.0,
 63.0,
 58.0,
 55.0,
 71.0,
 54.0,
 54.0,
 47.0,
 50.0,
 50.0,
 64.0,
 62.0,
 48.0,
 62.0,
 53.0,
 54.0,
 47.0,
 60.0,
 52.0,
 47.0,
 49.0,
 49.0,
 61.0,
 57.0,
 80.0,
 51.0,
 48.0,
 56.0,
 58.0,
 50.0,
 47.0,
 70.0,
 60.0,
 60.0,
 52.0,
 49.0,
 48.0,
 52.0,
 50.0,
 48.0,
 70.0,
 48.0,
 51.0,
 48.0,
 57.0,
 54.0,
 46.0,
 49.0,
 52.0,
 62.0,
 74.0,
 51.0,
 48.0,
 47.0,
 47.0,
 56.0]

The .where() method is ideal for masking specific values based on complex conditions, retaining more control over data transformations compared to basic indexing or filtering.

In [71]:
# Replace values in 'A' less than 30 or in 'B' greater than 45 with NaN
result = df['Age'].where((df['Age'] >= 30) & (df['Age'] <= 45),
                         other=0)
print(result)

0       0.0
1      38.0
2       0.0
3      35.0
4      35.0
       ... 
886     0.0
887     0.0
888     0.0
889     0.0
890    32.0
Name: Age, Length: 891, dtype: float64


The `mask()` function in pandas is similar to `.where()` but works in the opposite way. It replaces values in a DataFrame or Series where a specified condition is False instead of True.

```python
DataFrame.mask(cond, other=NaN, inplace=False, axis=None, level=None, errors='raise', try_cast=False)
```
- cond: A condition (boolean DataFrame/Series) specifying where to replace values.
- other: The value to replace where the condition is True.
- inplace: If True, modifies the data in place.
- axis: The axis along which to apply the operation.

In [72]:
result = df['Age'].mask(df['Age'].gt(45),
                         other=0).values
filtered_result = list(filter(lambda x:x>0, result))
display(filtered_result)

[22.0,
 38.0,
 26.0,
 35.0,
 35.0,
 2.0,
 27.0,
 14.0,
 4.0,
 20.0,
 39.0,
 14.0,
 2.0,
 31.0,
 35.0,
 34.0,
 15.0,
 28.0,
 8.0,
 38.0,
 19.0,
 40.0,
 28.0,
 42.0,
 21.0,
 18.0,
 14.0,
 40.0,
 27.0,
 3.0,
 19.0,
 18.0,
 7.0,
 21.0,
 29.0,
 21.0,
 28.5,
 5.0,
 11.0,
 22.0,
 38.0,
 45.0,
 4.0,
 29.0,
 19.0,
 17.0,
 26.0,
 32.0,
 16.0,
 21.0,
 26.0,
 32.0,
 25.0,
 0.83,
 30.0,
 22.0,
 29.0,
 28.0,
 17.0,
 33.0,
 16.0,
 23.0,
 24.0,
 29.0,
 20.0,
 26.0,
 23.0,
 34.0,
 34.0,
 28.0,
 21.0,
 33.0,
 37.0,
 28.0,
 21.0,
 38.0,
 14.5,
 22.0,
 20.0,
 17.0,
 21.0,
 29.0,
 24.0,
 2.0,
 21.0,
 32.5,
 32.5,
 12.0,
 24.0,
 45.0,
 33.0,
 20.0,
 29.0,
 25.0,
 23.0,
 19.0,
 37.0,
 16.0,
 24.0,
 22.0,
 24.0,
 19.0,
 18.0,
 19.0,
 27.0,
 9.0,
 36.5,
 42.0,
 22.0,
 40.5,
 16.0,
 30.0,
 44.0,
 40.0,
 26.0,
 17.0,
 1.0,
 9.0,
 45.0,
 28.0,
 4.0,
 1.0,
 21.0,
 18.0,
 30.0,
 36.0,
 9.0,
 1.0,
 4.0,
 45.0,
 40.0,
 36.0,
 32.0,
 19.0,
 19.0,
 3.0,
 44.0,
 42.0,
 24.0,
 28.0,
 34.0,
 18.0,
 2.0,
 32.0,
 26.0,
 16.

### How to write if-elif-else in Pandas?

In [78]:
# if-else in pandas 
df['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

### Age Group Division

1. **Child**: 0–12 years
2. **Teen**: 13–17 years
3. **Young Adult**: 18–35 years
4. **Middle-Aged Adult**: 36–55 years
5. **Senior Adult**: 56–75 years
6. **Elderly**: 76+ years

In [None]:
# np.select(condlist, choicelist, default=0)
df['Age Category'] = pd.Series(
    np.select([df['Age'].le(12),
               np.logical_and(df['Age'].ge(13), df['Age'].le(17)),
               np.logical_and(df['Age'].ge(18), df['Age'].le(35)),
               np.logical_and(df['Age'].ge(36), df['Age'].le(55)),
               np.logical_and(df['Age'].ge(56), df['Age'].le(75)),
               df['Age'].ge(76)
    ],
              ['Child',
               'Teen',
               'Young Adult',
               'Middle-Aged Adult',
               'Senior Adult',
               'Elderly'
              ],
              'Other')
)

In [85]:
df['Age Category'].value_counts()

Age Category
Young Adult          384
Other                178
Middle-Aged Adult    177
Child                 69
Teen                  44
Senior Adult          38
Elderly                1
Name: count, dtype: int64

## Handling Missing Data

In [88]:
# which are the columns having missing data?
df.columns[df.isna().any()]

Index(['Age', 'Cabin', 'Embarked'], dtype='object')

In [89]:
# consider the cabin series 
df['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [90]:
# how many null values are present?
df['Cabin'].isna().sum()

687

In [95]:
# say, we want to find the values of Embarked for the missing values of the cabin

df['Embarked'].loc[df['Cabin'].isna()]

0      S
2      S
4      S
5      Q
7      S
      ..
884    S
885    Q
886    S
888    S
890    Q
Name: Embarked, Length: 687, dtype: object

In [94]:
# suppose i want to get the index 
df.index[df['Cabin'].isna()]

Index([  0,   2,   4,   5,   7,   8,   9,  12,  13,  14,
       ...
       878, 880, 881, 882, 883, 884, 885, 886, 888, 890],
      dtype='int64', length=687)

In [100]:
# filling in missing data 
# let us fill the missing dat

df['Age'].fillna(df['Age'].agg('mean')).unique()

array([22.        , 38.        , 26.        , 35.        , 29.69911765,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [101]:
df['Cabin'].fillna('Other').value_counts()

Cabin
Other          687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 148, dtype: int64

## Handling Outliers 

The `clip()` method in pandas is used to limit the values in a DataFrame or Series to within a specified range. This method is useful for capping values at a minimum or maximum threshold, which can help handle outliers or restrict data within a specific range.

### Syntax

```python
DataFrame.clip(lower=None, upper=None, axis=None, inplace=False)
```

- **lower**: The minimum threshold. Values below this will be set to `lower`.
- **upper**: The maximum threshold. Values above this will be set to `upper`.
- **axis**: Specifies which axis to apply the clipping to (default is 0).
- **inplace**: If `True`, modifies the original data instead of creating a new one.

### Example Usage

1. **Basic Clipping with Lower and Upper Limits**

   ```python
   import pandas as pd

   data = pd.Series([1, 5, 10, 15, 20])
   clipped_data = data.clip(lower=5, upper=15)
   print(clipped_data)
   ```

   **Output:**
   ```
   0     5
   1     5
   2    10
   3    15
   4    15
   dtype: int64
   ```

   - In this example, values below `5` are set to `5` (like the value `1`), and values above `15` are set to `15` (like the value `20`).

2. **Clipping Only the Upper Limit**

   ```python
   data = pd.Series([2, 4, 6, 8, 10])
   clipped_data = data.clip(upper=6)
   print(clipped_data)
   ```

   **Output:**
   ```
   0    2
   1    4
   2    6
   3    6
   4    6
   dtype: int64
   ```

   - Here, only the `upper` limit is specified. Values above `6` are capped at `6`.

3. **Clipping Using Lower and Upper Values with DataFrames**

   ```python
   df = pd.DataFrame({
       'A': [1, 6, 11, 16],
       'B': [2, 7, 12, 17]
   })
   clipped_df = df.clip(lower=5, upper=15)
   print(clipped_df)
   ```

   **Output:**
   ```
       A   B
   0   5   5
   1   6   7
   2  11  12
   3  15  15
   ```

   - Each value is clipped between `5` and `15`. Values below `5` are replaced with `5`, and values above `15` are replaced with `15`.

The `clip()` method is useful in data preprocessing when you need to restrict data values to a specific range, such as setting a maximum cap on outliers.

In [111]:
df['Age'].unique()

array([22.        , 38.        , 26.        , 35.        , 29.69911765,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [112]:
# remove outliers from age 
df["Age"] = df['Age'].fillna(df["Age"].agg('mean'))
df['Age'].clip(
    lower=df['Age'].quantile(0.25),
    upper=df['Age'].quantile(0.75),
    inplace=True
)
df['Age'].unique()

array([22.        , 35.        , 26.        , 29.69911765, 27.        ,
       31.        , 34.        , 28.        , 29.        , 28.5       ,
       32.        , 25.        , 30.        , 33.        , 23.        ,
       24.        , 32.5       , 23.5       , 24.5       , 30.5       ,
       34.5       ])

## Sorting values using sort_values()

In [None]:
series_4 = pd.Series(data=[5,44,23,67,88,90,23,14,56,22,556,34,2])
series_4.sort_values(ascending=False)

10    556
5      90
4      88
3      67
8      56
1      44
11     34
2      23
6      23
9      22
7      14
0       5
12      2
dtype: int64

In [116]:
series_4.sort_values(key=lambda x: -2*(x**2)+20*x+20, ascending=False)

0       5
12      2
7      14
9      22
2      23
6      23
11     34
1      44
8      56
3      67
4      88
5      90
10    556
dtype: int64

## Dealing with Duplicates 

In [None]:
series_5 = pd.Series(data=[10,20,20,30,40,50,50,60])
display(series_5.drop_duplicates())
display(series_5.drop_duplicates(keep='last')) # keeps the last occurence of the duplicated value - note the index 
display(series_5.drop_duplicates(keep='first'))
display(series_5.drop_duplicates(keep=False))

0    10
1    20
3    30
4    40
5    50
7    60
dtype: int64

0    10
2    20
3    30
4    40
6    50
7    60
dtype: int64

0    10
1    20
3    30
4    40
5    50
7    60
dtype: int64

0    10
3    30
4    40
7    60
dtype: int64

## Ranking Data 
Assigns ranks to elements in the Series. Useful for finding relative positions.

In [3]:
s = pd.Series([100, 200, 50, 100, 300])
s.rank()

0    2.5
1    4.0
2    1.0
3    2.5
4    5.0
dtype: float64

## Replacing Values 
Replaces specific values in the Series.

In [None]:
s = pd.Series([1, 2, 3, 4, 5])
s.replace({1: 100, 2: 200})

0    100
1    200
2      3
3      4
4      5
dtype: int64

## Binning Values - `pd.cut()`
Bins values into discrete intervals you define (manual buckets). Great for categorizing continuous data.

## Quantile-based Binning - `pd.qcut()`
Quantile-based binning. Automatically divides data into equal-sized groups based on distribution (percentiles).