In [None]:
### Author: Anuvrat Chaturvedi
### Date: 13-Feb-2024
### Purpose: Common helper functions for the project

### Helper Functions Used Across Project


In [2]:
# Library definitions required for helper functions
import pandas as pd
import numpy as np
import re  # Built-in. No package installation required
import plotly.graph_objects as go

In [4]:
# Convert object to float after stripping the symbols , + - *
def obt_to_float(ser: pd.Series) -> pd.Series:  # Typehinting implemented
    """
    Description:
    Convert a pandas Series of strings to float after deleting every comma, plus sign,
    minus sign and asterisk found in each value.
    - Removes unwanted characters: ',', '+', '-' and '*' are deleted wherever they occur.
    - Handles missing values: values that are empty after stripping (e.g. "" or a lone "-")
      are replaced with NaN.
    - Converts to float: The cleaned Series is converted to float data type.

    Note:
    The symbols are deleted, not interpreted. "-500" therefore becomes 500.0 (the sign is
    lost) and "3*2" becomes 32.0 (the remaining digits are joined). Use qc_obj_to_float to
    list the values affected by this.

    Parameters:
    - ser (pd.Series): The input pandas Series containing string values to be converted to float.

    Returns:
    - pd.Series: A new pandas Series containing the converted float values.

    Raises:
    - ValueError: If any cleaned value still cannot be converted to float.
    - AttributeError: If `ser` does not support the pandas `.str` accessor.

    Example:
    >>> import pandas as pd
    >>> data = pd.Series(["1,000", "-500", "2.5", "3*2", "-"])
    >>> result = obt_to_float(data)
    >>> print(result)
    0    1000.0
    1     500.0
    2       2.5
    3      32.0
    4       NaN
    dtype: float64
    """

    # One pass with a raw-string character class: avoids invalid escape sequences and
    # does not depend on the pandas-version-specific default of `regex`.
    stripped = ser.str.replace(r"[,+\-*]", "", regex=True)

    # Anything that was only symbols is now "", which float() cannot parse -> mark as NaN.
    return stripped.replace("", np.nan).astype(float)

In [8]:
# Check conversion from object to float for special cases:
def qc_obj_to_float(oldser: pd.Series, newser: pd.Series) -> None:
    """
    Description:
    This function compares the values of an original pandas Series (oldser) with the
    converted pandas Series (newser) and prints one line for every original value that
    contains a '+', a '*', or a '-' inside a string longer than one character.
    It helps with quality control by highlighting values that might have been transformed
    unexpectedly during conversion. A lone "-" is not reported.

    Both Series are read by position, so any index (default, filtered, string labels) is
    supported; the printed "Index" is the 0-based position of the value.
    Entries of oldser that are not strings (e.g. NaN) are skipped.

    Warning:
    This function doesn't raise exceptions for special cases; it only prints them.

    Parameters:
    - oldser (pd.Series): The original pandas Series with object-type values.
    - newser (pd.Series): The pandas Series after attempting the conversion to float.
      It must have at least as many elements as oldser.

    Returns:
    None

    Example:
    >>> import pandas as pd
    >>> old_data = pd.Series(["1,000", "-500", "2.5", "3*2"])
    >>> new_data = obt_to_float(old_data)
    >>> qc_obj_to_float(old_data, new_data)
    Index: 1,      Old value: -500,      New value: 500.0
    Index: 3,      Old value: 3*2,       New value: 32.0
    """

    for position, old_value in enumerate(oldser):
        # Missing / non-string entries carry no symbols to inspect.
        if not isinstance(old_value, str):
            continue

        has_symbol = (
            "+" in old_value
            or ("-" in old_value and len(old_value) > 1)
            or "*" in old_value
        )
        if has_symbol:
            # Positional lookup keeps old and new values paired even when the
            # Series index is not 0..n-1.
            print(
                f"Index: {position}, \t Old value: {old_value}, \t "
                f"New value: {newser.iloc[position]}"
            )

In [9]:
# Check number and % of missing values for every column of a pandas dataframe
def perc_missing_num(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the number and percentage of missing values for every column of a
    pandas DataFrame (columns of any dtype are included, not only numeric ones).

    The function also prints the number of records and a short heading before returning.

    Parameters:
    - df (pd.DataFrame): The input pandas DataFrame.

    Returns:
    - pd.DataFrame: One row per input column with the fields `column`, `number_missing`
      and `percent_missing`, sorted by `percent_missing` in ascending order. The row labels
      are the original column positions, so they are not in 0..n-1 order after sorting.

    Example:
    >>> import pandas as pd
    >>> data = {'A': [1, 2, None, 4], 'B': [5.0, None, None, 8.0], 'C': [9, 10, 11, 12]}
    >>> df = pd.DataFrame(data)
    >>> result = perc_missing_num(df)
    Number of records in dataset: 4
    Missing records summary:
    >>> print(result)
      column  number_missing  percent_missing
    2      C               0              0.0
    0      A               1             25.0
    1      B               2             50.0
    """

    number_missing = df.isnull().sum()
    percent_missing = number_missing * 100 / len(df)

    # drop=True discards the column-label index outright. Resetting and then dropping a
    # column literally called "index" fails when df.columns is named or is a MultiIndex.
    missing_value_df = pd.DataFrame(
        {
            "column": df.columns,
            "number_missing": number_missing,
            "percent_missing": percent_missing,
        }
    ).reset_index(drop=True)

    print(f"Number of records in dataset: {len(df)}")
    print("Missing records summary:")
    return missing_value_df.sort_values("percent_missing")

In [10]:
# Create SPLOMs using plotly for given dataframe and features
def createSPLOM(
    inpdf: pd.DataFrame,
    label_value_dict: dict,
    text: str,
    color: str,
    title: str,
    color_map: "dict | None" = None,
) -> None:
    """
    Create a Scatter Plot Matrix (SPLOM) using Plotly for visualizing relationships between features in a DataFrame.

    Args:
    - inpdf (pd.DataFrame): The input DataFrame containing the data to visualize.
    - label_value_dict (dict): Maps the axis label to show for each dimension (key) to the
      DataFrame column that supplies its values (value).
    - text (str): The column name in the DataFrame to use for hover text on the SPLOM points.
    - color (str): The column name in the DataFrame whose categories select the marker colors.
    - title (str): The title to display on the SPLOM.
    - color_map (dict, optional): Maps each category found in the `color` column to a color
      passed to Plotly. When omitted, the predefined mapping for the categories
      "Deep Blue", "Light Blue", "Purple", "Light Red" and "Deep Red" is used.

    Returns:
    - None. The figure is displayed with `fig.show()`.

    Notes:
    - The SPLOM shows the pairwise relationships between the dimensions in `label_value_dict`;
      the diagonal and the upper half of the matrix are hidden.
    - Requires the Plotly library to be installed.
    - Categories of the `color` column that are missing from the mapping are mapped to NaN
      by `Series.map`, so the mapping should cover every category present.
    - The layout uses drag-to-select, a width of 1200, a height of 1000 and
      closest-point hover.

    Example:
    >>> createSPLOM(df, {"Feature1": "column1", "Feature2": "column2"}, "hover_text", "color_category", "SPLOM Title")
    """

    # Fall back to the project's predefined category -> color table.
    if color_map is None:
        color_map = {
            "Deep Blue": "navy",
            "Light Blue": "blue",
            "Purple": "fuchsia",
            "Light Red": "red",
            "Deep Red": "maroon",
        }

    # One SPLOM dimension per (label, column) pair, in dictionary order.
    dimensions = [
        dict(label=label, values=inpdf[column])
        for label, column in label_value_dict.items()
    ]

    fig = go.Figure(
        data=go.Splom(
            dimensions=dimensions,
            diagonal_visible=False,
            showupperhalf=False,
            text=inpdf[text],
            marker=dict(
                color=inpdf[color].map(color_map),
                showscale=False,  # colors encode categorical variables
                line_color="white",
                line_width=0.5,
            ),
        )
    )

    fig.update_layout(
        title=title,
        dragmode="select",
        width=1200,
        height=1000,
        hovermode="closest",
    )

    fig.show()