In [1]:
# Importing essential data analysis libraries
import pandas as pd
import numpy as np


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [1]:
class DataPrepKit:
    """
    Thin wrapper around a pandas DataFrame used for data preparation.

    The instance keeps a single DataFrame in its 'data' attribute and can
    replace it with the contents of a CSV, Excel, or JSON file.

    Attributes:
    - data (pd.DataFrame): The DataFrame currently held by the instance.

    Methods:
    - __init__(self, data): Stores the given DataFrame on the instance.
    - read_data(self, file_path, file_format): Loads a file with pandas and replaces the 'data' attribute.

    Usage:
    >>> prep_kit = DataPrepKit(data)
    >>> prep_kit.read_data('example.csv', 'csv')
    """

    def __init__(self, data):
        """
        Store the starting DataFrame.

        Parameters:
        - data (pd.DataFrame): The DataFrame to keep in the 'data' attribute.
        """
        self.data = data

    def read_data(self, file_path, file_format):
        """
        Load a file with the matching pandas reader and store the result in 'data'.

        Parameters:
        - file_path (str): Location of the file; handed unchanged to the pandas reader.
        - file_format (str): One of 'csv', 'excel', or 'json'.

        Any other file_format prints "Invalid file format." and leaves 'data' untouched.

        Usage:
        >>> prep_kit.read_data('example.csv', 'csv')
        """
        # Format name paired with the pandas reader that handles it.
        loaders = (
            ('csv', pd.read_csv),
            ('excel', pd.read_excel),
            ('json', pd.read_json),
        )
        for known_format, loader in loaders:
            if file_format == known_format:
                self.data = loader(file_path)
                return
        # No reader matched: report it and keep the current data.
        print("Invalid file format.")


In [12]:
def data_summary(self):
    """
    Print a summary of the DataFrame held in 'self.data'.

    Each section is printed under its own heading, followed by one combined
    table of the per-column statistics. Nothing is returned.

    Output:
    - Average: Mean of each numeric column (non-numeric columns are skipped).
    - Most frequent values: The first mode of each column.
    - Describe: The result of DataFrame.describe().
    - Head: The first rows of the DataFrame.
    - Tail: The last rows of the DataFrame.
    - Info: The text produced by DataFrame.info().
    - Missing Values: The count of missing values in each column.
    - Unique Values: The count of unique values in each column.
    - A final table combining the per-column sections (Average, Most
      frequent values, Missing Values, Unique Values), one row per column.

    Usage:
    >>> prep_kit = DataPrepKit(data)
    >>> prep_kit.data_summary()
    """
    import io  # local import: only needed to capture the text of info()

    frame = self.data

    # DataFrame.info() writes to stdout and returns None, so capture its text
    # in a buffer; otherwise it prints before the banner and the 'Info'
    # section would show "None".
    info_buffer = io.StringIO()
    frame.info(buf=info_buffer)

    # mode() has no rows when the frame has no rows; fall back to an
    # all-missing Series instead of failing on iloc[0].
    modes = frame.mode()
    if len(modes):
        most_frequent = modes.iloc[0]
    else:
        most_frequent = pd.Series(index=frame.columns, dtype=object)

    print("Data Summary")
    print("==========================")
    summary = {
        # numeric_only avoids a TypeError on text columns in recent pandas.
        'Average': frame.mean(numeric_only=True),
        'Most frequent values': most_frequent,
        'Describe': frame.describe(),
        'Head': frame.head(),
        'Tail': frame.tail(),
        'Info': info_buffer.getvalue(),
        'Missing Values': frame.isnull().sum(),
        'Unique Values': frame.nunique()
    }
    for key, value in summary.items():
        print(key)
        print("===============")
        print(value)

    # Only the Series-valued sections are indexed by column name, so only
    # those can be laid side by side in one table.
    per_column = {
        key: value for key, value in summary.items()
        if isinstance(value, pd.Series)
    }
    print(pd.DataFrame(per_column))


In [3]:
def drop_duplicates(self):
    """
    Drop repeated rows from 'self.data' in place.

    Rows are compared across every column and the first occurrence of each
    repeated row is kept. The existing DataFrame object is modified directly,
    so other references to it see the change. Nothing is returned.

    Usage:
    >>> prep_kit = DataPrepKit(data)
    >>> prep_kit.drop_duplicates()
    """
    frame = self.data
    # subset=None and keep='first' are the pandas defaults, spelled out here.
    frame.drop_duplicates(subset=None, keep='first', inplace=True)


In [4]:
def handle_missing_values(data, handling_Type):
    """
    Handle missing values in tabular data according to the chosen strategy.

    The input is first converted with pd.DataFrame(data), so anything that
    constructor accepts (a DataFrame, a dict of columns, a list of records)
    can be passed. A new DataFrame is returned.

    Parameters:
    - data (pd.DataFrame or DataFrame-like): The input data containing missing values.
    - handling_Type (str): The strategy for handling missing values ('mean', 'median', or 'drop').

    Returns:
    - pd.DataFrame: The data with missing values handled:
      'mean'   - numeric columns are filled with their column mean;
      'median' - numeric columns are filled with their column median;
      'drop'   - every row containing a missing value is removed.
      With 'mean' and 'median', non-numeric columns are left as they are.

    Raises:
    - ValueError: If an unsupported missing value handling strategy is provided.

    Usage:
    >>> cleaned_data = handle_missing_values(input_data, 'mean')
    """
    df = pd.DataFrame(data)
    # Statistics are taken from the converted frame rather than the raw
    # argument, so non-DataFrame input works. numeric_only skips text
    # columns, which would otherwise raise in recent pandas.
    if handling_Type == 'mean':
        return df.fillna(df.mean(numeric_only=True))
    elif handling_Type == 'median':
        return df.fillna(df.median(numeric_only=True))
    elif handling_Type == 'drop':
        return df.dropna()
    else:
        raise ValueError("Unsupported missing value handling strategy")


In [5]:
def encode_categorical_data(self):
    """
    One-hot encode the object-typed columns of 'self.data'.

    Columns whose dtype is selected by select_dtypes(include='object') are
    expanded with pd.get_dummies; the resulting DataFrame then replaces
    'self.data'. Nothing is returned.

    Usage:
    >>> prep_kit = DataPrepKit(data)
    >>> prep_kit.encode_categorical_data()
    """
    frame = self.data
    # Names of the columns pandas reports as object dtype.
    object_columns = frame.select_dtypes(include='object').columns
    self.data = pd.get_dummies(frame, columns=object_columns)
