- In this notebook, we will explore some of the most commonly used helper functions in machine learning projects, each documented with a [Google-style docstring](https://www.sphinx-doc.org/en/master/usage/extensions/example_google.html#example-google).

    - In this format, the docstring starts with a short summary of the function, followed by a more detailed description of the parameters and return values, if any.

    - This format allows for easier readability and understandability of the code and its purpose.

    - It's also easier to navigate and understand the code when you are working with other people on the same project.
    

- These functions cover topics such as data preprocessing (feature scaling, feature selection, feature importance) and model evaluation.


- This notebook will be updated regularly with more functions.

## For Data Exploration

In [None]:
def check_data(df, head=5):
    """
    Print a quick overview of a dataframe.

    The report contains, in this order: ``df.info()``, the shape, the first
    and last ``head`` rows, the number of missing values per column and
    ``df.describe`` (transposed) with an extended list of percentiles.

    Parameters:
    - df: Dataframe containing the data
    - head: Number of rows to show from the top and from the bottom of the dataframe

    Returns:
    - None. Everything is written to standard output.
    """
    print(20*"-" + "Information".center(20) + 20*"-")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    df.info()
    print(20*"-" + "Data Shape".center(20) + 20*"-")
    print(df.shape)
    # The section titles and the row counts both follow the `head` argument.
    print("\n" + 20*"-" + f"The First {head} Data".center(20) + 20*"-")
    print(df.head(head))
    print("\n" + 20 * "-" + f"The Last {head} Data".center(20) + 20 * "-")
    print(df.tail(head))
    print("\n" + 20 * "-" + "Missing Values".center(20) + 20 * "-")
    print(df.isnull().sum())
    print("\n" + 40 * "-" + "Describe the Data".center(40) + 40 * "-")
    print(df.describe([0.01, 0.05, 0.10, 0.50, 0.75, 0.90, 0.95, 0.99]).T)
    
    
# Usage example: call the function with your dataframe as the argument.
# NOTE(review): `df` is not defined in the visible cells - presumably it is loaded beforehand; confirm.
check_data(df)

## Separate your data into categorical and quantitative columns

In [None]:
import numpy as np

def separate_columns(df):
    """
    Separate the categorical and quantitative columns from a given dataframe.

    Columns backed by a NumPy dtype are classified as before: ``object``
    columns are categorical and every ``np.number`` subtype is quantitative.
    Columns backed by a pandas extension dtype are classified as well:
    ``category`` and string dtypes are categorical, nullable numeric dtypes
    (e.g. ``Int64``) are quantitative. Boolean, datetime and any other
    columns are left out of both lists.

    Parameters:
    - df: Dataframe containing the data

    Returns:
    - Tuple containing two lists, one for categorical and one for quantitative columns
    """
    # Imported locally so the function does not rely on another cell having imported pandas.
    import pandas as pd
    from pandas.api.types import is_bool_dtype, is_numeric_dtype, is_string_dtype

    categorical_columns = []
    quantitative_columns = []

    # Iterating over df.dtypes avoids the df[col] lookup, which returns a
    # DataFrame (without a .dtype attribute) when column labels are duplicated.
    for col, dtype in df.dtypes.items():
        if isinstance(dtype, np.dtype):
            if dtype == object:
                categorical_columns.append(col)
            elif np.issubdtype(dtype, np.number):
                quantitative_columns.append(col)
        # np.issubdtype raises TypeError on pandas extension dtypes, so they
        # are classified through the pandas type helpers instead.
        elif isinstance(dtype, pd.CategoricalDtype) or is_string_dtype(dtype):
            categorical_columns.append(col)
        elif is_numeric_dtype(dtype) and not is_bool_dtype(dtype):
            quantitative_columns.append(col)

    return categorical_columns, quantitative_columns

# Usage example: the function returns the categorical column names first and the quantitative ones second.
# NOTE(review): `df` is not defined in the visible cells - presumably it is loaded beforehand; confirm.
cat_cols, quan_cols = separate_columns(df)

## Detect outliers and fix them through a single function

In [None]:
import pandas as pd
import numpy as np

def detect_and_fix_outliers(df, columns=None, fix=True):
    """
    Detect and fix (if specified) outliers in the quantitative columns of a given dataframe.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column.

    Parameters:
    - df: Dataframe containing the data. It is modified in place.
    - columns: List of quantitative columns to check for outliers. If None, all quantitative columns will be checked.
    - fix: Boolean value indicating whether or not to fix the outliers.
      If True, outliers are replaced with the median of the remaining
      (non-outlier) values of the column; missing values already present in
      the column are filled with that same median. If False, the data is left
      untouched and a '<column>_Outlier' column is added that holds 1 on
      outlier rows and NaN elsewhere.

    Returns:
    - The same dataframe object, with outliers fixed or flagged
    """
    if columns is None:
        # Separate quantitative columns if not provided
        _, columns = separate_columns(df)

    for col in columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)

        if fix:
            # Series.mask blanks the outliers and upcasts integer columns to
            # float on its own; writing NaN into an integer column through
            # .loc is rejected by recent pandas versions.
            cleaned = df[col].mask(is_outlier) if is_outlier.any() else df[col]
            # Assign the result back explicitly: `df[col].fillna(..., inplace=True)`
            # operates on a temporary Series and does not reach `df` under
            # pandas Copy-on-Write.
            df[col] = cleaned.fillna(cleaned.median())
        else:
            # Flag the outlier rows with 1 in a new '<column>_Outlier' column.
            df.loc[is_outlier, f"{col}_Outlier"] = 1
    return df

## Scale your numerical columns

In [None]:
from sklearn.preprocessing import StandardScaler

def standardize_data(df, columns=None):
    """
    Standardize the numerical features of a given dataset.

    The selected columns are overwritten with the output of
    ``StandardScaler().fit_transform``.

    Parameters:
    - df: Dataframe containing the data. It is modified in place.
    - columns: List of numerical columns to standardize. If None, all numerical columns will be standardized.

    Returns:
    - The same dataframe object with standardized numerical columns. When there
      are no columns to standardize it is returned unchanged.
    """
    if columns is None:
        # Separate numerical columns if not provided
        _, columns = separate_columns(df)
    # StandardScaler refuses an input with zero features, so an empty
    # selection is a no-op rather than an error.
    if len(columns) == 0:
        return df
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

## Encode the categorical features.
##### (OneHotEncoder or LabelEncoder)
    

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_features(df, categorical_features):
    """
    One-hot encode the specified categorical features in the dataframe.

    Parameters:
    - df: Dataframe containing the data. It is not modified.
    - categorical_features: List of column names for the categorical features to encode

    Returns:
    - New dataframe in which the listed columns are replaced by their one-hot
      encoded columns (appended after the remaining original columns). With an
      empty feature list a plain copy of the dataframe is returned.
    """
    # The encoder cannot be fitted on zero columns; nothing to encode.
    if len(categorical_features) == 0:
        return df.copy()

    # create an instance of the one-hot encoder
    one_hot = OneHotEncoder()

    # fit and transform the one-hot encoder using the categorical feature data
    one_hot_encoded = one_hot.fit_transform(df[categorical_features])

    # get_feature_names was removed in scikit-learn 1.2; fall back to it only
    # on old releases that do not have get_feature_names_out yet.
    try:
        encoded_names = one_hot.get_feature_names_out(categorical_features)
    except AttributeError:
        encoded_names = one_hot.get_feature_names(categorical_features)

    # Reuse the original index: with a fresh RangeIndex pd.concat would align
    # on labels and produce NaN-padded rows for any non-default index.
    one_hot_encoded_df = pd.DataFrame(
        one_hot_encoded.toarray(), columns=encoded_names, index=df.index
    )

    # add the one-hot encoded columns to the original dataframe
    df = pd.concat([df, one_hot_encoded_df], axis=1)

    # drop the original categorical features
    df = df.drop(columns=categorical_features)

    return df

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def label_encode_features(df, categorical_features):
    """
    Replace each listed categorical column with its label-encoded values.

    Parameters:
    - df: Dataframe containing the data; the listed columns are overwritten in place
    - categorical_features: List of column names for the categorical features to encode

    Returns:
    - The same dataframe with the categorical features label encoded
    """
    # A single encoder instance is reused; fit_transform refits it for every column.
    encoder = LabelEncoder()

    for column_name in categorical_features:
        encoded_values = encoder.fit_transform(df[column_name])
        df[column_name] = encoded_values

    return df

## Split the dataframe into:
### [training and testing] or [training, validation and testing] sets

In [None]:
from sklearn.model_selection import train_test_split

def split_data(df, target, test_size=0.2, random_state=42):
    """
    Split the dataframe into a training part and a testing part.

    Parameters:
    - df: Dataframe containing the data
    - target: Name of the target column
    - test_size: Proportion of data to use for testing
    - random_state: Seed for the random number generator

    Returns:
    - Tuple (X_train, X_test, y_train, y_test)
    """
    # Everything except the target column is a feature.
    feature_frame = df.drop(target, axis=1)
    target_series = df[target]
    return tuple(
        train_test_split(
            feature_frame,
            target_series,
            test_size=test_size,
            random_state=random_state,
        )
    )

In [None]:
from sklearn.model_selection import train_test_split

def split_data(df, target, test_size=0.2, valid_size=0.1, random_state=42):
    """
    Split the dataframe into training, validation and testing sets.

    Parameters:
    - df: Dataframe containing the data
    - target: Name of the target column
    - test_size: Proportion of data to use for testing
    - valid_size: Proportion of data to use for validation. A float is
      interpreted relative to the whole dataframe (like test_size); an int is
      passed on to train_test_split unchanged as an absolute number of rows.
    - random_state: Seed for the random number generator

    Returns:
    - Tuple (X_train, X_valid, X_test, y_train, y_valid, y_test)
    """
    X = df.drop(target, axis=1)
    y = df[target]
    X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # train_test_split interprets a float relative to the data it receives.
    # The second split only sees the rows left after removing the test set, so
    # the fraction is rescaled to keep valid_size a proportion of the full
    # dataframe (0.1 then yields 10% of all rows instead of 10% of the remainder).
    relative_valid_size = valid_size
    if isinstance(valid_size, float):
        relative_valid_size = valid_size * len(X) / len(X_train_valid)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=relative_valid_size, random_state=random_state)
    return X_train, X_valid, X_test, y_train, y_valid, y_test

## Plot the feature importances of a given model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_importance(model, features, modelName, num=None, save_path=None, metric=None):
    """
    Plot the feature importances of a given model as a horizontal bar chart.

    Parameters:
    model: The trained model to extract the feature importances from
    features: The feature dataset used to train the model (its columns label the bars)
    modelName: The model class/function (its ``__name__`` is used) or a plain
        string; shown in the plot title
    num: Number of top features to plot, default is None (plot all features)
    save_path: The path to save the image file of the plot, default is None.
        When given, the figure is saved instead of shown.
    metric: Name of the pipeline step whose ``scores_`` attribute holds the
        importances, default is None (use ``model.feature_importances_``)

    Returns:
    None
    """
    if metric is None:
        importances = model.feature_importances_
    else:
        importances = model.named_steps[metric].scores_
    feature_imp = pd.DataFrame({'Value': importances, 'Feature': features.columns})

    if num is None:
        num = len(features.columns)
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    # Accept both a class/function and an ordinary string as the model name;
    # a string has no __name__ attribute.
    title_name = getattr(modelName, "__name__", str(modelName))

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features, orient='h')
    plt.title('Features' + ' - ' + title_name)
    plt.tight_layout()

    # The plot itself is identical in both cases; only the output differs.
    if save_path is not None:
        plt.savefig(save_path)
    else:
        plt.show()

## Measure the accuracy of a model
[Classification and Regression models]


In [None]:
def measure_model_accuracy(model: object, X_test: object, y_test: object) -> None:
    """
    Measure the accuracy of a classification model using a classification report and a confusion matrix with a heatmap.

    Parameters:
    model (object): The trained model that you want to measure the accuracy of
    X_test (object): The test data set that you want to use to evaluate your model
    y_test (object): The true labels for the test data set

    Returns: None. The report is printed and the heatmap is shown.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.utils.multiclass import unique_labels

    # Get predictions
    y_pred = model.predict(X_test)
    # Get classification report
    print(classification_report(y_test, y_pred))
    # Sorted labels occurring in either vector; this is the ordering
    # confusion_matrix uses by default, made explicit so the axes can show it.
    labels = unique_labels(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred, labels=labels)
    # fmt='d' prints the counts as integers; the default format switches to
    # scientific notation for counts of 100 and above.
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def measure_model_accuracy(model, X_test, y_test):
    """
    Report the error of a regression model as MAE and MSE.

    Parameters:
    model (object): The trained model
    X_test (object): The test data set
    y_test (object): The true values for the test data set

    Returns: None. Both metrics are printed.
    """
    predictions = model.predict(X_test)
    # Mean absolute error first, then mean squared error.
    print(f'MAE: {mean_absolute_error(y_test, predictions)}')
    print(f'MSE: {mean_squared_error(y_test, predictions)}')

## As mentioned earlier, this notebook will be updated regularly with more functions, so stay tuned for future versions.