<a href="https://colab.research.google.com/github/ariahosseini/TradML/blob/main/MachineLearningUtils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, sys, warnings, itertools
import pandas as pd
import numpy as np
from scipy.stats import linregress
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

In [None]:
def summarize_columns(df):
    """Build a per-column overview of a DataFrame.

    Args:
        df: The pandas DataFrame to summarize.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``col_name``, ``dtypes``, ``missing`` (null count),
        ``missing_percent`` (null share in percent, one decimal),
        ``uniques`` (distinct non-null values) and ``first_value`` /
        ``second_value`` / ``third_value`` (values of the first three rows,
        ``None`` when ``df`` has fewer rows than that).
    """
    num_rows = len(df)
    # Build the frame directly from the column labels instead of relying on
    # reset_index() producing a column literally called "index" (it does not
    # when the columns Index carries a name).
    summary = pd.DataFrame(
        {
            "col_name": list(df.columns),
            "dtypes": df.dtypes.values,
            "missing": df.isnull().sum().values,
        }
    )
    if num_rows:
        summary["missing_percent"] = (summary["missing"] * 100 / num_rows).round(1)
    else:
        # No rows means nothing can be missing; avoid dividing by zero.
        summary["missing_percent"] = 0.0
    summary["uniques"] = df.nunique().values
    sample_labels = ("first_value", "second_value", "third_value")
    for position, label in enumerate(sample_labels):
        # Short frames simply have no value to show for the later samples.
        summary[label] = df.iloc[position].values if position < num_rows else None
    return summary

In [None]:
def reduce_memory_usage(df, category = False):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Signed integers, unsigned integers and floats are each narrowed within
    their own family, based on the column's min and max. Columns of any other
    dtype (bool, datetime, categorical, nullable extension types, ...) are
    left untouched. Memory usage before and after is printed.

    Note: float columns are narrowed on range alone, so values may lose
    precision when a column is stored as float16 or float32.

    Args:
        df: The pandas DataFrame to shrink; it is modified in place.
        category: When true, object/string columns are converted to the
            pandas ``category`` dtype.

    Returns:
        None. The changes are applied to ``df`` itself.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / (1024**2)
    print("Memory usage of dataframe is {:.2f} MB!".format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object or isinstance(col_type, pd.StringDtype):
            if category:
                df[col] = df[col].astype("category")
            continue
        # Only plain NumPy integer/float columns can be narrowed safely;
        # bool, datetime, timedelta and extension dtypes are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == "f":
            candidates, limits_of = float_types, np.finfo
        elif col_type.kind == "i":
            candidates, limits_of = signed_types, np.iinfo
        else:
            candidates, limits_of = unsigned_types, np.iinfo
        for candidate in candidates:
            limits = limits_of(candidate)
            # Comparisons with NaN (empty or all-NaN column) are False, so
            # such a column falls through every candidate and stays as is.
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate) != col_type:
                    df[col] = df[col].astype(candidate)
                break
    end_mem = df.memory_usage().sum() / (1024**2)
    print("Memory usage after optimization is {:.2f} MB!".format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decrease))

In [None]:
def display_df(df, message = " "):
    """Print a quick overview of a DataFrame.

    Shows the label, the row/column counts, the first rows, the output of
    ``df.info()`` and the per-column null counts for columns that contain
    nulls.

    Args:
        df: The pandas DataFrame to describe.
        message: Label printed after "Dataframe: ".
    """
    print("Dataframe: {}".format(message))
    num_rows = len(df)
    num_cols = len(df.columns)
    print("num_rows = {:,} \nnum_cols = {:,}".format(num_rows, num_cols))
    display(df.head())
    print("Info:")
    # df.info() writes its report itself and returns None, so wrapping it in
    # print() would only add a stray "None" line.
    df.info()
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts != 0]
    if null_counts.empty:
        print("Number of null data points: 0")
    else:
        print("Number of null data points:")
        print(null_counts)

In [None]:
def plot_varibles(df, vars_to_plot, cts_vars, num_cols=2, hist_num_bins=20):
    """Draw one subplot per variable on a grid in a new figure.

    Variables listed in ``cts_vars`` are drawn as histograms; every other
    variable is drawn as a bar chart of its value counts.

    Args:
        df: DataFrame holding the columns to plot.
        vars_to_plot: Column names to plot, in grid order (row by row).
        cts_vars: Collection of names to draw as histograms.
        num_cols: Number of subplot columns in the grid.
        hist_num_bins: Number of histogram bins (cast to ``int``).
    """
    full_rows, leftover = divmod(len(vars_to_plot), num_cols)
    # A partially filled last row still needs its own grid row.
    num_rows = full_rows + 1 if leftover != 0 else full_rows
    plt.figure(figsize=(num_cols*5, num_rows*5))
    grid_shape = (num_rows, num_cols)
    for position, var in enumerate(vars_to_plot):
        # divmod turns the running position into (row, col) in row-major order.
        plt.subplot2grid(grid_shape, divmod(position, num_cols))
        if var not in cts_vars:
            df[var].value_counts().plot(kind="bar", title="{} Counts".format(var))
            continue
        df[var].hist(bins=int(hist_num_bins), xrot=45)
        plt.title("{} Histogram".format(var))

In [None]:
def plot_regs(df, cts_vars, response, num_cols=3, dot_size=10, line_width=3):
    """Plot a regression scatter of ``response`` against each variable.

    One subplot is drawn per entry of ``cts_vars``, laid out row by row on a
    grid with ``num_cols`` columns. Each legend shows the fitted line from
    ``scipy.stats.linregress``. Grid cells without a variable are hidden.

    Args:
        df: DataFrame holding the predictor and response columns.
        cts_vars: Names of the predictor columns to plot.
        response: Name of the response column.
        num_cols: Number of subplot columns in the grid.
        dot_size: Marker size passed to the scatter.
        line_width: Width of the regression line.
    """
    num_vars = len(cts_vars)
    if num_vars == 0:
        # Nothing to draw; plt.subplots cannot build a grid with zero rows.
        return
    num_rows = num_vars // num_cols
    if num_vars % num_cols != 0:
        num_rows += 1
    # squeeze=False keeps the axes array 2-D even for a single row or column,
    # and only one figure is created (no leftover blank figure).
    fig, axes = plt.subplots(num_rows, num_cols,
                             figsize=(num_cols*5, num_rows*5), squeeze=False)
    flat_axes = axes.ravel()
    for axis, var in zip(flat_axes, cts_vars):
        slope, intercept, *_ = linregress(df[var], df[response])
        sns.regplot(x=df[var], y=df[response], ax=axis,
                    scatter_kws={"s": dot_size},
                    line_kws={"linewidth": line_width},
                    label="y={0:.1f}x+{1:.1f}".format(slope, intercept))
        axis.legend(loc="best")
    for axis in flat_axes[num_vars:]:
        axis.set_visible(False)
    fig.subplots_adjust(top=0.95)
    plt.tight_layout()
    plt.show()

In [None]:
def display_side_by_side(dfs: list, captions: list, table_spacing=5):
    """Render several DataFrames next to each other in one HTML output.

    Each frame is styled as an inline table with its caption and followed by
    ``table_spacing`` non-breaking spaces. Frames and captions are paired in
    order; pairing stops at the shorter of the two lists.

    Args:
        dfs: DataFrames to display.
        captions: Caption for each DataFrame, in the same order.
        table_spacing: Number of non-breaking spaces placed after each table.
    """
    # No @staticmethod here: this is a module-level function, and a
    # staticmethod object outside a class is not callable before Python 3.10.
    spacer = table_spacing * "\xa0"
    fragments = []
    for caption, df in zip(captions, dfs):
        styled = df.style.set_table_attributes("style='display:inline'").set_caption(caption)
        fragments.append(styled._repr_html_() + spacer)
    display(HTML("".join(fragments)))

In [None]:
def one_hot_encode(df, ohe, var_list, drop_original=True):
    """Append encoded columns produced by an already-fitted encoder.

    Args:
        df: Source DataFrame; it is not modified.
        ohe: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out`` (presumably a scikit-learn
            ``OneHotEncoder`` - confirm against callers).
        var_list: Names of the columns passed to ``ohe.transform``.
        drop_original: When true, the columns in ``var_list`` are removed
            from the result.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the columns of
        ``df`` followed by the encoded columns.
    """
    encoded = ohe.transform(df[var_list])
    if hasattr(encoded, "toarray"):
        # Sparse encoder output must be densified before building a frame.
        encoded = encoded.toarray()
    # np.asarray drops any index the encoder output carries, so the encoded
    # rows line up positionally with the re-indexed source frame.
    encoded_df = pd.DataFrame(np.asarray(encoded), columns=ohe.get_feature_names_out())
    result = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)
    if drop_original:
        result = result.drop(columns=var_list)
    return result

In [None]:
def plot_conf_matrix(cm, classes, normalize=False, title="Confusion matrix", cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heatmap on the current figure.

    The matrix is also printed to stdout. With ``normalize=True`` each row is
    divided by its sum before printing and plotting, so both the colors and
    the cell labels show row-wise fractions; rows that sum to zero are shown
    as zeros.

    Args:
        cm: Square confusion matrix (array-like), rows indexed by true label.
        classes: Tick labels, one per row/column.
        normalize: Whether to normalize each row to sum to 1.
        title: Plot title.
        cmap: Matplotlib colormap used for the heatmap.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Leave all-zero rows at 0 instead of producing NaN from 0/0.
        cm = np.divide(cm.astype("float"), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix no normalization")
    print(cm)

    # Draw the heatmap only after normalization so colors match the labels.
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max()/2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = "{:.2f}".format(cm[i, j]) if normalize else cm[i, j]
        # White text on dark cells, black text on light cells.
        plt.text(j, i, cell_text, horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel("True label")
    plt.xlabel("Predicted label")