# Statistics

## n-sigma outliers with log

In [3]:
import pandas as pd

def nsigma_outliers(df: pd.DataFrame, ls_columns: list, nsigma: float, output_file: str = "") -> pd.DataFrame:
    """
    Mask values lying nsigma or more standard deviations away from the column mean.

    Outliers are replaced with NaN in place (rows are NOT dropped), and the
    percentage of masked values per column can optionally be written to a CSV
    file.

    Parameters
    ----------
        df:
            pd.DataFrame to clean. It is modified in place and also returned.
        ls_columns:
            list of columns to be checked for outliers. An empty list, None,
            or a list whose first element is "CheckAllColumns" selects every
            column of df. A single column name given as a plain string is
            treated as a one-element list.
        nsigma:
            number of standard deviations from the mean allowed; values whose
            absolute z-score is >= nsigma are masked.
        output_file:
            a string containing the address to save the outlier stats file.
            If empty/None nothing will be written.

    Returns
    ---------
        pd.DataFrame:
            the same dataframe object, with outliers in the checked columns
            replaced by NaN.

    Notes
    -----
        Columns that cannot be processed (missing, non-numeric, ...) are
        skipped with a printed message. Columns whose standard deviation is
        zero or undefined (constant, single-row, or all-NaN columns) are left
        untouched and reported with 0.0 % outliers.
    """
    all_columns_flag = "CheckAllColumns"

    # A bare string would otherwise be iterated character by character.
    if isinstance(ls_columns, str):
        ls_columns = [ls_columns]
    # len() is used instead of truthiness so that a pandas Index is accepted too.
    if ls_columns is None or len(ls_columns) == 0 or ls_columns[0] == all_columns_flag:
        ls_columns = df.columns

    # Percentage of masked values per column, keyed by column label.
    outlier_percent = {}
    for column in ls_columns:
        try:
            values = df[column]
            spread = values.std()
            # With no measurable spread a z-score is meaningless: dividing by a
            # zero/NaN std would turn the whole column into NaN.
            if pd.isna(spread) or spread == 0:
                outlier_percent[column] = 0.0
                continue
            distance = ((values - values.mean()) / spread).abs()
            # Values that are already NaN have a NaN distance, so they stay NaN
            # and are not counted as outliers.
            df[column] = values.where(distance < nsigma)
            # Count with the exact complement of the filter so the log reports
            # what was actually masked.
            n_masked = int((distance >= nsigma).sum())
            outlier_percent[column] = round(n_masked / len(values) * 100, 1)
        except (KeyError, TypeError, ValueError):
            print("no outlier checked for: ", column)

    outlier_stat = pd.Series(outlier_percent, dtype=float)
    outlier_stat = outlier_stat.sort_values(ascending=False, na_position="last")
    if output_file:
        outlier_stat.to_csv(output_file)
    return df