In [None]:
#this makes a basic table of summary stats about the numerical features of a data set.
#column names and description:
#        "Feature": column name
#        "# NaN": number of nulls in the column
#        "% NaN": percent of the column that are null values
#        "% Zero": percent of the column that are zeroes
#        "Dtype": column's dtype
#        "# Count": number of non-null values in the column
#        "# Unique": number of unique values in the column
#        "# Dupes": number of distinct values that appear more than once in the column
#        "Min": lowest value in the column
#        "Max": highest value in the column    
#        "Spread": diff between min and max
#        "Mean": average of the column
#        "Median": median of the column
#        "Std Dev": standard deviation of the column
#        "Kurtosis (3)": kurtosis of the column (note: pandas reports excess kurtosis, so a normal distribution scores 0, not 3)
#        "Skew (0)": skewness of the column

import pandas as pd
# Notebook-wide display settings: never truncate rows or columns,
# and keep wide frames on one line instead of wrapping them.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

In [None]:
#lists the files in the data folder; from the printed names, copy only the region and income data sets when building the list below
import os
output_prefix = 'basic_stats_'  #prepended to the input file name when the stats CSV is written
folder_loc = 'F:/class/BANA 698/week 4/rawdata is 20 years/with qpsd, no 2024/'
#keep regular files only (sub-folders are skipped); entries are bare names, not full paths
file_list = list(filter(lambda name: os.path.isfile(os.path.join(folder_loc, name)), os.listdir(folder_loc)))
for file in file_list:
    print(file)

In [4]:
#the data set to summarize; `file` is also used to name the stats CSV written at the end
file = 'Group1Data_20years_lowermiddleincome.NoNaNColumns.csv'
df = pd.read_csv(f'{folder_loc}{file}')

In [5]:
sortstyle = '% NaN'  #which output column to sort the table by
show_table = False   #True -> also render the table in the notebook
rnd_lvl = 3          #number of decimals every statistic is rounded to

# Output columns, in order. Passing them to DataFrame() keeps the header (and the
# sort below) working even when the data set contains no numeric column at all.
stat_columns = [
    "Feature", "# NaN", "% NaN", "% Zero", "Dtype", "# Count", "# Unique", "# Dupes",
    "Min", "Max", "Spread", "Mean", "Median", "Std Dev", "Kurtosis (3)", "Skew (0)",
]


def _shape_stat(value, digits):
    """Round a kurtosis/skew result; return "." when it is NaN or not a plain number.

    pandas yields NaN (not a string) when a column has too few non-null values
    for the statistic to be defined; "." is the placeholder written to the CSV.
    """
    value = round(value, digits)
    if pd.isna(value) or not isinstance(value, (int, float)):
        return "."
    return value


def _numeric_column_stats(series, name, digits):
    """Build one row of the stats table for a single column.

    Parameters:
        series: the column as a pandas Series.
        name: the column label, stored under "Feature".
        digits: number of decimals to round each statistic to.

    Returns:
        A dict keyed by the entries of ``stat_columns``, or None when the column
        is not numeric. Boolean columns are skipped too: pandas classifies them
        as numeric, but max - min is not defined for NumPy booleans.
    """
    if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
        return None

    is_null = series.isna()
    occurrences = series.value_counts()  # NaN is not counted as a value here
    low = series.min()
    high = series.max()

    return {
        "Feature": name,
        "# NaN": is_null.sum(),
        "% NaN": round(is_null.mean() * 100, digits),
        # zeros as a share of ALL rows (nulls stay in the denominator)
        "% Zero": round((series == 0).mean() * 100, digits),
        "Dtype": str(series.dtype),
        "# Count": series.count(),
        "# Unique": series.nunique(),
        # number of distinct values that occur more than once
        "# Dupes": (occurrences > 1).sum(),
        "Min": round(low, digits),
        "Max": round(high, digits),
        # Subtract the raw extremes and round once: subtracting the already-rounded
        # values compounds the rounding error and leaks float noise into the CSV.
        "Spread": round(high - low, digits),
        "Mean": round(series.mean(), digits),
        "Median": round(series.median(), digits),
        "Std Dev": round(series.std(), digits),
        # NOTE: Series.kurtosis() uses Fisher's definition (normal == 0), so the "(3)"
        # in the label is not the reference value for these numbers; the label is
        # kept unchanged so the CSV header stays the same.
        "Kurtosis (3)": _shape_stat(series.kurtosis(), digits),
        "Skew (0)": _shape_stat(series.skew(), digits),
    }


# One row per numeric column; non-numeric columns are left out of the table.
results = [
    row
    for row in (_numeric_column_stats(df[column], column, rnd_lvl) for column in df)
    if row is not None
]

results_df = pd.DataFrame(results, columns=stat_columns)

# 'Feature' is the tie-breaker for every sort key except itself;
# 'Dtype' is sorted descending, everything else ascending.
if sortstyle == 'Feature':
    results_df = results_df.sort_values(by=sortstyle)
elif sortstyle == 'Dtype':
    results_df = results_df.sort_values(by=[sortstyle, 'Feature'], ascending=[False, True])
else:
    results_df = results_df.sort_values(by=[sortstyle, 'Feature'])

# Affects on-screen rendering only; the CSV gets the rounded values as stored.
pd.options.display.float_format = f'{{:.{rnd_lvl}f}}'.format

if show_table:
    display(results_df)

# Written next to the input file as <output_prefix><input file name>.
results_df.to_csv(folder_loc + output_prefix + file, index=False)