In [1]:
# Show the kernel's current working directory before the chdir in the next cell.
%pwd

'd:\\PycharmProjects\\ml-app\\research'

In [2]:
import os

# Step from the notebook folder up to its parent (presumably the project
# root, so that project-relative paths resolve -- confirm against settings).
# The guard keeps this cell idempotent: an unconditional chdir("..") climbs
# one directory higher every time the cell is re-run. "research" is the
# notebook directory name reported by the %pwd cell above.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [3]:
from mlengine.config.settings import settings
from mlengine.common.logger import logger

[32m2023-11-05 02:16:55[0m [35mDESKTOP-PEC6JQ0[0m [34m11_05_2023_02_16_55.log[24720][0m [1;30mINFO[0m [32mLogging initialized.[0m
[32m2023-11-05 02:16:55[0m [35mDESKTOP-PEC6JQ0[0m [34m11_05_2023_02_16_55.log[24720][0m [1;30mINFO[0m [32mSettings file [d:\PycharmProjects\ml-app\src\mlengine\config\settings.yaml] loaded successfully.[0m


In [4]:
import pandas as pd
import pydantic

In [5]:
# Alias the data-validation section of the loaded project settings.
data_validation_settings = settings.data_validation

In [6]:
# Read the comma-separated file named by the data-validation settings.
df = pd.read_csv(
    data_validation_settings.data_file,
    sep=',',
)

In [7]:
class DataValidator(pydantic.BaseModel):
    """Schema for a single row of the dataset loaded into ``df``.

    Every field uses a pydantic strict type, so a value is accepted only when
    it already has the declared Python type; no coercion (for example from
    the string "72" to the integer 72) is attempted.
    """

    # Descriptive columns, required to be strings.
    gender: pydantic.StrictStr
    race_ethnicity: pydantic.StrictStr
    parental_level_of_education: pydantic.StrictStr
    lunch: pydantic.StrictStr
    test_preparation_course: pydantic.StrictStr
    # Score columns, required to be integers.
    math_score: pydantic.StrictInt
    reading_score: pydantic.StrictInt
    writing_score: pydantic.StrictInt

In [8]:
# Build one DataValidator per row; construction raises on the first row that
# does not satisfy the schema. Kept as a comprehension so no loop variables
# leak into the notebook namespace.
data_list = [
    DataValidator(**record)
    for _row_label, record in df.iterrows()
]

In [9]:
# Count missing values in each column.
df.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [10]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [11]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [12]:
# Number of distinct values per column. Left as the cell's last expression
# (instead of being wrapped in print) so it is rendered as the cell output,
# consistent with the other inspection cells in this notebook.
df.nunique()

gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64


In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for the score columns.
df.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [14]:
# Add per-student aggregates: the sum of the three subject scores and the
# mean of those three scores.
df['total score'] = df['math_score'].add(df['reading_score']).add(df['writing_score'])
df['average'] = df['total score'].div(3)
# Preview the first rows, now including the two derived columns.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [15]:
import plotly.express as px
from collections import Counter
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [16]:
def group_below_top_n(count_df, n, text):
    """Keep the ``n`` largest rows of ``count_df`` and fold the rest into 'Others'.

    Args:
        count_df: DataFrame of counts, indexed by category label.
        n: Number of top categories to keep as individual rows.
        text: Name of the column in ``count_df`` that holds the counts.

    Returns:
        A DataFrame with the top ``n`` rows (sorted descending by ``text``)
        followed by one row labelled 'Others' whose ``text`` value is the
        scalar sum of all remaining rows.
    """
    ranked = count_df.sort_values(by=text, ascending=False)
    top_n = ranked.iloc[:n]
    # Select the count column before summing. Summing the whole frame returns
    # a Series (one entry per column), which left a Series object inside the
    # 'Others' cell instead of a scalar count.
    others_sum = ranked.iloc[n:][text].sum()
    others = pd.DataFrame({text: [others_sum]}, index=['Others'])
    return pd.concat([top_n, others], ignore_index=False)

In [17]:
def create_pie_plot(column_data, others=5):
    """Build a pie chart of how often each distinct value occurs.

    Args:
        column_data: Iterable of hashable values to tally.
        others: Maximum number of distinct values shown as their own slice.
            When there are more distinct values than this, the tally is
            passed through ``group_below_top_n`` to collapse the tail.

    Returns:
        The plotly express pie figure, with each slice annotated with its
        percentage and label.
    """
    count_label = 'Occurances'
    tallies = Counter(column_data)
    counts = pd.DataFrame.from_dict(tallies, orient='index', columns=[count_label])

    # Only collapse the tail when there are more categories than allowed.
    slices = group_below_top_n(counts, others, count_label) if len(counts) > others else counts

    pie = px.pie(
        slices,
        values=count_label,
        names=list(slices.index),
        hover_data=[count_label],
        labels={count_label: count_label},
    )
    pie.update_traces(textinfo='percent+label')
    return pie


In [18]:
# Pie chart of how often each race_ethnicity value occurs.
create_pie_plot(df['race_ethnicity'])

In [19]:
def create_histogram_plot(column_data):
    """Build a single-panel figure holding a histogram of ``column_data``.

    Args:
        column_data: Values to bin. Must expose a ``name`` attribute (for
            example a pandas Series); it is used as the trace name.

    Returns:
        The figure created by ``make_subplots`` with one histogram trace.
    """
    histogram = go.Histogram(x=column_data)

    figure = make_subplots(rows=1, cols=1)
    figure.add_trace(histogram, row=1, col=1)
    # Name the trace after the source column.
    figure.update_traces(name=column_data.name)
    return figure


In [20]:
def is_numeric(df):
    """Report whether pandas data has a numeric dtype.

    Args:
        df: A pandas DataFrame or Series.

    Returns:
        For a DataFrame, the result of applying
        ``pd.api.types.is_numeric_dtype`` to each column (one boolean per
        column); for a Series, a single boolean; ``None`` for any other
        input type.
    """
    numeric_check = pd.api.types.is_numeric_dtype

    if isinstance(df, pd.DataFrame):
        # Evaluate the check column by column.
        return df.apply(numeric_check)
    if isinstance(df, pd.Series):
        return numeric_check(df)
    # Neither a DataFrame nor a Series: no verdict is given.
    return None

In [21]:
def feature_desc_hist_array(df, opt_cat='pie', num_rows=5):
    """Draw one descriptive plot per column of ``df`` on a shared subplot grid.

    Each column is charted as follows:
      * non-numeric with every value distinct -> histogram
      * numeric -> histogram
      * any other column -> pie chart when ``opt_cat == 'pie'``, otherwise
        a histogram

    Args:
        df: DataFrame whose columns are plotted.
        opt_cat: ``'pie'`` draws the remaining (categorical) columns as pie
            charts; any other value draws them as histograms.
        num_rows: Number of grid rows. Defaults to 5, the value that used to
            be hard-coded. Must be a positive integer; the number of grid
            columns is derived from it.

    Returns:
        None. The combined figure is displayed via ``show()``.
    """
    plots_array = []
    titles_array = []

    for column in df.columns:
        series = df[column]
        numeric = is_numeric(series)

        if not numeric and series.nunique() == series.size:
            # Non-numeric column in which every value is distinct.
            plot = create_histogram_plot(series)
            titles_array.append(f'{series.size} unique values.')
        elif numeric:
            plot = create_histogram_plot(series)
            titles_array.append(f'Histogram of {column}.')
        elif opt_cat == 'pie':
            plot = create_pie_plot(series)
            titles_array.append(f'Pie chart of {column}.')
        else:
            # The title must describe the chart actually drawn; this branch
            # used to be labelled "Pie chart of ..." despite being a histogram.
            plot = create_histogram_plot(series)
            titles_array.append(f'Histogram of {column}.')

        plots_array.append(plot)

    # Ceiling division: just enough grid columns to fit every plot in num_rows rows.
    num_plots = len(plots_array)
    num_cols = -(-num_plots // num_rows)

    # Pie traces need a 'pie' (domain) subplot; everything else is cartesian.
    subplot_types = [
        'pie' if plot.data and isinstance(plot.data[0], go.Pie) else 'xy'
        for plot in plots_array
    ]

    def _spec_for(cell_index):
        # Grid cells beyond the last plot stay as empty cartesian subplots.
        if cell_index < len(subplot_types):
            return {"type": subplot_types[cell_index]}
        return {"type": "xy"}

    combined_fig = make_subplots(
        rows=num_rows,
        cols=num_cols,
        specs=[
            [_spec_for(row * num_cols + col) for col in range(num_cols)]
            for row in range(num_rows)
        ],
        subplot_titles=titles_array,
    )

    # Fill the grid row by row, matching the order of specs and titles.
    for position, plot in enumerate(plots_array):
        row_index, col_index = divmod(position, num_cols)
        for trace in plot.data:
            combined_fig.add_trace(trace, row=row_index + 1, col=col_index + 1)

    # Apply the layout taken from the project settings, then display.
    combined_fig.update_layout(dict(settings.plot_layouts.features_plots_layout))
    combined_fig.show()

In [23]:
# Render the per-column plot grid for the whole frame (default opt_cat='pie').
feature_desc_hist_array(df)