# Import Libraries

In [89]:
import numpy as np
import pandas as pd
import seaborn as sns

# Read Sample Data

In [90]:
# Load seaborn's 'iris' example dataset; it is the reference data for the benchmarks below.
df = sns.load_dataset('iris')

In [91]:
df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


# Benchmark

In [92]:
def summary_statistic(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a one-row-per-column summary of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The frame whose columns are to be summarised.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with the columns ``feature`` (the column
        name), ``type`` (dtype), ``count_unique``, ``count_nan`` and ``count``
        (non-null values), followed by the statistics produced by
        ``df.describe()`` (its own ``count`` is dropped because it would
        duplicate the one above). Columns that ``describe()`` does not cover
        get NaN in those statistic columns because of the left merge.
    """
    # rename_axis() names the index explicitly before reset_index(), so the
    # resulting column is always called 'feature'. Relying on the default
    # 'index' label breaks when df.columns carries its own name (e.g. after a
    # pivot): the later merge on 'feature' would then raise a KeyError.
    feature_info = pd.concat(
        [df.dtypes, df.nunique(), df.isna().sum(), df.count()],
        axis=1,
        keys=['type', 'count_unique', 'count_nan', 'count'],
    ).rename_axis('feature').reset_index()

    feature_describe = (
        df.describe().T
        .drop(columns='count')
        .rename_axis('feature')
        .reset_index()
    )

    # Left merge keeps every column of df, even those without describe() stats.
    return feature_info.merge(feature_describe, how='left', on='feature')

In [93]:
# Per-column summary of the reference data; the benchmark dicts below are derived from it.
df_s = summary_statistic(df)

In [94]:
# Benchmark for every non-object feature, shaped as {feature: {'mean': value}}.
num_benchmark = (
    df_s.loc[df_s['type'] != 'object', ['feature', 'mean']]
    .set_index('feature')
    .T
    .to_dict()
)

In [95]:
num_benchmark

{'sepal_length': {'mean': 5.843333333333335},
 'sepal_width': {'mean': 3.057333333333334},
 'petal_length': {'mean': 3.7580000000000027},
 'petal_width': {'mean': 1.199333333333334}}

In [96]:
# Benchmark for every object-typed feature, shaped as {feature: {'count_unique': n}}.
cat_benchmark = (
    df_s.loc[df_s['type'] == 'object', ['feature', 'count_unique']]
    .set_index('feature')
    .T
    .to_dict()
)

In [97]:
cat_benchmark

{'species': {'count_unique': 3}}

# Validate Class

In [110]:
class ValidateFeature:
    """
    Compare per-column statistics of a DataFrame against stored benchmarks.

    Non-object columns are checked on their mean, object columns on their
    number of unique values. A column passes when the observed statistic lies
    strictly inside ``benchmark ± tolerance`` where the tolerance is a
    percentage of the benchmark's magnitude. Results are printed, one line
    per column; nothing is returned or raised for a failed check.

    Parameters
    ----------
    product : pd.DataFrame
        The frame to validate.
    num_benchmark : dict, optional
        ``{column: {'mean': value}}``. Defaults to the iris means.
    cat_benchmark : dict, optional
        ``{column: {'count_unique': value}}``. Defaults to the iris species count.
    """

    # Allowed deviation from the benchmark, in percent of its magnitude.
    NUM_TOLERANCE_PCT = 25
    CAT_TOLERANCE_PCT = 5

    def __init__(self, product, num_benchmark=None, cat_benchmark=None):
        if num_benchmark is None:
            num_benchmark = {
                'sepal_length': {'mean': 5.843333333333335},
                'sepal_width': {'mean': 3.057333333333334},
                'petal_length': {'mean': 3.7580000000000027},
                'petal_width': {'mean': 1.199333333333334},
            }
        if cat_benchmark is None:
            cat_benchmark = {
                'species': {'count_unique': 3},
            }
        self.num_benchmark = num_benchmark
        self.cat_benchmark = cat_benchmark
        self.product = product

    @staticmethod
    def _within_tolerance(observed, expected, tolerance_pct):
        """Return True when ``observed`` lies strictly inside expected ± tolerance."""
        # abs() keeps the band well-formed for negative benchmarks; without it
        # the lower bound would exceed the upper bound and nothing could pass.
        margin = abs(expected) * tolerance_pct / 100
        return expected - margin < observed < expected + margin

    def validate_selective_feature(self, product, is_numerical):
        """
        Validate either the numerical or the categorical columns of ``product``.

        Parameters
        ----------
        product : pd.DataFrame
            The frame to validate.
        is_numerical : bool
            True checks non-object columns against ``num_benchmark`` (mean);
            False checks object columns against ``cat_benchmark`` (unique count).

        Prints ``passed!`` / ``failed!`` for each benchmarked column and a
        notice for columns that have no benchmark entry.
        """
        if is_numerical:
            selected = product.select_dtypes(exclude=['object'])
            benchmark = self.num_benchmark
            stat_name = 'mean'
            tolerance_pct = self.NUM_TOLERANCE_PCT
        else:
            selected = product.select_dtypes(include=['object'])
            benchmark = self.cat_benchmark
            stat_name = 'count_unique'
            tolerance_pct = self.CAT_TOLERANCE_PCT

        # Each column is judged independently: a failure must not stop the
        # remaining columns from being checked. Explicit comparisons are used
        # instead of `assert`, which is removed when Python runs with -O.
        for col in selected.columns:
            if col not in benchmark:
                print(f' "{col}" is new feture!')
                continue
            try:
                series = selected[col]
                observed = series.mean() if is_numerical else series.nunique()
                passed = bool(
                    self._within_tolerance(observed, benchmark[col][stat_name], tolerance_pct)
                )
            except (TypeError, ValueError):
                # The statistic could not be computed or compared for this
                # column (e.g. a dtype without a mean); report it as failed.
                passed = False
            print(f' "{col}" passed!' if passed else f' "{col}" failed!')

    def validate_feature(self):
        """Run the numerical and then the categorical validation on ``self.product``."""
        print('_'*25 + 'Numerical Variable'+ '_'*25 + '\n')
        self.validate_selective_feature(self.product, is_numerical=True)
        print('_'*25 + 'Categorical Variable'+ '_'*25 + '\n')
        self.validate_selective_feature(self.product, is_numerical=False)

In [111]:
# Validate the same frame the hard-coded benchmarks were computed from.
o = ValidateFeature(df)

In [112]:
# Prints one result line per column, numerical columns first, then categorical.
o.validate_feature()

_________________________Numerical Variable_________________________

 "sepal_length" passed!
 "sepal_width" passed!
 "petal_length" passed!
 "petal_width" passed!
_________________________Categorical Variable_________________________

 "species" passed!
