In [1]:
from validation.unit import data_assumptions, model_metrics

from validation.integration import (
   find_outliers, find_duplicates, find_balance, find_correlation
   ,validate_dataset, test_representative 
)


In [2]:
import pandas as pd
import numpy as  np

import warnings
warnings.filterwarnings('ignore')

# Demonstrate Functions 

## Integration Testing

### Collect Sample Data

In [3]:
test_1 = pd.read_csv("test_repos/linear-regression-example/data/Advertising_data.csv")

In [4]:
test_1.head(2)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,2210
1,44.5,39.3,45.1,1040


In [5]:
data = {'col1': ['a', 'b', 'c', 'd', 'e', 'a','a','a','f'],
        'col2': [1, 2, 3, 4, 5, 1, 1, 1, 2]}
test_2 = pd.DataFrame(data)

In [6]:
test_2.head(2)

Unnamed: 0,col1,col2
0,a,1
1,b,2


### Integration Functions

    Checking for outliers in a categorical column

In [7]:
find_outliers(test_2, categorical_threshold=0.15)

Unnamed: 0,Column,Value,Index,Issue
0,col1,b,1,Outlier
1,col1,c,2,Outlier
2,col1,d,3,Outlier
3,col1,e,4,Outlier
4,col1,f,8,Outlier


    Checking for outliers in numerical data

In [8]:
test_3 = pd.concat([test_1, pd.DataFrame([[2000, 1.2, 0, 50]],columns=["TV","radio","newspaper","sales"])])

In [9]:
find_outliers(test_3)

Unnamed: 0,Column,Value,Index,Issue
0,TV,2000.0,0,Outlier


    Checking for duplicates

In [10]:
find_duplicates(test_2)

Unnamed: 0,Column,Value,Index,Issue
0,['col1' 'col2'],,5,Duplicates
1,['col1' 'col2'],,6,Duplicates
2,['col1' 'col2'],,7,Duplicates


    Check for imbalance

In [11]:
find_balance(test_2, label_field="col2")

Unnamed: 0,col2,label_count,label_percent,imbalance_level
0,1,4,44.444444,
1,2,2,22.222222,Mild
2,3,1,11.111111,Moderate
3,4,1,11.111111,Moderate
4,5,1,11.111111,Moderate


    find_correlation

In [12]:
find_correlation(test_1, corr_method="pearson", threshold = 0.75)

Unnamed: 0,Attribute_1,Attribute_2,Absolute_Correlation
3,TV,sales,0.782224
12,sales,TV,0.782224


    Check if representative

In [13]:
test_representative(test_1, test_1.sample(50))

Unnamed: 0,Feature,P_Value,Issue
0,TV,0.006481,Not Representative of Dataset
0,radio,0.0,Not Representative of Dataset
0,newspaper,0.0,Not Representative of Dataset
0,sales,1.481648,Representative


    Wrapper

In [14]:
validate_dataset(test_2, categorical_threshold=0.15, label_field="col2", corr_method="pearson"
                 , threshold = 0.75, data_type="df")

{'outliers':   Column Value Index    Issue
 0   col1     b     1  Outlier
 1   col1     c     2  Outlier
 2   col1     d     3  Outlier
 3   col1     e     4  Outlier
 4   col1     f     8  Outlier,
 'duplicates':             Column Value  Index       Issue
 0  ['col1' 'col2']   N/A      5  Duplicates
 1  ['col1' 'col2']   N/A      6  Duplicates
 2  ['col1' 'col2']   N/A      7  Duplicates,
 'balance':    col2  label_count  label_percent imbalance_level
 0     1            4      44.444444            None
 1     2            2      22.222222            Mild
 2     3            1      11.111111        Moderate
 3     4            1      11.111111        Moderate
 4     5            1      11.111111        Moderate,
 'correlation': Empty DataFrame
 Columns: [Attribute_1, Attribute_2, Absolute_Correlation]
 Index: []}

## Unit Testing

### Data Assumptions

In [15]:
data_assumptions(test_1)

There are no null values in the data.
There are no data type mismatches


Unnamed: 0,index,null_count,Data_Types
0,TV,0.0,0
1,radio,0.0,0
2,newspaper,0.0,0
3,sales,0.0,0


    Add some nulls and data mismatches to our sample data

In [16]:
test_4 = pd.concat([test_1, pd.DataFrame([[None, "37.2", 53, None]],columns=["TV","radio","newspaper","sales"])])

In [17]:
data_assumptions(test_4)

Unnamed: 0,index,null_count,Data_Types
0,TV,1.0,"[<class 'float'>, <class 'NoneType'>]"
1,radio,0.0,"[<class 'float'>, <class 'str'>]"
2,newspaper,0.0,0
3,sales,1.0,"[<class 'int'>, <class 'NoneType'>]"


### Model Accuracy

In [18]:
model_metrics([1, 0, 1, 1, 1, 0, 0, 0, 1], [1, 1, 1, 0, 1, 0, 1, 0, 1], threshold=[.90,.80,.80,.80], regression=False)

Testing for Binary Classification Metrics


Unnamed: 0,Metric,Value,Threshold,Passed
0,Accuracy,0.666667,0.9,False
1,Precision,0.666667,0.8,False
2,Recall,0.8,0.8,True
3,F1 Score,0.727273,0.8,False


In [19]:
model_metrics([5, 4, 3, 2, 2, 3, 4, 5, 6], [4.2, 4, 3.4, 1, 0, 4.5, 4.2, 5.1, 6.5], threshold=[1,1,1,30], regression=True)

Testing for Regression Metrics


Unnamed: 0,Metric,Value,Threshold,Passed
0,Mean Squared Error,0.927778,1,True
1,Root Mean Squared Error,0.963212,1,True
2,Mean Absolute Error,0.722222,1,True
3,Mean Absolute Percentage Error,27.185185,30,True
