# Evidently

In [57]:
import sys
sys.path.append('../')

from src.pipelines import main, predict, data
from config import config

import pandas as pd
import random
from sklearn.model_selection import train_test_split

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from evidently.metric_preset import ClassificationPreset
from evidently.metrics import ClassificationQualityMetric, TextDescriptorsDriftMetric, ColumnDriftMetric

## Prepare data

In [58]:
artifacts = main.load_artifacts()

In [46]:
reference = pd.read_csv('../data/labeled_projects.csv')
reference = data.preprocess(reference, lower=artifacts["args"].lower, stem=artifacts["args"].stem, min_freq=artifacts["args"].min_freq)
reference = reference[['text', 'tag']]

valid = pd.read_csv('../data/test_labeled_projects.csv')
valid = data.preprocess(valid, lower=artifacts["args"].lower, stem=artifacts["args"].stem, min_freq=artifacts["args"].min_freq)
valid = valid[['text', 'tag']]

reference.shape, valid.shape

((655, 2), (300, 2))

In [47]:
reference.head()

Unnamed: 0,text,tag
0,comparison yolo rcnn real world videos bringin...,computer-vision
1,show infer tell contextual inference creative ...,computer-vision
2,awesome graph classification collection import...,other
3,awesome monte carlo tree search curated list m...,other
4,diffusion vector reference implementation diff...,other


In [48]:
reference_predict = predict.predict(reference["text"], artifacts)
reference['predictions'] = reference_predict
reference['predictions'] = reference['predictions'].apply(lambda x: x['predicted_tag'])
reference.head()

Unnamed: 0,text,tag,predictions
0,comparison yolo rcnn real world videos bringin...,computer-vision,computer-vision
1,show infer tell contextual inference creative ...,computer-vision,computer-vision
2,awesome graph classification collection import...,other,other
3,awesome monte carlo tree search curated list m...,other,other
4,diffusion vector reference implementation diff...,other,other


In [49]:
valid_predict = predict.predict(valid["text"], artifacts)
valid['predictions'] = valid_predict
valid['predictions'] = valid['predictions'].apply(lambda x: x['predicted_tag'])
valid.head()

Unnamed: 0,text,tag,predictions
0,easy ocr ready use ocr 40 languages supported ...,computer-vision,other
1,linear attention transformer fully featured tr...,natural-language-processing,natural-language-processing
2,emotion recognition tom jerry videos developed...,computer-vision,computer-vision
3,imagenette imagenette subset 10 easily classif...,computer-vision,computer-vision
4,textaugment improving short text classificatio...,natural-language-processing,natural-language-processing


## Classification model performance

In [50]:
# set up column mapping
column_mapping = ColumnMapping()

column_mapping.target = 'tag'
column_mapping.prediction = 'predictions'
column_mapping.text_features = ['text']

# list features so text field is not treated as a regular feature
column_mapping.numerical_features = []
column_mapping.categorical_features = []

In [51]:
performance_report = Report(metrics=[
    ClassificationQualityMetric()
])

performance_report.run(reference_data=reference, current_data=valid,
                        column_mapping=column_mapping)
performance_report

## Data drift

In [52]:
import nltk
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package words to
[nltk_data]     /Users/macpro2014/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/macpro2014/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/macpro2014/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [53]:
data_drift_report = Report(
    metrics=[
        ColumnDriftMetric('tag'),
        ColumnDriftMetric('predictions'),
        TextDescriptorsDriftMetric(column_name='text'),
    ]
)

data_drift_report.run(reference_data=reference, 
                      current_data=valid, 
                      column_mapping=column_mapping)
data_drift_report