In [1]:
import datetime
import time
import random
import logging 
import pandas as pd
import psycopg
import joblib

from prefect import task, flow

from evidently.report import Report
from evidently import ColumnMapping
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

In [36]:
def convert_to_category(df, cat_features):
    """Return a copy of ``df`` with the given columns cast to ``category``.

    The input frame is not modified, so the function is safe to call on a
    filtered slice of another frame and safe to re-run in a notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert.
    cat_features : iterable of str
        Names of the columns to cast to the ``category`` dtype.

    Returns
    -------
    pandas.DataFrame
        New frame; the listed columns are categorical, all other columns
        keep their dtype.

    Raises
    ------
    KeyError
        If a name in ``cat_features`` is not a column of ``df``.
    """
    # astype with a column -> dtype mapping builds a new frame instead of
    # assigning column by column into the caller's object.
    return df.astype({col: 'category' for col in cat_features})

In [45]:
# Load the raw data set and keep only the columns used by the model/report.
raw_data = pd.read_csv('../data/YouTubeDataset_withChannelElapsed.csv')
features = [
    'index', 'channelId', 'videoCategoryId', 'channelViewCount',
    'videoCount', 'subscriberCount', 'videoId', 'channelelapsedtime',
    'channelCommentCount', 'videoViewCount', 'elapsedtime',
    'videoDislikeCount', 'videoPublished', 'VideoCommentCount', 'videoLikeCount',
]
cat_features = ['channelId', 'videoId', 'videoPublished']

raw_data = raw_data[features]

# Time windows, compared as plain strings against 'videoPublished'.
# Reference set: restricted to one week of the training data, because
# comparing the whole training set against one week of new data makes the
# report take a very long time to run.
REFERENCE_START = '2015-09-21T17:54:24.000Z'
REFERENCE_END = '2015-09-28T17:54:24.000Z'
# Current set: no genuinely new data is available, so a later week of the
# same data set is treated as "new" data.
# NOTE(review): rows published between REFERENCE_END and CURRENT_START fall
# into neither window - confirm the one-day gap is intended.
CURRENT_START = '2015-09-29T17:54:24.000Z'
CURRENT_END = '2015-10-05T17:54:24.000Z'

# Filter on a separate object-typed view of the timestamps so that raw_data
# itself is never rewritten by this cell.
published = raw_data['videoPublished'].astype(object)

# between() is inclusive on both ends, i.e. (>= start) & (<= end).
# .copy() makes the window an independent frame before columns are converted.
trained_data = raw_data[published.between(REFERENCE_START, REFERENCE_END)].copy()
trained_data = convert_to_category(trained_data, cat_features)

# NOTE: joblib.load unpickles the file - only load model files from a
# trusted source.
with open('lightgbm_reg.bin', 'rb') as f_in:
    model = joblib.load(f_in)

# The label column is removed from the current window; it is filled with the
# model's predictions in a later cell. drop() without inplace returns a new
# frame instead of modifying a filtered slice.
new_data = (
    raw_data[published.between(CURRENT_START, CURRENT_END)]
    .drop(columns=['videoLikeCount'])
)
new_data = convert_to_category(new_data, cat_features)


In [47]:
# Score the current window with the loaded model and store the predictions
# as 'videoLikeCount'. The column is dropped first if it already exists, so
# re-running this cell does not feed earlier predictions back into the model.
model_inputs = new_data.drop(columns=['videoLikeCount'], errors='ignore')
# assign() returns a new frame rather than writing into a filtered slice.
new_data = new_data.assign(videoLikeCount=model.predict(model_inputs))

In [48]:
# Numerical features = every entry of `features` that is not categorical.
# Built in the order of `features` (duplicates removed) so the list is the
# same on every kernel run; iterating a set of strings is not.
categorical_names = set(cat_features)
num_features = [
    name for name in dict.fromkeys(features) if name not in categorical_names
]
# NOTE(review): num_features still contains 'videoLikeCount' (also declared
# as the prediction column below) and 'index' - confirm both are intended.
column_mapping = ColumnMapping(
    target=None,
    prediction='videoLikeCount',
    numerical_features=num_features,
    categorical_features=cat_features,
)


In [49]:
# Metrics included in the report: drift of the 'videoLikeCount' column,
# drift over the whole data set, and missing values in the data set.
drift_metrics = [
    ColumnDriftMetric(column_name='videoLikeCount'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
]
report = Report(metrics=drift_metrics)

In [50]:
# Compute the report: trained_data is the reference set, new_data the
# current set, with column roles taken from column_mapping.
report.run(
    reference_data=trained_data,
    current_data=new_data,
    column_mapping=column_mapping,
)

  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp


In [51]:
# Show the computed metric results as a plain dictionary (cell output below).
report.as_dict()

{'metrics': [{'metric': 'ColumnDriftMetric',
   'result': {'column_name': 'videoLikeCount',
    'column_type': 'num',
    'stattest_name': 'K-S p_value',
    'stattest_threshold': 0.05,
    'drift_score': 0.1120879120879121,
    'drift_detected': False,
    'current': {'small_distribution': {'x': [28.30539200567202,
       37.60010528641206,
       46.89481856715209,
       56.18953184789213,
       65.48424512863217,
       74.77895840937221,
       84.07367169011223,
       93.36838497085228,
       102.66309825159232,
       111.95781153233236,
       121.2525248130724],
      'y': [0.05379402084796641,
       0.0,
       0.0,
       0.0,
       0.0,
       0.0,
       0.026897010423983205,
       0.0,
       0.0,
       0.026897010423983205]}},
    'reference': {'small_distribution': {'x': [8.0,
       446.6,
       885.2,
       1323.8000000000002,
       1762.4,
       2201.0,
       2639.6000000000004,
       3078.2000000000003,
       3516.8,
       3955.4,
       4394.0],
    