In [None]:
from surprise import SVD
import numpy as np
from surprise import Reader, accuracy
from surprise import Dataset as SurpDataset
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import ColumnsInfo, DataDuplicates, FeatureFeatureCorrelation,IsSingleValue
from deepchecks.utils.json_utils import from_json

In [None]:
# Load the movie log from CSV and preview its first rows.
movies_df = pd.read_csv("../data/movie_log_25k.csv")
movies_df.head()

In [None]:
# Record the frame's dimensions; no_rows is reused below as n_samples for the checks.
no_rows, no_columns = movies_df.shape
(no_rows, no_columns)

In [None]:
# Wrap the raw frame in a deepchecks Dataset, declaring the role of each column.
movies_dataset_dc = Dataset(
    movies_df,
    label="movie_rating",
    label_type="regression",
    cat_features=["movie_name", "movie_year"],
    datetime_name="timestamp",
    index_name="Unnamed: 0",
)

# 

### Column Info Test

In [None]:

# Run the ColumnsInfo check on the dataset; no conditions are attached to it.
column_info_check = ColumnsInfo()
column_info_check_result = column_info_check.run(dataset=movies_dataset_dc)
column_info_check_result.passed_conditions()

In [None]:
# Render the ColumnsInfo result without the widget front end.
column_info_check_result.show(as_widget=False)

### Data Duplicate Test

In [None]:
# Run the duplicate-rows check with n_samples set to the full row count.
data_duplicate_check = DataDuplicates(n_samples=no_rows)
data_duplicate_check.run(movies_dataset_dc)

### Feature Feature Correlation

In [None]:
# Create the feature-feature correlation check with n_samples set to the full row count.
ff_corr_check = FeatureFeatureCorrelation(n_samples=no_rows)


In [None]:
# Attach the built-in "max number of pairs above threshold" condition.
# 0.7 is passed as its first positional argument (presumably the correlation
# threshold -- confirm against the deepchecks API).
ff_corr_check.add_condition_max_number_of_pairs_above_threshold(0.7)


In [None]:
# Execute the correlation check (with its condition) on the dataset.
ff_corr_check_result = ff_corr_check.run(movies_dataset_dc)

In [None]:
# Render the feature-feature correlation result without the widget front end.
ff_corr_check_result.show(as_widget=False)

In [None]:
# Report the outcome of the condition attached to the correlation check.
ff_corr_check_result.passed_conditions()

In [None]:
# Since movie year and movie name are highly correlated, we can remove movie year.

### Is Column Single Valued?

In [None]:
# Run the single-value-column check and render its result.
isv_check = IsSingleValue(n_samples=no_rows)
isv_check.run(movies_dataset_dc).show(as_widget=False)

### Mixed Data Type

In [None]:
from deepchecks.tabular.checks import MixedDataTypes

In [None]:
# Run the mixed-data-types check and render its result.
mdt_check = MixedDataTypes(n_samples=no_rows)
mdt_check.run(movies_dataset_dc).show(as_widget=False)

### Mixed Nulls

In [None]:
from deepchecks.tabular.checks import MixedNulls

In [None]:
# Run the mixed-nulls check and render its result.
mn_check = MixedNulls(n_samples=no_rows)
mn_check.run(movies_dataset_dc).show(as_widget=False)

In [None]:
# We will now have to remove movie year

In [None]:
# Remove the columns that are not used from here on and preview the result.
# The frame is reassigned instead of mutated in place, and labels that are
# already gone are ignored, so re-running this cell does not raise KeyError.
movies_df = movies_df.drop(
    columns=["movie_year", "Unnamed: 0", "request", "event"],
    errors="ignore",
)
movies_df.head()

In [None]:
# Rebuild the deepchecks Dataset from the current movies_df; only movie_name
# is declared categorical here and no index column is named.
movies_dataset_dc = Dataset(
    movies_df,
    label="movie_rating",
    label_type="regression",
    cat_features=["movie_name"],
    datetime_name="timestamp",
)

### Percent of Nulls for Each Column

In [None]:
from deepchecks.tabular.checks.data_integrity import PercentOfNulls

In [None]:
# Run the percent-of-nulls check with n_samples set to the full row count.
pon_check = PercentOfNulls(n_samples=no_rows)
pon_check.run(movies_dataset_dc)

### String Mismatch

In [None]:
from deepchecks.tabular.checks import StringMismatch

In [None]:
# Run the string-mismatch check and render its result.
sm_check = StringMismatch(n_samples=no_rows)
sm_check.run(movies_dataset_dc).show(as_widget=False)

In [None]:
from deepchecks.tabular.suites import data_integrity

In [None]:
# Build the stock data-integrity suite and run it on the dataset.
di_suite = data_integrity()
di_suite_result = di_suite.run(movies_dataset_dc)

In [None]:
# Exploratory: inspect the type returned by get_not_passed_checks().
type(di_suite_result.get_not_passed_checks())

In [None]:
# Render the full data-integrity suite report without the widget front end.
di_suite_result.show(as_widget=False)

# Train Test Validation

In [None]:
# Split the deepchecks Dataset into train and test parts using the default arguments.
train_data, test_data = movies_dataset_dc.train_test_split()

In [None]:
# Exploratory: inspect the type of the object returned by train_test_split().
type(train_data)

### New Category

In [None]:
from deepchecks.tabular.checks import CategoryMismatchTrainTest

In [None]:
# Run the train/test category-mismatch check with the built-in
# new-category-ratio condition set to 0.05, then render the result.
cmtt_check = CategoryMismatchTrainTest(n_samples=no_rows)
cmtt_check.add_condition_new_category_ratio_less_or_equal(0.05)
cmtt_check_result = cmtt_check.run(train_data, test_data)
cmtt_check_result.show(as_widget=False)

In [None]:
# Report the outcome of the new-category-ratio condition.
cmtt_check_result.passed_conditions()

### Dataset Size Comparison

In [None]:
from deepchecks.tabular.checks import DatasetsSizeComparison

In [None]:
# Compare the train and test dataset sizes and render the result.
dsc_check = DatasetsSizeComparison()
dsc_check.run(train_data, test_data).show(as_widget=False)

### Date Train Test Leakage Overlap

In [None]:
from deepchecks.tabular.checks import DateTrainTestLeakageOverlap

In [None]:
# Run the date train/test leakage-overlap check and render the result.
dttlo_check = DateTrainTestLeakageOverlap(n_samples=no_rows)
dttlo_check.run(train_data, test_data).show(as_widget=False)

### String Mismatch Comparison

In [None]:
from  deepchecks.tabular.checks import StringMismatchComparison

In [None]:
# Run the train/test string-mismatch comparison and render the result.
smc_check = StringMismatchComparison(n_samples=no_rows)
smc_check.run(train_data, test_data).show(as_widget=False)

### Train Test Feature Drift

In [None]:
from  deepchecks.tabular.checks import TrainTestFeatureDrift

In [None]:
# Run the train/test feature-drift check, keep the result, and render it.
ttfd_check = TrainTestFeatureDrift(n_samples=no_rows)
ttfd_check_result = ttfd_check.run(train_data, test_data)
ttfd_check_result.show(as_widget=False)

In [None]:
# Display the reduced (summary) output of the feature-drift result.
ttfd_check_result.reduce_output()

### Train Test Sample Mix Check

In [None]:
from deepchecks.tabular.checks import TrainTestSamplesMix

In [None]:
# Run the train/test samples-mix check, keep the result, and render it.
ttsm_check = TrainTestSamplesMix(n_samples=no_rows)
ttsm_check_result = ttsm_check.run(train_data, test_data)
ttsm_check_result.show(as_widget=False)

# Model Eval

## Model Training

In [None]:
# Surprise reader configured with a rating scale of (1, 5).
reader = Reader(rating_scale=(1, 5))

In [None]:
# Convert the train and test frames into Surprise datasets, passing the
# user_id, movie_name and movie_rating columns in that order.
surp_train_data = SurpDataset.load_from_df(
    train_data.data[["user_id", "movie_name", "movie_rating"]],
    reader,
)
surp_test_data = SurpDataset.load_from_df(
    test_data.data[["user_id", "movie_name", "movie_rating"]],
    reader,
)

In [None]:
# Build Trainset objects for both splits. The test split goes through
# build_full_trainset() as well so that build_testset() can be called on it later.
surp_trainset = surp_train_data.build_full_trainset()
surp_testset = surp_test_data.build_full_trainset()

In [None]:
# Exploratory: display both Trainset objects.
surp_trainset,surp_testset

In [None]:
# Fit a biased SVD model with 100 factors and a fixed random_state on the training set.
svd = SVD(
    n_factors=100,
    biased=True,
    random_state=15,
    verbose=True,
)
svd.fit(surp_trainset)


In [None]:
# Turn both Trainset objects into testsets so they can be passed to svd.test().
surp_testset_train = surp_trainset.build_testset()
surp_testset_test = surp_testset.build_testset()

In [None]:
# Generate predictions for the train-derived and test-derived testsets.
surp_train_preds = svd.test(surp_testset_train)
surp_test_preds = svd.test(surp_testset_test)

In [None]:
# RMSE of the predictions on the training split.
accuracy.rmse(surp_train_preds)

In [None]:
# RMSE of the predictions on the test split.
accuracy.rmse(surp_test_preds)

In [None]:
# MSE of the predictions on the test split.
accuracy.mse(surp_test_preds)

### Model Inference Time

In [None]:
from deepchecks.tabular.checks import ModelInferenceTime

In [None]:
# Create the model-inference-time check with n_samples set to the full row count.
mit_check = ModelInferenceTime(n_samples=no_rows)


In [None]:
# Measure inference time of the fitted model on the test split.
# NOTE(review): `svd` is a Surprise model whose predict() takes (uid, iid);
# deepchecks model checks presumably expect a scikit-learn style predict(X).
# Confirm this call runs, or wrap the model in an adapter.
mit_check_result = mit_check.run(test_data, model=svd)

In [None]:
# Render the inference-time result without the widget front end.
mit_check_result.show(as_widget=False)

### Model Info

In [None]:
from deepchecks.tabular.checks import ModelInfo

In [None]:
# Run the model-info check directly on the fitted model.
mi_check = ModelInfo()
mi_check_result = mi_check.run(svd)

In [None]:
# Render the model-info result without the widget front end.
mi_check_result.show(as_widget=False)

### Regression Error Distribution & Systematic Error (may not be applicable to our case)

In [None]:
from deepchecks.tabular.checks import RegressionErrorDistribution

In [None]:
# Create the regression-error-distribution check with n_samples set to the full row count.
red_check = RegressionErrorDistribution(n_samples=no_rows)

In [None]:
# Error distribution of the model on the training split.
red_check_train_result = red_check.run(train_data, svd)
red_check_train_result.show(as_widget=False)

In [None]:
# Error distribution of the model on the test split.
red_check_test_result = red_check.run(test_data, svd)
red_check_test_result.show(as_widget=False)

### Regression Systematic Error

In [None]:
from deepchecks.tabular.checks import RegressionSystematicError

In [None]:
# Create the regression-systematic-error check with n_samples set to the full row count.
rse_check = RegressionSystematicError(n_samples=no_rows)

In [None]:
# Run the systematic-error check on each split with the fitted model.
rse_check_result_train = rse_check.run(train_data, svd)
rse_check_result_test = rse_check.run(test_data, svd)

In [None]:
# Render the systematic-error result for the training split.
rse_check_result_train.show(as_widget=False)

In [None]:
# Render the systematic-error result for the test split.
rse_check_result_test.show(as_widget=False)

### Simple Model Comparison

In [None]:
from deepchecks.tabular.checks import SimpleModelComparison

In [None]:
# Compare the fitted model against a simple tree baseline (max_depth=3) on
# three regression scorers, sampling up to 20000 rows.
# The scorers are passed through `scorers`, the same keyword this notebook
# uses for TrainTestPerformance; `alternative_scorers` is the deprecated
# spelling of that argument in deepchecks.
sdc_check = SimpleModelComparison(
    strategy="tree",
    scorers=["neg_rmse", "neg_mae", "r2"],
    n_samples=20000,
    max_depth=3,
)

In [None]:
# Run the simple-model comparison on the train/test pair with the fitted model.
sdc_check_result = sdc_check.run(train_data, test_data, svd)


In [None]:
# Render the simple-model comparison result without the widget front end.
sdc_check_result.show(as_widget=False)

### Single Dataset Performance

In [None]:
from deepchecks.tabular.checks import SingleDatasetPerformance

In [None]:
# Create the single-dataset performance check with n_samples set to the full row count.
sdp_check = SingleDatasetPerformance(n_samples=no_rows)


In [None]:
# Model performance on the training split.
sdp_check_result_train = sdp_check.run(train_data, svd)
sdp_check_result_train.show(as_widget=False)

In [None]:
# Model performance on the test split.
sdp_check_result_test = sdp_check.run(test_data, svd)
sdp_check_result_test.show(as_widget=False)

### Train Test Performance

In [None]:
from deepchecks.tabular.checks import TrainTestPerformance

In [None]:
# Create the train/test performance check with three regression scorers.
ttp_check = TrainTestPerformance(
    n_samples=no_rows,
    scorers=[
        "neg_root_mean_squared_error",
        "neg_mean_absolute_error",
        "r2",
    ],
)

In [None]:
# Run the train/test performance check with the fitted model and render the result.
ttp_check_result = ttp_check.run(train_data, test_data, svd)
ttp_check_result.show(as_widget=False)

### Train Test Prediction Drift

In [None]:
from deepchecks.tabular.checks import TrainTestPredictionDrift

In [None]:
# Create the train/test prediction-drift check with n_samples set to the full row count.
ttpd_check = TrainTestPredictionDrift(n_samples=no_rows)

In [None]:
# Run the prediction-drift check with the fitted model and render the result.
ttpd_check_result = ttpd_check.run(train_data, test_data, svd)
ttpd_check_result.show(as_widget=False)