In [1]:
from surprise import SVD
import numpy as np
from surprise import Reader, accuracy
from surprise import Dataset as SurpDataset
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import ColumnsInfo, DataDuplicates, FeatureFeatureCorrelation,IsSingleValue
from deepchecks.utils.json_utils import from_json

In [2]:
movies_df=pd.read_csv("../data/movie_log_25k.csv")
movies_df.head()

Unnamed: 0.1,Unnamed: 0,event,timestamp,user_id,request,movie_name,movie_rating,movie_year
0,0,"2022-09-25T16:09:49,51852,GET /rate/the+usual+...",2022-09-25T16:09:49,51852,GET /rate/the+usual+suspects+1995=5,the+usual+suspects+1995,5,1995.0
1,1,"2022-09-25T16:09:55,146497,GET /rate/north+sea...",2022-09-25T16:09:55,146497,GET /rate/north+sea+texas+2011=4,north+sea+texas+2011,4,2011.0
2,2,"2022-09-25T16:09:57,359974,GET /rate/schindler...",2022-09-25T16:09:57,359974,GET /rate/schindlers+list+1993=4,schindlers+list+1993,4,1993.0
3,3,"2022-09-25T16:10,208901,GET /rate/the+shawshan...",2022-09-25T16:10,208901,GET /rate/the+shawshank+redemption+1994=5,the+shawshank+redemption+1994,5,1994.0
4,4,"2022-09-25T16:10:03,281786,GET /rate/my+neighb...",2022-09-25T16:10:03,281786,GET /rate/my+neighbor+totoro+1988=3,my+neighbor+totoro+1988,3,1988.0


In [3]:
no_rows,no_columns=movies_df.shape
no_rows,no_columns

(294086, 8)

In [4]:
# Wrap the raw log in a deepchecks Dataset: the rating is the regression label,
# movie name/year are declared categorical, "Unnamed: 0" is the row index and
# "timestamp" is the datetime column.
movies_dataset_dc = Dataset(
    movies_df,
    label="movie_rating",
    label_type="regression",
    cat_features=["movie_name", "movie_year"],
    index_name="Unnamed: 0",
    datetime_name="timestamp",
)

In [5]:
type(movies_dataset_dc)

deepchecks.tabular.dataset.Dataset

# 

### Column Info Test

In [6]:

column_info_check=ColumnsInfo()
# column_info_check.add_condition("num_columns_check",lambda x:len(x.items()))
column_info_check_result=column_info_check.run(dataset=movies_dataset_dc)
column_info_check_result.passed_conditions()

True

In [8]:
column_info_check_result.show()

VBox(children=(HTML(value='<h4><b>Columns Info</b></h4>'), HTML(value='<p>Return the role and logical type of …

### Data Duplicate Test

In [9]:
data_duplicate_check=DataDuplicates(n_samples=no_rows)
data_duplicate_check.run(movies_dataset_dc)

VBox(children=(HTML(value='<h4><b>Data Duplicates</b></h4>'), HTML(value='<p>Checks for duplicate samples in t…

### Feature Feature Correlation

In [10]:
ff_corr_check=FeatureFeatureCorrelation(n_samples=no_rows)


In [11]:
ff_corr_check.add_condition_max_number_of_pairs_above_threshold(0.7)


FeatureFeatureCorrelation(n_samples=294086)
	Conditions:
		0: Not more than 0 pairs are correlated above 0.7

In [12]:
ff_corr_check_result=ff_corr_check.run(movies_dataset_dc)

In [13]:
ff_corr_check_result

VBox(children=(HTML(value='<h4><b>Feature-Feature Correlation</b></h4>'), HTML(value='<p>    Checks for pairwi…

In [14]:
ff_corr_check_result.passed_conditions()

False

In [15]:
# Since movie year and movie name are highly correlated, we can remove movie year

### Is Column Single Valued?

In [16]:
isv_check=IsSingleValue(n_samples=no_rows)
isv_check.run(movies_dataset_dc)

VBox(children=(HTML(value='<h4><b>Single Value in Column</b></h4>'), HTML(value='<p>Check if there are columns…

### Mixed Data Type

In [17]:
from deepchecks.tabular.checks import MixedDataTypes

In [18]:
mdt_check=MixedDataTypes(n_samples=no_rows)
mdt_check.run(movies_dataset_dc)

VBox(children=(HTML(value='<h4><b>Mixed Data Types</b></h4>'), HTML(value='<p>Detect columns which contain a m…

### Mixed Nulls

In [19]:
from deepchecks.tabular.checks import MixedNulls

In [20]:
mn_check=MixedNulls(n_samples=no_rows)
mn_check.run(movies_dataset_dc)

VBox(children=(HTML(value='<h4><b>Mixed Nulls</b></h4>'), HTML(value='<p>Search for various types of null valu…

In [21]:
# We will now have to remove movie year

In [22]:
# Drop the columns that are not needed downstream: movie_year (flagged as highly
# correlated with movie_name), the "Unnamed: 0" helper column, and the raw
# request/event strings.
# A new frame is bound instead of mutating in place, and errors="ignore" lets the
# cell be re-run after the columns are already gone without raising KeyError.
movies_df = movies_df.drop(
    columns=["movie_year", "Unnamed: 0", "request", "event"],
    errors="ignore",
)
movies_df.head()

Unnamed: 0,timestamp,user_id,movie_name,movie_rating
0,2022-09-25T16:09:49,51852,the+usual+suspects+1995,5
1,2022-09-25T16:09:55,146497,north+sea+texas+2011,4
2,2022-09-25T16:09:57,359974,schindlers+list+1993,4
3,2022-09-25T16:10,208901,the+shawshank+redemption+1994,5
4,2022-09-25T16:10:03,281786,my+neighbor+totoro+1988,3


In [23]:
# Rebuild the deepchecks Dataset from the reduced frame (movie_year is gone, so
# movie_name is the only declared categorical feature).
movies_dataset_dc = Dataset(
    movies_df,
    cat_features=["movie_name"],
    datetime_name="timestamp",
    label="movie_rating",
    label_type="regression",
)

### Percent of Nulls for each column

In [24]:
from deepchecks.tabular.checks.data_integrity import PercentOfNulls

In [25]:
pon_check=PercentOfNulls(n_samples=no_rows)
pon_check.run(movies_dataset_dc)

VBox(children=(HTML(value='<h4><b>PercentOfNulls</b></h4>'), HTML(value='<p>Percent of \'Null\' values in each…

### String Mismatch

In [26]:
from deepchecks.tabular.checks import StringMismatch

In [27]:
sm_check=StringMismatch(n_samples=no_rows)
sm_check.run(movies_dataset_dc)

VBox(children=(HTML(value='<h4><b>String Mismatch</b></h4>'), HTML(value='<p>Detect different variants of stri…

In [28]:
from deepchecks.tabular.suites import data_integrity

In [29]:
di_suite=data_integrity()
di_suite_result=di_suite.run(movies_dataset_dc)

In [30]:
type(di_suite_result.get_not_passed_checks())

list

# Train Test Validation

In [31]:
# Split the deepchecks Dataset into train/test using the library defaults
# (no arguments are passed, so split size and seed are whatever deepchecks uses).
# NOTE(review): if the default split shuffles rows, train and test will overlap
# in time for this timestamped log — confirm a chronological split is not needed.
train_data,test_data=movies_dataset_dc.train_test_split()

In [32]:
type(train_data)

deepchecks.tabular.dataset.Dataset

### New Category

In [33]:
from deepchecks.tabular.checks import CategoryMismatchTrainTest

In [34]:
cmtt_check=CategoryMismatchTrainTest(n_samples=no_rows)
cmtt_check.add_condition_new_category_ratio_less_or_equal(0.05)
cmtt_check_result=cmtt_check.run(train_data,test_data)
cmtt_check_result

VBox(children=(HTML(value='<h4><b>Category Mismatch Train Test</b></h4>'), HTML(value='<p>Find new categories …

In [35]:
cmtt_check_result.passed_conditions()

True

### Dataset Size Comparison

In [36]:
from deepchecks.tabular.checks import DatasetsSizeComparison

In [37]:
dsc_check=DatasetsSizeComparison()
dsc_check.run(train_data,test_data)

VBox(children=(HTML(value='<h4><b>Datasets Size Comparison</b></h4>'), HTML(value='<p>Verify test dataset size…

### Date Train Test Leakage Overlap

In [38]:
from deepchecks.tabular.checks import DateTrainTestLeakageOverlap

In [39]:
dttlo_check=DateTrainTestLeakageOverlap(n_samples=no_rows)
dttlo_check.run(train_data,test_data)

VBox(children=(HTML(value='<h4><b>Date Train-Test Leakage (overlap)</b></h4>'), HTML(value='<p>Check test data…

### String Mismatch Comparison

In [40]:
from  deepchecks.tabular.checks import StringMismatchComparison

In [41]:
smc_check=StringMismatchComparison(n_samples=no_rows)
smc_check.run(train_data,test_data)

VBox(children=(HTML(value='<h4><b>String Mismatch Comparison</b></h4>'), HTML(value='<p>Detect different varia…

### Train Test Feature Drift

In [42]:
from  deepchecks.tabular.checks import TrainTestFeatureDrift

In [43]:
ttfd_check=TrainTestFeatureDrift(n_samples=no_rows)
ttfd_check_result=ttfd_check.run(train_data,test_data)
ttfd_check_result

VBox(children=(HTML(value='<h4><b>Train Test Feature Drift</b></h4>'), HTML(value='<p>    Calculate drift betw…

In [44]:
ttfd_check_result.reduce_output()



{'Mean Drift Score': 0.002161994647539447}

### Train Test Sample Mix Check

In [45]:
from deepchecks.tabular.checks import TrainTestSamplesMix

In [46]:
ttsm_check=TrainTestSamplesMix(n_samples=no_rows)
ttsm_check_result=ttsm_check.run(train_data,test_data)
ttsm_check_result

VBox(children=(HTML(value='<h4><b>Train Test Samples Mix</b></h4>'), HTML(value='<p>Detect samples in the test…

# Model Eval

## Model Training

In [47]:
reader = Reader(rating_scale=(1,5))

In [48]:
surp_train_data = SurpDataset.load_from_df(train_data.data[['user_id', 'movie_name', 'movie_rating']], reader)
surp_test_data = SurpDataset.load_from_df(test_data.data[['user_id', 'movie_name', 'movie_rating']], reader)

In [49]:
# Both splits are materialised as Surprise Trainset objects. Despite its name,
# `surp_testset` is a Trainset too; it is only used later as the source for
# `build_testset()`, never for fitting.
surp_trainset=surp_train_data.build_full_trainset()
surp_testset=surp_test_data.build_full_trainset()

In [50]:
surp_trainset,surp_testset

(<surprise.trainset.Trainset at 0x288aedfd0>,
 <surprise.trainset.Trainset at 0x2888b9af0>)

In [51]:
# Biased SVD with 100 latent factors and a fixed random_state; verbose=True
# prints one "Processing epoch N" line per training epoch.
svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(surp_trainset)
# Fit on the training split only; the held-out split must not be used for fitting.

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2917de0d0>

In [None]:
# a=surp_trainset.build_anti_testset()

In [52]:
surp_testset_train=surp_trainset.build_testset()
surp_testset_test=surp_testset.build_testset()

In [53]:
surp_train_preds=svd.test(surp_testset_train)
surp_test_preds=svd.test(surp_testset_test)

In [54]:
# accuracy.mse()

In [55]:
accuracy.rmse(surp_train_preds)

RMSE: 0.5547


0.5546664048917587

In [56]:
accuracy.rmse(surp_test_preds)

RMSE: 0.6839


0.6838776752442276

In [57]:
accuracy.mse(surp_test_preds)

MSE: 0.4677


0.46768867469744924

In [58]:
class SurpModelWrapper:
    """Adapter that exposes a fitted Surprise algorithm through a ``predict(DataFrame)`` API.

    The deepchecks checks in this notebook receive an instance of this class as
    their model. Surprise works on ``(user, item, rating)`` triples, so
    ``predict`` translates DataFrame rows into that form.
    """

    def __init__(self, surp_model):
        """Store the wrapped algorithm.

        Args:
            surp_model: A fitted object exposing ``test(testset)``, where
                ``testset`` is a list of ``(user_id, item_id, rating)`` tuples
                and the result is a sequence of objects with an ``est`` attribute.
        """
        self.model=surp_model

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Return one estimated rating per row of ``data``, in row order.

        Args:
            data: Frame containing at least the ``user_id`` and ``movie_name``
                columns. It is not modified.

        Returns:
            1-D float array with ``len(data)`` entries; entry ``k`` is the
            estimate for the ``k``-th row of ``data``.

        Raises:
            KeyError: If ``user_id`` or ``movie_name`` is missing from ``data``.
        """
        # Build the (user, item, rating) triples directly from the rows.
        # Round-tripping through Dataset.load_from_df(...).build_full_trainset()
        # .build_testset() regroups the ratings by user, so the predictions would
        # no longer line up with the input rows (and with the labels deepchecks
        # compares them to). It also required writing a dummy rating column into
        # the caller's frame, which is avoided here.
        # The true rating is unknown at inference time; 0 is only a placeholder.
        testset = [
            (user_id, movie_name, 0)
            for user_id, movie_name in zip(data["user_id"], data["movie_name"])
        ]

        predictions = self.model.test(testset)
        return np.array([pred.est for pred in predictions], dtype=float)

    def predict_proba(self, data: pd.DataFrame) -> np.ndarray:
        """Not implemented: class probabilities are only required for classification tasks."""
        ...

    @property
    def feature_importances_(self) -> pd.Series:  # optional
        """Not implemented: placeholder for a Series mapping feature name to importance."""
        ...



In [60]:
# Wrap the fitted SVD so the deepchecks model checks can call `.predict(DataFrame)`.
# The name `svd` is rebound to the wrapper, so guard against wrapping the wrapper
# again when this cell is re-run (a double-wrapped model has no usable `.test`).
if not isinstance(svd, SurpModelWrapper):
    svd=SurpModelWrapper(svd)

### Model Inference Time

In [61]:
from deepchecks.tabular.checks import ModelInferenceTime

In [63]:
mit_check=ModelInferenceTime(n_samples=no_rows)


In [64]:
mit_check_result=mit_check.run(test_data,model=svd)

In [65]:
mit_check_result

VBox(children=(HTML(value='<h4><b>Model Inference Time</b></h4>'), HTML(value='<p>Measure model average infere…

### Model Info

In [66]:
from deepchecks.tabular.checks import ModelInfo

In [68]:
mi_check=ModelInfo()
mi_check_result=mi_check.run(svd)





In [69]:
mi_check_result

VBox(children=(HTML(value='<h4><b>Model Info</b></h4>'), HTML(value='<p>Summarize given model parameters. <a h…

### Regression Error Distribution & Systematic Error (may not be applicable to our case)

In [70]:
from deepchecks.tabular.checks import RegressionErrorDistribution

In [71]:
red_check=RegressionErrorDistribution(n_samples=no_rows)

In [72]:
red_check_train_result=red_check.run(train_data,svd)
red_check_train_result

VBox(children=(HTML(value='<h4><b>Regression Error Distribution</b></h4>'), HTML(value='<p>Check for systemati…

In [73]:
red_check_test_result=red_check.run(test_data,svd)
red_check_test_result

VBox(children=(HTML(value='<h4><b>Regression Error Distribution</b></h4>'), HTML(value='<p>Check for systemati…

### Regression Systematic Error

In [74]:
from deepchecks.tabular.checks import RegressionSystematicError

In [76]:
# NOTE: deepchecks reports RegressionSystematicError as deprecated and points to
# RegressionErrorDistribution, which this notebook already runs in the previous
# section. Consider removing this check.
rse_check=RegressionSystematicError(n_samples=no_rows)


RegressionSystematicError check is deprecated and will be removed in future version, please use RegressionErrorDistribution check instead.



In [77]:
rse_check_result_train=rse_check.run(train_data,svd)
rse_check_result_test=rse_check.run(test_data,svd)

In [78]:
rse_check_result_train

VBox(children=(HTML(value='<h4><b>Regression Systematic Error</b></h4>'), HTML(value='<p>Check the regression …

In [79]:
rse_check_result_test

VBox(children=(HTML(value='<h4><b>Regression Systematic Error</b></h4>'), HTML(value='<p>Check the regression …

### Simple Model Comparison

In [80]:
from deepchecks.tabular.checks import SimpleModelComparison

In [81]:
# Compare the model against a shallow decision tree (max_depth=3) on a
# 20,000-row sample. `scorers` replaces the deprecated `alternative_scorers`
# keyword, which deepchecks warns about.
sdc_check=SimpleModelComparison(strategy="tree", scorers=['neg_rmse',
                                        'neg_mae',
                                        'r2'], n_samples=20000, max_depth=3)


SimpleModelComparison: alternative_scorers is deprecated. Please use scorers instead.



In [None]:
# sdc_check=SimpleModelComparison(strategy="tree", ,max_depth=2, n_samples=5000)

In [82]:
sdc_check_result=sdc_check.run(train_data,test_data,svd)


In [83]:
sdc_check_result

VBox(children=(HTML(value='<h4><b>Simple Model Comparison</b></h4>'), HTML(value='<p>Compare given model score…

### Single Dataset Performance

In [84]:
from deepchecks.tabular.checks import SingleDatasetPerformance

In [85]:
sdp_check=SingleDatasetPerformance(n_samples=no_rows)


In [86]:
sdp_check_result_train=sdp_check.run(train_data,svd)
sdp_check_result_train

VBox(children=(HTML(value='<h4><b>Single Dataset Performance</b></h4>'), HTML(value='<p>Summarize given model …

In [87]:
sdp_check_result_test=sdp_check.run(test_data,svd)
sdp_check_result_test

VBox(children=(HTML(value='<h4><b>Single Dataset Performance</b></h4>'), HTML(value='<p>Summarize given model …

### Train Test Performance

In [88]:
from deepchecks.tabular.checks import TrainTestPerformance

In [89]:
# Metrics reported for both the train and the test split.
ttp_scorers = [
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
ttp_check = TrainTestPerformance(scorers=ttp_scorers, n_samples=no_rows)

In [90]:
ttp_check_result=ttp_check.run(train_data,test_data,svd)
ttp_check_result

VBox(children=(HTML(value='<h4><b>Train Test Performance</b></h4>'), HTML(value='<p>Summarize given model perf…

### Train Test Prediction Drift

In [91]:
from deepchecks.tabular.checks import TrainTestPredictionDrift

In [92]:
ttpd_check=TrainTestPredictionDrift(n_samples=no_rows)

In [93]:
ttpd_check_result=ttpd_check.run(train_data,test_data,svd)
ttpd_check_result.show()

VBox(children=(HTML(value='<h4><b>Train Test Prediction Drift</b></h4>'), HTML(value='<p>    Calculate predict…