### What are Mixed Data Types?

Mixed data types is when a column contains both string values and numeric values (either as numeric type or as string like “42.90”). This may indicate a problem in the data collection pipeline, or represent a problem situation for the model’s training.

This checks searches for columns with a mix of strings and numeric values and returns them and their respective ratios.

In [1]:
import pandas as pd
import numpy as np
from deepchecks.tabular.datasets.classification.phishing import load_data

#### Load Dataset

In [2]:
phishing_dataset = load_data(as_train_test=False, data_format='DataFrame')
phishing_dataset

Unnamed: 0,target,month,scrape_date,ext,urlLength,numDigits,numParams,num_%20,num_@,entropy,...,dse,bodyLength,numTitles,numImages,numLinks,specialChars,scriptLength,sbr,bscr,sscr
0,0,1,2019-01-01,net,102,8,0,0,0,-4.384032,...,191,32486,3,5,330,9419,23919,0.736286,0.289940,2.539442
1,0,1,2019-01-01,country,154,60,0,2,0,-3.566515,...,0,16199,0,4,39,2735,794,0.049015,0.168838,0.290311
2,0,1,2019-01-01,net,171,5,11,0,0,-4.608755,...,104,103344,18,9,302,27798,83817,0.811049,0.268985,2.412174
3,0,1,2019-01-01,com,94,10,0,0,0,-4.548921,...,466,34093,11,43,199,9087,19427,0.569824,0.266536,2.137889
4,0,1,2019-01-01,other,95,11,0,0,0,-4.717188,...,928,202,1,0,0,39,0,0.000000,0.193069,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11345,0,1,2020-01-15,country,89,7,0,0,0,-4.254491,...,0,4117,5,0,1,971,1866,0.625302,0.213266,2.932029
11346,0,1,2020-01-15,other,107,13,0,0,0,-4.758879,...,1882,17788,47,58,645,3185,4228,0.291069,0.214348,1.357928
11347,0,1,2020-01-15,com,112,10,0,0,0,-4.723014,...,1011,0,0,0,0,0,0,0.000000,0.000000,0.000000
11348,0,1,2020-01-15,html,111,3,0,0,0,-4.289384,...,265,0,0,0,0,0,0,0.000000,0.000000,0.000000


#### Functions to add mixed data

In [3]:
def insert_new_values_types(col: pd.Series, ratio_to_replace: float, values_list):
    col = col.to_numpy().astype(object)
    indices_to_replace = np.random.choice(range(len(col)), int(len(col) * ratio_to_replace), replace=False)
    new_values = np.random.choice(values_list, len(indices_to_replace))
    col[indices_to_replace] = new_values
    return col


def insert_string_types(col: pd.Series, ratio_to_replace):
    return insert_new_values_types(col, ratio_to_replace, ['a', 'b', 'c'])


def insert_numeric_string_types(col: pd.Series, ratio_to_replace):
    return insert_new_values_types(col, ratio_to_replace, ['1.0', '1', '10394.33'])


def insert_number_types(col: pd.Series, ratio_to_replace):
    return insert_new_values_types(col, ratio_to_replace, [66, 99.9])



In [18]:
phishing_dataset['num_@'] = insert_numeric_string_types(phishing_dataset['num_@'], ratio_to_replace=0.01)
phishing_dataset['ext'] = insert_number_types(phishing_dataset['ext'], ratio_to_replace=0.1)
phishing_dataset['bodyLength'] = insert_string_types(phishing_dataset['bodyLength'], ratio_to_replace=0.5)

Check for MixedDataTypes

In [19]:
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import MixedDataTypes

check = MixedDataTypes()
result = check.run(phishing_dataset)
result

Received a "pandas.DataFrame" instance, initializing "deepchecks.tabular.Dataset" from it
It is recommended to initialize Dataset with categorical features by doing "Dataset(df, cat_features=categorical_list)". No categorical features were passed, therefore heuristically inferring categorical features in the data.
10 categorical features were inferred: target, month, ext, numParams, num_%20, num_@, has_ip... For full list use dataset.cat_features


VBox(children=(HTML(value='<h4><b>Mixed Data Types</b></h4>'), HTML(value='<p>Detect columns which contain a m…

In [5]:
phishing_dataset.columns

Index(['target', 'month', 'scrape_date', 'ext', 'urlLength', 'numDigits',
       'numParams', 'num_%20', 'num_@', 'entropy', 'has_ip', 'hasHttp',
       'hasHttps', 'urlIsLive', 'dsr', 'dse', 'bodyLength', 'numTitles',
       'numImages', 'numLinks', 'specialChars', 'scriptLength', 'sbr', 'bscr',
       'sscr'],
      dtype='object')

In [17]:
phishing_dataset['ext']

0          net
1         99.9
2          net
3          com
4         99.9
         ...  
11345     66.0
11346    other
11347     99.9
11348     html
11349     html
Name: ext, Length: 11350, dtype: object