# Import

In [1]:
import ast
import itertools
import json
import numpy as np
import pandas as pd
from pandas_common import *

In [2]:
config_pandas_display()

In [3]:
df = pd.DataFrame(data=
                     {'integers': [1, 2, 3, 4, 5], 
                      'floats': [-2.0, -1.0, 0, 1, 1.5], 
                      'integer_array': [[1, 2], ['1', 2], ['1', '2'], [3, 4, 5], [6, 7, 8, 9]], 
                      'str_array': [[], '[]', {}, '{}', ['a','b']],
                      'str_array2': [[], '[]', {}, ["'a'"], ["'a'","'b'"]],
                      'literal_str_array': "[[], ['a'], ['a','b'], '[]', {}]", 
                      'literal_str_array2': '"[[], [a], [a,b], \'[]\', {}]"', 
                      'nones': ['null', '', 'NaN', None, 'None'],
                      'strs': ['a', 'b', 'c', 'd', 'e']
                     })

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
integers              5 non-null int64
floats                5 non-null float64
integer_array         5 non-null object
str_array             5 non-null object
str_array2            5 non-null object
literal_str_array     5 non-null object
literal_str_array2    5 non-null object
nones                 4 non-null object
strs                  5 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 440.0+ bytes


In [4]:
df

Unnamed: 0,integers,floats,integer_array,str_array,str_array2,literal_str_array,literal_str_array2,nones,strs
0,1,-2.0,"[1, 2]",[],[],"[[], ['a'], ['a','b'], '[]', {}]","""[[], [a], [a,b], '[]', {}]""",,a
1,2,-1.0,"[1, 2]",[],[],"[[], ['a'], ['a','b'], '[]', {}]","""[[], [a], [a,b], '[]', {}]""",,b
2,3,0.0,"[1, 2]",{},{},"[[], ['a'], ['a','b'], '[]', {}]","""[[], [a], [a,b], '[]', {}]""",,c
3,4,1.0,"[3, 4, 5]",{},['a'],"[[], ['a'], ['a','b'], '[]', {}]","""[[], [a], [a,b], '[]', {}]""",,d
4,5,1.5,"[6, 7, 8, 9]","[a, b]","['a', 'b']","[[], ['a'], ['a','b'], '[]', {}]","""[[], [a], [a,b], '[]', {}]""",,e


# String => integer / float

In [5]:
df = pd.DataFrame(data={'integers': [1, "1", "'1'", '"1"']})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
integers    4 non-null object
dtypes: object(1)
memory usage: 112.0+ bytes


In [6]:
df = add_type_columns(df)
df

Unnamed: 0,integers,integers_type
0,1,<class 'int'>
1,1,<class 'str'>
2,'1',<class 'str'>
3,"""1""",<class 'str'>


In [7]:
compare_all_list_items(df['integers'])

Unnamed: 0,item1,item2,comparison
0,1,'1',False
1,1,"""'1'""",False
2,1,"'""1""'",False
3,'1',"""'1'""",False
4,'1',"'""1""'",False
5,"""'1'""","'""1""'",False


#### Verify the output in csv file

In [8]:
df.to_csv('tmp-datatypes-int.csv', index=False)

In [9]:
tmp = pd.read_csv('tmp-datatypes-int.csv')
tmp

Unnamed: 0,integers,integers_type
0,1,<class 'int'>
1,1,<class 'str'>
2,'1',<class 'str'>
3,"""1""",<class 'str'>


#### Clean the string

`strip()` > `replace()`

In [10]:
a = "'1'"
a = '"1"'
a = """1"""
a = ''"""'1'"""''
# a = a.replace("'", "").replace('"', "")
a = a.strip("'\"")
a

'1'

In [11]:
df

Unnamed: 0,integers,integers_type
0,1,<class 'int'>
1,1,<class 'str'>
2,'1',<class 'str'>
3,"""1""",<class 'str'>


In [12]:
# suffix = '_type'
# newColLoc = df.columns.get_loc('cleanse')+1
# newCol = 'cleanse'+suffix
# newColValue = df['cleanse'].apply(lambda x: type(x))
# df.insert(newColLoc, column=newCol, value=newColValue)

In [13]:
df['cleanse'] = df.apply(lambda x: convert_string_to_number(x['integers'], 'int'), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,integers,integers_type,cleanse,cleanse_type
0,1,<class 'int'>,1,<class 'int'>
1,1,<class 'str'>,1,<class 'int'>
2,'1',<class 'str'>,1,<class 'int'>
3,"""1""",<class 'str'>,1,<class 'int'>


In [14]:
True | False

True

## Float

In [15]:
df = pd.DataFrame(data={'floats': [1.0, '1.0', '"1.0"', "'1.0'", 1.000000000000001, 1.000000000000001]})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 1 columns):
floats    6 non-null object
dtypes: object(1)
memory usage: 128.0+ bytes


In [16]:
df['cleanse'] = df.apply(lambda x: convert_string_to_number(x['floats'], 'float'), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,floats,floats_type,cleanse,cleanse_type
0,1,<class 'float'>,1.0,<class 'float'>
1,1.0,<class 'str'>,1.0,<class 'float'>
2,"""1.0""",<class 'str'>,1.0,<class 'float'>
3,'1.0',<class 'str'>,1.0,<class 'float'>
4,1,<class 'float'>,1.0,<class 'float'>
5,1,<class 'float'>,1.0,<class 'float'>


In [17]:
compare_all_list_items(df['floats'])

Unnamed: 0,item1,item2,comparison
0,1.0,'1.0',False
1,1.0,"'""1.0""'",False
2,1.0,"""'1.0'""",False
3,1.0,1.000000000000001,False
4,1.0,1.000000000000001,False
5,'1.0',"'""1.0""'",False
6,'1.0',"""'1.0'""",False
7,'1.0',1.000000000000001,False
8,'1.0',1.000000000000001,False
9,"'""1.0""'","""'1.0'""",False


In [18]:
df['cleanse'] = df.apply(lambda x: convert_string_to_number(x['floats'], 'float'), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,floats,floats_type,cleanse,cleanse_type
0,1,<class 'float'>,1.0,<class 'float'>
1,1.0,<class 'str'>,1.0,<class 'float'>
2,"""1.0""",<class 'str'>,1.0,<class 'float'>
3,'1.0',<class 'str'>,1.0,<class 'float'>
4,1,<class 'float'>,1.0,<class 'float'>
5,1,<class 'float'>,1.0,<class 'float'>


# String => Boolean

In [19]:
df = pd.DataFrame(data={'bools': [True, 'True', '"True"', 1, False, 'False', '"False"', 0]})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 1 columns):
bools    8 non-null object
dtypes: object(1)
memory usage: 144.0+ bytes


In [20]:
df = add_type_columns(df)
df

Unnamed: 0,bools,bools_type
0,True,<class 'bool'>
1,True,<class 'str'>
2,"""True""",<class 'str'>
3,1,<class 'int'>
4,False,<class 'bool'>
5,False,<class 'str'>
6,"""False""",<class 'str'>
7,0,<class 'int'>


In [21]:
int(True)

1

In [22]:
compare_all_list_items(df['bools'])

Unnamed: 0,item1,item2,comparison
0,True,'True',False
1,True,"'""True""'",False
2,True,1,True
3,True,False,False
4,True,'False',False
5,True,"'""False""'",False
6,True,0,False
7,'True',"'""True""'",False
8,'True',1,False
9,'True',False,False


In [23]:
df['cleanse'] = df.apply(lambda x: convert_string_to_boolean(x['bools']), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,bools,bools_type,cleanse,cleanse_type
0,True,<class 'bool'>,True,<class 'bool'>
1,True,<class 'str'>,True,<class 'bool'>
2,"""True""",<class 'str'>,True,<class 'bool'>
3,1,<class 'int'>,1,<class 'int'>
4,False,<class 'bool'>,False,<class 'bool'>
5,False,<class 'str'>,False,<class 'bool'>
6,"""False""",<class 'str'>,False,<class 'bool'>
7,0,<class 'int'>,0,<class 'int'>


In [24]:
bool('False')

True

In [25]:
df['cleanse'] = df.apply(lambda x: convert_string_to_boolean(x['bools'], 'int'), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,bools,bools_type,cleanse,cleanse_type
0,True,<class 'bool'>,1,<class 'bool'>
1,True,<class 'str'>,1,<class 'bool'>
2,"""True""",<class 'str'>,1,<class 'bool'>
3,1,<class 'int'>,1,<class 'int'>
4,False,<class 'bool'>,0,<class 'bool'>
5,False,<class 'str'>,0,<class 'bool'>
6,"""False""",<class 'str'>,0,<class 'bool'>
7,0,<class 'int'>,0,<class 'int'>


# String

In [26]:
df = pd.DataFrame(data={'strs': ['String contains a single quote(\') character', 
                                 'String contains a double quote(\") character',
                                 'String contains newline \newline',
                                 'String contains literal backslash (\\) character',
                                 'String contains ASCII horizontal tab \t character',
                                 'String contains ASCII bell (BEL) \a character',
                                 'String contains ASCII backspace (BS) \b character',
                                 'String contains ASCII formfeed (FF) \f character',
                                 'String contains ASCII linefeed (LF) \n character',
                                 'String contains ASCII carriage return (CR) \r character',
                                 'String contains ASCII vertical tab (VT) \v character',
                                 '''String contains a single (\') and a double (") quote''',
                                 '"String has inner double quotes"',
                                 "'String has inner single quotes'",
                                ]})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 1 columns):
strs    14 non-null object
dtypes: object(1)
memory usage: 192.0+ bytes


In [27]:
df

Unnamed: 0,strs
0,String contains a single quote(') character
1,"String contains a double quote("") character"
2,String contains newline \newline
3,String contains literal backslash (\) character
4,String contains ASCII horizontal tab \t character
5,String contains ASCII bell (BEL)  character
6,String contains ASCII backspace (BS)  character
7,String contains ASCII formfeed (FF) character
8,String contains ASCII linefeed (LF) \n character
9,String contains ASCII carriage return (CR) \r character


In [28]:
compare_all_list_items(df['strs'])

Unnamed: 0,item1,item2,comparison
0,"""String contains a single quote(') character""","'String contains a double quote("") character'",False
1,"""String contains a single quote(') character""",'String contains newline \newline',False
2,"""String contains a single quote(') character""",'String contains literal backslash (\\) character',False
3,"""String contains a single quote(') character""",'String contains ASCII horizontal tab \t character',False
4,"""String contains a single quote(') character""",'String contains ASCII bell (BEL) \x07 character',False
5,"""String contains a single quote(') character""",'String contains ASCII backspace (BS) \x08 character',False
6,"""String contains a single quote(') character""",'String contains ASCII formfeed (FF) \x0c character',False
7,"""String contains a single quote(') character""",'String contains ASCII linefeed (LF) \n character',False
8,"""String contains a single quote(') character""",'String contains ASCII carriage return (CR) \r character',False
9,"""String contains a single quote(') character""",'String contains ASCII vertical tab (VT) \x0b character',False


In [29]:
df['cleanse'] = df.apply(lambda x: clean_string_literal(x['strs']), axis=1)
df

Unnamed: 0,strs,cleanse
0,String contains a single quote(') character,String contains a single quote(') character
1,"String contains a double quote("") character","String contains a double quote("") character"
2,String contains newline \newline,String contains newline \newline
3,String contains literal backslash (\) character,String contains literal backslash (\) character
4,String contains ASCII horizontal tab \t character,String contains ASCII horizontal tab \t character
5,String contains ASCII bell (BEL)  character,String contains ASCII bell (BEL)  character
6,String contains ASCII backspace (BS)  character,String contains ASCII backspace (BS)  character
7,String contains ASCII formfeed (FF) character,String contains ASCII formfeed (FF) character
8,String contains ASCII linefeed (LF) \n character,String contains ASCII linefeed (LF) \n character
9,String contains ASCII carriage return (CR) \r character,String contains ASCII carriage return (CR) \r character


# List

In [30]:
df = pd.DataFrame(data={'lists': [
    [],
    [1,2], ['1','2'],
    '[]', "[]",
    '[1,2]', "[1,2]",
    '"[1,2]"', '"[\'1\',\'2\']"',
    ['a','b'], "['a','b']", '"[\'a\',\'b\']"'
]})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 1 columns):
lists    12 non-null object
dtypes: object(1)
memory usage: 176.0+ bytes


In [31]:
df = add_type_columns(df)
df

Unnamed: 0,lists,lists_type
0,[],<class 'list'>
1,"[1, 2]",<class 'list'>
2,"[1, 2]",<class 'list'>
3,[],<class 'str'>
4,[],<class 'str'>
5,"[1,2]",<class 'str'>
6,"[1,2]",<class 'str'>
7,"""[1,2]""",<class 'str'>
8,"""['1','2']""",<class 'str'>
9,"[a, b]",<class 'list'>


In [32]:
compare_all_list_items(df['lists'])

Unnamed: 0,item1,item2,comparison
0,[],"[1, 2]",False
1,[],"['1', '2']",False
2,[],'[]',False
3,[],'[]',False
4,[],"'[1,2]'",False
5,[],"'[1,2]'",False
6,[],"'""[1,2]""'",False
7,[],"'""[\'1\',\'2\']""'",False
8,[],"['a', 'b']",False
9,[],"""['a','b']""",False


In [33]:
df['cleanse'] = df.apply(lambda x: clean_string_literal(x['lists']), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,lists,lists_type,cleanse,cleanse_type
0,[],<class 'list'>,[],<class 'list'>
1,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>
2,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>
3,[],<class 'str'>,[],<class 'str'>
4,[],<class 'str'>,[],<class 'str'>
5,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>
6,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>
7,"""[1,2]""",<class 'str'>,"[1,2]",<class 'str'>
8,"""['1','2']""",<class 'str'>,"['1','2']",<class 'str'>
9,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>


In [34]:
compare_all_list_items(df['cleanse'])

Unnamed: 0,item1,item2,comparison
0,[],"[1, 2]",False
1,[],"['1', '2']",False
2,[],'[]',False
3,[],'[]',False
4,[],"'[1,2]'",False
5,[],"'[1,2]'",False
6,[],"'[1,2]'",False
7,[],"""['1','2']""",False
8,[],"['a', 'b']",False
9,[],"""['a','b']""",False


In [35]:
json.loads(df.iloc[8,2])

JSONDecodeError: Expecting value: line 1 column 2 (char 1)

**json.loads()** is unable to handle "['1','2']"

In [36]:
# ast.literal_eval("[a,b]".strip("'\""))
# json.loads("[a,b]".strip("'\""))

In [37]:
df['cleanse2'] = df.apply(lambda x: convert_string_to_DL(x['lists']), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,lists,lists_type,cleanse,cleanse_type,cleanse2,cleanse2_type
0,[],<class 'list'>,[],<class 'list'>,[],<class 'list'>
1,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>
2,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>
3,[],<class 'str'>,[],<class 'str'>,[],<class 'list'>
4,[],<class 'str'>,[],<class 'str'>,[],<class 'list'>
5,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>
6,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>
7,"""[1,2]""",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>
8,"""['1','2']""",<class 'str'>,"['1','2']",<class 'str'>,"[1, 2]",<class 'list'>
9,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>


In [38]:
df['dumps'] = df.apply(lambda x: json.dumps(x['cleanse2']), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,lists,lists_type,cleanse,cleanse_type,cleanse2,cleanse2_type,dumps,dumps_type
0,[],<class 'list'>,[],<class 'list'>,[],<class 'list'>,[],<class 'str'>
1,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>
2,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[""1"", ""2""]",<class 'str'>
3,[],<class 'str'>,[],<class 'str'>,[],<class 'list'>,[],<class 'str'>
4,[],<class 'str'>,[],<class 'str'>,[],<class 'list'>,[],<class 'str'>
5,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>
6,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>
7,"""[1,2]""",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>
8,"""['1','2']""",<class 'str'>,"['1','2']",<class 'str'>,"[1, 2]",<class 'list'>,"[""1"", ""2""]",<class 'str'>
9,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>,"[""a"", ""b""]",<class 'str'>


In [39]:
df['1st_item'] = df.apply(lambda x: x['cleanse2'][0] if len(x['cleanse2']) > 0 else None, axis=1)
df = add_type_columns(df)
df

Unnamed: 0,lists,lists_type,cleanse,cleanse_type,cleanse2,cleanse2_type,dumps,dumps_type,1st_item,1st_item_type
0,[],<class 'list'>,[],<class 'list'>,[],<class 'list'>,[],<class 'str'>,,<class 'NoneType'>
1,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>,1,<class 'int'>
2,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'list'>,"[""1"", ""2""]",<class 'str'>,1,<class 'str'>
3,[],<class 'str'>,[],<class 'str'>,[],<class 'list'>,[],<class 'str'>,,<class 'NoneType'>
4,[],<class 'str'>,[],<class 'str'>,[],<class 'list'>,[],<class 'str'>,,<class 'NoneType'>
5,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>,1,<class 'int'>
6,"[1,2]",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>,1,<class 'int'>
7,"""[1,2]""",<class 'str'>,"[1,2]",<class 'str'>,"[1, 2]",<class 'list'>,"[1, 2]",<class 'str'>,1,<class 'int'>
8,"""['1','2']""",<class 'str'>,"['1','2']",<class 'str'>,"[1, 2]",<class 'list'>,"[""1"", ""2""]",<class 'str'>,1,<class 'str'>
9,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>,"[a, b]",<class 'list'>,"[""a"", ""b""]",<class 'str'>,a,<class 'str'>


In [40]:
compare_all_list_items(df['cleanse2'])

Unnamed: 0,item1,item2,comparison
0,[],"[1, 2]",False
1,[],"['1', '2']",False
2,[],[],True
3,[],[],True
4,[],"[1, 2]",False
5,[],"[1, 2]",False
6,[],"[1, 2]",False
7,[],"['1', '2']",False
8,[],"['a', 'b']",False
9,[],"['a', 'b']",False


In [41]:
df.to_csv('tmp-datatypes-list.csv', index=False)

# Dictionary

In [42]:
df = pd.DataFrame(data={'dicts': [
    {}, '{}',
    {'a':1, 'b':2}, 
    '{\'a\':1, \'b\':2}', "{'a':1, 'b':2}",
    '"{\'a\':1, \'b\':2}"', "'{\'a\':1, \'b\':2}'"
]})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 1 columns):
dicts    7 non-null object
dtypes: object(1)
memory usage: 136.0+ bytes


In [43]:
df = add_type_columns(df)
df

Unnamed: 0,dicts,dicts_type
0,{},<class 'dict'>
1,{},<class 'str'>
2,"{'a': 1, 'b': 2}",<class 'dict'>
3,"{'a':1, 'b':2}",<class 'str'>
4,"{'a':1, 'b':2}",<class 'str'>
5,"""{'a':1, 'b':2}""",<class 'str'>
6,"'{'a':1, 'b':2}'",<class 'str'>


In [44]:
df.iloc[3,0] == df.iloc[4,0]

True

In [45]:
df['cleanse'] = df.apply(lambda x: clean_string_literal(x['dicts']), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,dicts,dicts_type,cleanse,cleanse_type
0,{},<class 'dict'>,{},<class 'dict'>
1,{},<class 'str'>,{},<class 'str'>
2,"{'a': 1, 'b': 2}",<class 'dict'>,"{'a': 1, 'b': 2}",<class 'dict'>
3,"{'a':1, 'b':2}",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>
4,"{'a':1, 'b':2}",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>
5,"""{'a':1, 'b':2}""",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>
6,"'{'a':1, 'b':2}'",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>


In [46]:
df.to_csv('tmp-datatypes-dict.csv', index=False)

In [47]:
compare_all_list_items(df['dicts'])

Unnamed: 0,item1,item2,comparison
0,{},'{}',False
1,{},"{'a': 1, 'b': 2}",False
2,{},"""{'a':1, 'b':2}""",False
3,{},"""{'a':1, 'b':2}""",False
4,{},"'""{\'a\':1, \'b\':2}""'",False
5,{},"""'{'a':1, 'b':2}'""",False
6,'{}',"{'a': 1, 'b': 2}",False
7,'{}',"""{'a':1, 'b':2}""",False
8,'{}',"""{'a':1, 'b':2}""",False
9,'{}',"'""{\'a\':1, \'b\':2}""'",False


In [48]:
compare_all_list_items(df['cleanse'])

Unnamed: 0,item1,item2,comparison
0,{},'{}',False
1,{},"{'a': 1, 'b': 2}",False
2,{},"""{'a':1, 'b':2}""",False
3,{},"""{'a':1, 'b':2}""",False
4,{},"""{'a':1, 'b':2}""",False
5,{},"""{'a':1, 'b':2}""",False
6,'{}',"{'a': 1, 'b': 2}",False
7,'{}',"""{'a':1, 'b':2}""",False
8,'{}',"""{'a':1, 'b':2}""",False
9,'{}',"""{'a':1, 'b':2}""",False


In [49]:
df['cleanse2'] = df.apply(lambda x: convert_string_to_DL(x['dicts']), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,dicts,dicts_type,cleanse,cleanse_type,cleanse2,cleanse2_type
0,{},<class 'dict'>,{},<class 'dict'>,{},<class 'dict'>
1,{},<class 'str'>,{},<class 'str'>,{},<class 'dict'>
2,"{'a': 1, 'b': 2}",<class 'dict'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{'a': 1, 'b': 2}",<class 'dict'>
3,"{'a':1, 'b':2}",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
4,"{'a':1, 'b':2}",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
5,"""{'a':1, 'b':2}""",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
6,"'{'a':1, 'b':2}'",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>


# JSON

In [50]:
df = pd.DataFrame(data={'jsons': [
    {}, '{}',
    {'a':1, 'b':2}, {"a":1, "b":2},
    '{\'a\':1, \'b\':2}', "{'a':1, 'b':2}",
    '"{\'a\':1, \'b\':2}"', "'{\'a\':1, \'b\':2}'",
    [{}], '[{}]', "[{}]",
    [{'a':1, 'b':2}, {'c':3, 'd':4}], "[{'a':1, 'b':2}, {'c':3, 'd':4}]", ["{'a':1, 'b':2}", "{'c':3, 'd':4}"]
]})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 1 columns):
jsons    14 non-null object
dtypes: object(1)
memory usage: 192.0+ bytes


In [51]:
df['dumps'] = df.apply(lambda x: json.dumps(x['jsons']), axis=1)
df['dumps-loads'] = df.apply(lambda x: json.loads(json.dumps(x['jsons'])), axis=1)
df['cleanse'] = df.apply(lambda x: convert_string_to_DL(x['jsons']), axis=1)
df['dumps2'] = df.apply(lambda x: json.dumps(x['cleanse']), axis=1)
df['dumps-loads2'] = df.apply(lambda x: json.loads(json.dumps(x['cleanse'])), axis=1)
df = add_type_columns(df)
df

Unnamed: 0,jsons,jsons_type,dumps,dumps_type,dumps-loads,dumps-loads_type,cleanse,cleanse_type,dumps2,dumps2_type,dumps-loads2,dumps-loads2_type
0,{},<class 'dict'>,{},<class 'str'>,{},<class 'dict'>,{},<class 'dict'>,{},<class 'str'>,{},<class 'dict'>
1,{},<class 'str'>,"""{}""",<class 'str'>,{},<class 'str'>,{},<class 'dict'>,{},<class 'str'>,{},<class 'dict'>
2,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
3,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
4,"{'a':1, 'b':2}",<class 'str'>,"""{'a':1, 'b':2}""",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
5,"{'a':1, 'b':2}",<class 'str'>,"""{'a':1, 'b':2}""",<class 'str'>,"{'a':1, 'b':2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
6,"""{'a':1, 'b':2}""",<class 'str'>,"""\""{'a':1, 'b':2}\""""",<class 'str'>,"""{'a':1, 'b':2}""",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
7,"'{'a':1, 'b':2}'",<class 'str'>,"""'{'a':1, 'b':2}'""",<class 'str'>,"'{'a':1, 'b':2}'",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>,"{""a"": 1, ""b"": 2}",<class 'str'>,"{'a': 1, 'b': 2}",<class 'dict'>
8,[{}],<class 'list'>,[{}],<class 'str'>,[{}],<class 'list'>,[{}],<class 'list'>,[{}],<class 'str'>,[{}],<class 'list'>
9,[{}],<class 'str'>,"""[{}]""",<class 'str'>,[{}],<class 'str'>,[{}],<class 'list'>,[{}],<class 'str'>,[{}],<class 'list'>


In [52]:
df.columns.get_loc('jsons')

0

In [53]:
json.loads('{"a":1, "b":2}')

{'a': 1, 'b': 2}

In [54]:
json.dumps("{'a': 1, 'b': 2}")

'"{\'a\': 1, \'b\': 2}"'

In [55]:
df.to_csv('tmp-datatypes-json.csv', index=False)