# Data Cleaning

In [1]:
### one example
# load data
import numpy as np
import pandas as pd

import json
data = pd.read_csv("chatDataWithContext.csv")
json_str = data["context"][0]

# Drop the braces, then rebuild every "key: value" fragment as a JSON member.
parts = json_str.replace("{", "").replace("}", "").split(",")
cleaned_parts = []
for fragment in parts:
    raw_key, raw_val = fragment.split(":")
    name = raw_key.replace(" ", "")
    text = raw_val.strip()
    # Digit-only values and null are emitted bare; everything else is quoted.
    is_bare = text.isnumeric() or text == "null"
    member = f'"{name}":{text}' if is_bare else f'"{name}":"{text}"'
    cleaned_parts.append(member)

# Assemble the members into one JSON object, show it, then parse it.
cleaned_json_str = "{" + ",".join(cleaned_parts) + "}"
print(cleaned_json_str)
parsed_json = json.loads(cleaned_json_str)


{"tailleCm":180,"poidsKg":75,"groupeSanguin":"O+","IMC":"23.1","lieuNaissance":"Paris","adresse":"123 Rue","HTA":"non","diabete":"non","dyslipidemie":"oui","autresAntecedentsFamiliaux":null,"nbGrossesse":null,"nbEnfantsVivants":null,"nbMacrosomies":null,"nbAvortements":null,"nbMortNes":null,"contraceptionUtilisee":null,"ageMenopause":null,"autresAntecedentsGynecoObstetriques":null,"alcoolSemaine":0,"tabacStatus":"non","nbCigaretteParJour":null,"exFumerDate":null,"drogue":"non","autreHabitudeToxique":"non"}


In [2]:
def cleanRow(json_str):
    """Parse one pseudo-JSON ``context`` string into a dict.

    The input looks like ``{key: value, key: value, ...}`` with unquoted keys
    and values. Each value is converted as follows:

    * ``null`` and ``aucun`` become ``None``;
    * values made only of ASCII digits become ``int``;
    * anything else (including decimals such as ``23.1``) stays a stripped
      string.

    Args:
        json_str: Raw context string. A non-string value (for example the NaN
            pandas uses for a missing cell) yields an empty dict.

    Returns:
        dict mapping each whitespace-stripped key to its converted value.

    Raises:
        ValueError: if a non-empty fragment has no ``:`` separator, which
            happens when a value itself contains a comma.
    """
    if not isinstance(json_str, str):
        return {}

    record = {}
    body = json_str.replace("{", "").replace("}", "")
    for part in body.split(","):
        # Split on the first colon only so values that contain ':' survive.
        key, sep, val = part.partition(":")
        if not sep:
            if not part.strip():
                # Empty fragment, e.g. from "{}" or a trailing comma.
                continue
            raise ValueError(
                f"cannot parse fragment {part!r}: expected 'key: value'"
            )
        key = key.strip()
        val = val.strip()
        if val in ("null", "aucun"):
            record[key] = None
        elif val.isascii() and val.isdigit():
            record[key] = int(val)
        else:
            record[key] = val
    return record

In [3]:
# Parse every context string into a record, one column per key.
context_records = data["context"].map(cleanRow).tolist()
df_info = pd.DataFrame(context_records)
# Put the parsed columns next to the remaining original columns (joined on the row index).
df_full = df_info.join(data.drop(columns="context"))
# Duplicate rows are kept; de-duplication is currently disabled:
# df_full=df_full.drop_duplicates()

In [4]:
# Remove the lieuNaissance column from the working table.
df_full = df_full.drop(columns=["lieuNaissance"])

In [5]:
# Show the current column labels of df_full.
df_full.columns

Index(['tailleCm', 'poidsKg', 'groupeSanguin', 'IMC', 'adresse', 'HTA',
       'diabete', 'dyslipidemie', 'autresAntecedentsFamiliaux', 'nbGrossesse',
       'nbEnfantsVivants', 'nbMacrosomies', 'nbAvortements', 'nbMortNes',
       'contraceptionUtilisee', 'ageMenopause',
       'autresAntecedentsGynecoObstetriques', 'alcoolSemaine', 'tabacStatus',
       'nbCigaretteParJour', 'exFumerDate', 'drogue', 'autreHabitudeToxique',
       'Age', 'sexe', 'Description', 'Ordonnance'],
      dtype='object')

## Clean columns

In [6]:
## tailleCm (height, cm): frequency of each recorded value
df_full["tailleCm"].value_counts()

tailleCm
160    38
165    34
170    31
175    28
180    26
162    25
168    21
178    19
172    16
158    12
176    12
174    11
182     9
164     7
155     6
167     4
163     3
173     3
177     3
185     3
169     2
183     2
159     2
157     1
161     1
166     1
Name: count, dtype: int64

In [7]:
## poidsKg (weight, kg): frequency of each recorded value
df_full["poidsKg"].value_counts()

poidsKg
60    34
80    27
70    23
85    23
82    22
58    19
78    19
68    18
65    15
55    15
74    13
62    11
90     9
72     8
64     8
75     8
88     6
59     5
54     5
86     4
50     4
63     4
95     3
77     3
89     2
84     2
67     2
83     2
92     2
79     1
91     1
61     1
69     1
Name: count, dtype: int64

In [8]:
## groupeSanguin (blood group): number of rows per group
df_full["groupeSanguin"].value_counts()

groupeSanguin
B+     56
O+     53
A+     52
A-     38
O-     36
B-     35
AB+    30
AB-    20
Name: count, dtype: int64

In [9]:
## IMC (presumably body-mass index): frequency of each recorded value
df_full["IMC"].value_counts()

IMC
26.2    17
22.9    15
26.8    14
21.5    13
26.1    13
        ..
23.9     1
25.1     1
21.8     1
28.4     1
21.4     1
Name: count, Length: 64, dtype: int64

In [10]:
## HTA: number of rows per oui/non (yes/no) answer
df_full["HTA"].value_counts()

HTA
non    183
oui    137
Name: count, dtype: int64

In [11]:
## diabete (diabetes): number of rows per oui/non (yes/no) answer
df_full["diabete"].value_counts()

diabete
non    240
oui     80
Name: count, dtype: int64

In [12]:
## dyslipidemie (dyslipidemia): number of rows per oui/non (yes/no) answer
df_full["dyslipidemie"].value_counts()

dyslipidemie
non    174
oui    146
Name: count, dtype: int64

In [13]:
## autresAntecedentsFamiliaux: turn the 'null' / 'aucun' placeholders into NaN
# Counts before the replacement, for comparison with the counts shown after it.
print(df_full["autresAntecedentsFamiliaux"].value_counts())
df_full = df_full.assign(
    autresAntecedentsFamiliaux=df_full["autresAntecedentsFamiliaux"].replace(
        to_replace=['null', 'aucun'], value=np.nan
    )
)
df_full["autresAntecedentsFamiliaux"].value_counts()

autresAntecedentsFamiliaux
hypertension            99
diabète                 34
infarctus               12
dyslipidemie            12
AVC                      8
diabete                  4
hypercholestérolémie     2
cardiovasculaire         2
Name: count, dtype: int64


autresAntecedentsFamiliaux
hypertension            99
diabète                 34
infarctus               12
dyslipidemie            12
AVC                      8
diabete                  4
hypercholestérolémie     2
cardiovasculaire         2
Name: count, dtype: int64

In [14]:
## nbGrossesse: missing values (NaN) are replaced with 0
# Counts before the replacement, NaN included.
print(df_full["nbGrossesse"].value_counts(dropna=False))
df_full = df_full.assign(
    nbGrossesse=df_full["nbGrossesse"].replace(to_replace=[np.nan], value=0)
)
df_full["nbGrossesse"].value_counts(dropna=False)


nbGrossesse
NaN    163
2.0     51
1.0     50
3.0     35
0.0     19
4.0      2
Name: count, dtype: int64


nbGrossesse
0.0    182
2.0     51
1.0     50
3.0     35
4.0      2
Name: count, dtype: int64

In [15]:
## nbEnfantsVivants: missing values (NaN) are replaced with 0
df_full = df_full.assign(
    nbEnfantsVivants=df_full["nbEnfantsVivants"].replace(to_replace=np.nan, value=0)
)
df_full["nbEnfantsVivants"].value_counts(dropna=False)

nbEnfantsVivants
0.0    182
2.0     53
1.0     50
3.0     35
Name: count, dtype: int64

In [16]:
## nbMacrosomies: missing values (NaN) are replaced with 0
df_full = df_full.assign(
    nbMacrosomies=df_full["nbMacrosomies"].replace(to_replace=np.nan, value=0)
)
df_full["nbMacrosomies"].value_counts(dropna=False)

nbMacrosomies
0.0    312
1.0      8
Name: count, dtype: int64

In [17]:
## nbAvortements: missing values (NaN) are replaced with 0
df_full = df_full.assign(
    nbAvortements=df_full["nbAvortements"].replace(to_replace=np.nan, value=0)
)
df_full["nbAvortements"].value_counts(dropna=False)

nbAvortements
0.0    268
1.0     52
Name: count, dtype: int64

In [18]:
## nbMortNes: missing values (NaN) are replaced with 0
df_full = df_full.assign(
    nbMortNes=df_full["nbMortNes"].replace(to_replace=np.nan, value=0)
)
df_full["nbMortNes"].value_counts(dropna=False)

nbMortNes
0.0    318
1.0      2
Name: count, dtype: int64

In [19]:
## contraceptionUtilisee: missing values (NaN) become the literal string "null"
df_full = df_full.assign(
    contraceptionUtilisee=df_full["contraceptionUtilisee"].replace(
        to_replace=np.nan, value="null"
    )
)
df_full["contraceptionUtilisee"].value_counts(dropna=False)

contraceptionUtilisee
null           167
pilule          67
stérilet        28
implant         25
DIU             23
patch            8
préservatif      2
Name: count, dtype: int64

In [20]:
## ageMenopause: missing values (NaN) are replaced with 0
# NOTE(review): 0 now stands for "no age recorded" in an age column — confirm
# that downstream consumers expect this.
df_full = df_full.assign(
    ageMenopause=df_full["ageMenopause"].replace(to_replace=np.nan, value=0)
)
df_full["ageMenopause"].value_counts(dropna=False)


ageMenopause
0.0     303
50.0      6
48.0      5
51.0      4
46.0      2
Name: count, dtype: int64

In [21]:
## autresAntecedentsGynecoObstetriques: missing values (NaN) become the literal string "null"
df_full = df_full.assign(
    autresAntecedentsGynecoObstetriques=df_full[
        "autresAntecedentsGynecoObstetriques"
    ].replace(to_replace=np.nan, value="null")
)
df_full["autresAntecedentsGynecoObstetriques"].value_counts(dropna=False)

autresAntecedentsGynecoObstetriques
null                304
endométriose          8
fibromes              4
ménorragies           2
fibromes utérins      2
Name: count, dtype: int64

In [22]:
## alcoolSemaine: frequency of each value, missing values included (dropna=False)
df_full["alcoolSemaine"].value_counts(dropna=False)

alcoolSemaine
1    94
2    76
3    62
4    51
5    25
0     6
6     4
7     2
Name: count, dtype: int64

In [23]:
## tabacStatus: fold the bare yes/no answers into the smoker categories
# "non" (no) is a non-smoker; "oui" (yes) must map to "fumeur" (smoker),
# not to "non-fumeur". Other values (e.g. "ex-fumeur") are left as they are.
df_full["tabacStatus"] = df_full["tabacStatus"].replace(
    {"non": "non-fumeur", "oui": "fumeur"}
)
df_full["tabacStatus"].value_counts(dropna=False)


tabacStatus
non-fumeur    268
fumeur         32
ex-fumeur      20
Name: count, dtype: int64

In [24]:
## nbCigaretteParJour: missing values (NaN) are replaced with 0
df_full = df_full.assign(
    nbCigaretteParJour=df_full["nbCigaretteParJour"].replace(
        to_replace=np.nan, value=0
    )
)
df_full["nbCigaretteParJour"].value_counts(dropna=False)

nbCigaretteParJour
0.0     206
10.0     32
15.0     30
20.0     25
12.0     12
5.0      10
25.0      3
8.0       2
Name: count, dtype: int64

In [25]:
## exFumerDate: this column is removed from the table
df_full = df_full.drop(columns="exFumerDate")

In [26]:
## autreHabitudeToxique: frequency of each value, missing values included
# Disabled: this line would overwrite the column with nbCigaretteParJour values.
# df_full["autreHabitudeToxique"]=df_full["nbCigaretteParJour"].replace(np.nan,0)
df_full["autreHabitudeToxique"].value_counts(dropna=False)


autreHabitudeToxique
non    320
Name: count, dtype: int64

In [27]:
# Columns removed from the final table.
columns_to_drop = [
    "autreHabitudeToxique",
    "autresAntecedentsGynecoObstetriques",
    "autresAntecedentsFamiliaux",
    "contraceptionUtilisee",
]
df_full = df_full.drop(columns=columns_to_drop)
# Show the columns that remain.
df_full.columns

Index(['tailleCm', 'poidsKg', 'groupeSanguin', 'IMC', 'adresse', 'HTA',
       'diabete', 'dyslipidemie', 'nbGrossesse', 'nbEnfantsVivants',
       'nbMacrosomies', 'nbAvortements', 'nbMortNes', 'ageMenopause',
       'alcoolSemaine', 'tabacStatus', 'nbCigaretteParJour', 'drogue', 'Age',
       'sexe', 'Description', 'Ordonnance'],
      dtype='object')

In [28]:
# Save the cleaned table as CSV.
# NOTE(review): index= is not passed, so pandas' default applies and the row
# index should be written as an extra first column — confirm that is wanted.
df_full.to_csv("data_cleanOutput.csv")