# Making Data


In [35]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Initialize Faker
fake = Faker()

# Define some constants and helper functions for generating synthetic data
GENDER_CHOICES = ['male', 'female']
BLOOD_GROUPS = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']
TOBACCO_STATUS = ['current smoker', 'ex-smoker', 'never smoked']

# Define symptoms, diagnoses, and prescriptions
symptoms = ["cough", "fever", "sore throat", "headache", "nausea", "joint pain",
            "shortness of breath", "abdominal pain", "rash", "dizziness", "blurred vision",
            "chest pain", "fatigue", "muscle pain", "swelling"]
diagnoses = {
    "acute bronchitis": ["cough", "fever"],
    "mild asthma": ["shortness of breath", "wheezing"],
    "severe headache": ["headache", "nausea"],
    "joint pain": ["joint pain", "swelling"],
    "chronic back pain": ["back pain", "fatigue"],
    "skin infection": ["rash", "fever"],
    "stomach flu": ["nausea", "abdominal pain"],
    "high blood pressure": ["headache", "dizziness"],
    "migraine": ["headache", "blurred vision"],
    "allergic reaction": ["rash", "shortness of breath"]
}
prescriptions = {
    "acute bronchitis": ["Cough syrup", "Rest"],
    "mild asthma": ["Inhaler", "Bronchodilator"],
    "severe headache": ["Pain relievers", "Hydration"],
    "joint pain": ["Anti-inflammatory drugs", "Physical therapy"],
    "chronic back pain": ["Painkillers", "Physical therapy"],
    "skin infection": ["Antibiotics", "Hydration"],
    "stomach flu": ["Antiemetics", "Hydration"],
    "high blood pressure": ["Antihypertensives", "Lifestyle changes"],
    "migraine": ["Pain relievers", "Rest"],
    "allergic reaction": ["Antihistamines", "Epinephrine"]
}


[nltk_data] Downloading package punkt to C:\Users\Aziz
[nltk_data]     Hlila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aziz
[nltk_data]     Hlila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
# Function to generate a diagnostic and corresponding prescription
def generate_diagnostic():
    diagnosis = random.choice(list(diagnoses.keys()))
    symptoms_list = diagnoses[diagnosis]
    diagnostic_text = f"Patient shows symptoms of {symptoms_list[0]} and {symptoms_list[1]}. Diagnosed with {diagnosis}."
    prescription = prescriptions[diagnosis]
    return diagnostic_text, prescription

# Function to generate a single row of synthetic data
def generate_synthetic_row():
    diagnostic, prescription = generate_diagnostic()
    return {
        "tailleCm": random.randint(150, 200),
        "poidsKg": random.randint(50, 120),
        "groupeSanguin": random.choice(BLOOD_GROUPS),
        "IMC": round(random.uniform(18.5, 30.0), 2),
        "age": random.randint(18, 80),
        "sexe": random.choice(GENDER_CHOICES),
        "HTA": random.choice([True, False]),
        "diabete": random.choice([True, False]),
        "dyslipidemie": random.choice([True, False]),
        "autresAntecedentsFamiliaux": fake.text(max_nb_chars=20),
        "nbGrossesse": random.randint(0, 10),
        "nbEnfantsVivants": random.randint(0, 10),
        "nbMacrosomies": random.randint(0, 10),
        "nbAvortements": random.randint(0, 10),
        "nbMortNes": random.randint(0, 10),
        "contraceptionUtilisee": random.choice([True, False]),
        "ageMenopause": random.randint(45, 55) if random.choice([True, False]) else None,
        "autresAntecedentsGynecoObstetriques": fake.text(max_nb_chars=20),
        "alcoolSemaine": random.randint(0, 10),
        "tabacStatus": random.choice(TOBACCO_STATUS),
        "nbCigaretteParJour": random.randint(0, 40),
        "drogue": random.choice([True, False]),
        "autreHabitudeToxique": fake.text(max_nb_chars=20),
        "diagnostic": diagnostic,
        "prescription": prescription
    }


In [60]:
# Function to generate a DataFrame of synthetic data
def generate_synthetic_data(num_samples):
    data = [generate_synthetic_row() for _ in range(num_samples)]
    return pd.DataFrame(data)

# Generate synthetic data
num_samples = 1000  # Adjust as needed
synthetic_data = generate_synthetic_data(num_samples)


In [61]:
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [62]:
synthetic_data['clean_autresAntecedentsGynecoObstetriques'] = synthetic_data['autresAntecedentsGynecoObstetriques'].apply(clean_text)
synthetic_data['clean_autreHabitudeToxique'] = synthetic_data['autreHabitudeToxique'].apply(clean_text)
synthetic_data['clean_autresAntecedentsFamiliaux'] = synthetic_data['autresAntecedentsFamiliaux'].apply(clean_text)
synthetic_data['clean_diagnostic'] = synthetic_data['diagnostic'].apply(clean_text)


In [63]:
vectorizerAG = TfidfVectorizer()
X_text = vectorizerAG.fit_transform(synthetic_data['clean_autresAntecedentsGynecoObstetriques'])
vectorizerHT = TfidfVectorizer()
X_text = vectorizerHT.fit_transform(synthetic_data['clean_autreHabitudeToxique'])
vectorizerAF = TfidfVectorizer()
X_text = vectorizerAF.fit_transform(synthetic_data['clean_autresAntecedentsFamiliaux'])
vectorizerDIG = TfidfVectorizer()
X_text = vectorizerDIG.fit_transform(synthetic_data['clean_diagnostic'])


In [64]:
synthetic_data['sexe'] = synthetic_data['sexe'].map({'male': 0, 'female': 1})
synthetic_data = pd.get_dummies(synthetic_data, columns=['groupeSanguin', 'tabacStatus'], drop_first=True)
synthetic_data

Unnamed: 0,tailleCm,poidsKg,IMC,age,sexe,HTA,diabete,dyslipidemie,autresAntecedentsFamiliaux,nbGrossesse,...,clean_diagnostic,groupeSanguin_A-,groupeSanguin_AB+,groupeSanguin_AB-,groupeSanguin_B+,groupeSanguin_B-,groupeSanguin_O+,groupeSanguin_O-,tabacStatus_ex-smoker,tabacStatus_never smoked
0,178,88,23.58,74,0,True,False,True,Summer resource.,4,...,patient shows symptoms cough fever diagnosed a...,True,False,False,False,False,False,False,False,False
1,190,79,29.33,52,1,True,True,True,Should young child.,3,...,patient shows symptoms rash shortness breath d...,True,False,False,False,False,False,False,False,True
2,200,62,18.64,62,1,True,False,False,Bank fill strategy.,6,...,patient shows symptoms back pain fatigue diagn...,False,True,False,False,False,False,False,True,False
3,176,65,29.38,67,1,False,True,False,Current goal gun.,0,...,patient shows symptoms headache nausea diagnos...,False,False,True,False,False,False,False,True,False
4,168,100,28.39,78,0,True,False,True,Couple exist leader.,5,...,patient shows symptoms shortness breath wheezi...,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,167,53,20.47,44,0,False,True,False,Amount couple.,4,...,patient shows symptoms nausea abdominal pain d...,False,True,False,False,False,False,False,False,True
996,176,94,28.27,55,0,True,True,False,Or national charge.,2,...,patient shows symptoms rash shortness breath d...,False,False,False,True,False,False,False,False,True
997,192,82,28.72,76,0,False,False,True,Could century claim.,1,...,patient shows symptoms cough fever diagnosed a...,False,True,False,False,False,False,False,True,False
998,185,88,29.45,38,1,False,False,True,Inside meet person.,7,...,patient shows symptoms cough fever diagnosed a...,False,False,False,False,False,False,False,False,True


In [65]:
patient_features = synthetic_data.drop(columns=['diagnostic', 'prescription', 'clean_diagnostic',"clean_autresAntecedentsGynecoObstetriques","autresAntecedentsGynecoObstetriques","clean_autreHabitudeToxique","autreHabitudeToxique","clean_autresAntecedentsFamiliaux","autresAntecedentsFamiliaux"])
patient_features

Unnamed: 0,tailleCm,poidsKg,IMC,age,sexe,HTA,diabete,dyslipidemie,nbGrossesse,nbEnfantsVivants,...,drogue,groupeSanguin_A-,groupeSanguin_AB+,groupeSanguin_AB-,groupeSanguin_B+,groupeSanguin_B-,groupeSanguin_O+,groupeSanguin_O-,tabacStatus_ex-smoker,tabacStatus_never smoked
0,178,88,23.58,74,0,True,False,True,4,10,...,True,True,False,False,False,False,False,False,False,False
1,190,79,29.33,52,1,True,True,True,3,0,...,False,True,False,False,False,False,False,False,False,True
2,200,62,18.64,62,1,True,False,False,6,9,...,True,False,True,False,False,False,False,False,True,False
3,176,65,29.38,67,1,False,True,False,0,2,...,False,False,False,True,False,False,False,False,True,False
4,168,100,28.39,78,0,True,False,True,5,8,...,False,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,167,53,20.47,44,0,False,True,False,4,3,...,False,False,True,False,False,False,False,False,False,True
996,176,94,28.27,55,0,True,True,False,2,9,...,True,False,False,False,True,False,False,False,False,True
997,192,82,28.72,76,0,False,False,True,1,0,...,False,False,True,False,False,False,False,False,True,False
998,185,88,29.45,38,1,False,False,True,7,10,...,True,False,False,False,False,False,False,False,False,True


In [66]:


# List of numeric and categorical features
numeric_features = ['tailleCm', 'poidsKg', 'IMC', 'nbGrossesse', 'nbEnfantsVivants',
                    'nbMacrosomies', 'nbAvortements', 'nbMortNes', 'ageMenopause',
                    'alcoolSemaine', 'nbCigaretteParJour', 'Age']
categorical_features = ['groupeSanguin', 'HTA', 'diabete', 'dyslipidemie',
                        'tabacStatus',
                        'drogue',]


# Preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply preprocessing to numeric and categorical features
X_preprocessed = preprocessor.fit_transform(X)
print("before ",X.shape,"after ",X_preprocessed.shape)
X_combined = np.hstack((X_preprocessed, descriptionsTokenizedPadded))
print(X_combined.shape)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=13)


In [68]:
X_combined = np.hstack((X_text.toarray(), X_patient))

# Train a k-NN model on the combined features
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X_combined)

ValueError: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values