# 1. Installation d'Apache Spark



In [5]:
# Installation et configuration de PySpark
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, sha2, concat_ws

# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("OMOP_Provider")
    .getOrCreate()
)




# 2. Chargement des données

In [6]:
# Read the IR_ACT_V and IR_SPE_V CSV exports into Spark DataFrames.
# The first row of each file supplies the column names and the column
# types are inferred from the data.
ir_act_v = spark.read.csv(
    "./ir_act_v (1).csv",
    header=True,
    inferSchema=True,
)
ir_spe_v = spark.read.csv(
    "./ir_spe_v (1).csv",
    header=True,
    inferSchema=True,
)

# Keep only the code column of each table and give both the same name,
# so the two single-column DataFrames can be stacked.
ir_act_v = ir_act_v.select("pfs_act_nat").withColumnRenamed(
    "pfs_act_nat", "specialty_source_value"
)
ir_spe_v = ir_spe_v.select("pfs_spe_cod").withColumnRenamed(
    "pfs_spe_cod", "specialty_source_value"
)

# Stack both sources and drop duplicate codes.
# NOTE(review): codes from the two tables share one column after this step;
# confirm that an identical code cannot mean different things in each source.
specialties = (
    ir_act_v
    .union(ir_spe_v)
    .distinct()
)

# 3. Transformation des données :

In [7]:
# provider_source_value is a plain copy of specialty_source_value.
source_value_expr = col("specialty_source_value")

# provider_id is the SHA-256 hex digest of provider_source_value cast to string.
# NOTE(review): this produces a string key; the OMOP CDM provider table is
# expected to use an integer provider_id — confirm what downstream loaders need.
provider_id_expr = sha2(col("provider_source_value").cast("STRING"), 256)

specialties = (
    specialties
    .withColumn("provider_source_value", source_value_expr)
    .withColumn("provider_id", provider_id_expr)
)

# Lookup table: source specialty code -> specialty_concept_id.
specialty_mapping_data = [
    (50, 38003810),   # pharmacist
    (26, 36682004),   # physiotherapist
    # NOTE(review): code 6 carried the same "general practitioner" label as
    # code 1 in the original comments — verify which specialty it really is.
    (6, 66862007),    # general practitioner (label to be confirmed)
    (1, 112247003),   # general practitioner
]

specialty_mapping_df = spark.createDataFrame(
    specialty_mapping_data,
    ["specialty_source_value", "specialty_concept_id"],
)

# Left join so that codes without a mapping are kept, with a null
# specialty_concept_id.
specialties = specialties.join(
    specialty_mapping_df,
    on="specialty_source_value",
    how="left",
)

# Persist the result as Parquet, replacing any previous output at that path.
specialties.write.mode("overwrite").parquet("/path/to/output/provider.parquet")

# Print the final table.
specialties.show()

+----------------------+---------------------+--------------------+--------------------+
|specialty_source_value|provider_source_value|         provider_id|specialty_concept_id|
+----------------------+---------------------+--------------------+--------------------+
|                    26|                   26|5f9c4ab08cac7457e...|            36682004|
|                    50|                   50|1a6562590ef19d104...|            38003810|
|                     6|                    6|e7f6c011776e8db7c...|            66862007|
|                     1|                    1|6b86b273ff34fce19...|           112247003|
+----------------------+---------------------+--------------------+--------------------+

