In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import split, col

import os
os.environ['PYSPARK_SUBMIT_ARGS'] = "--conf spark.driver.memory=2g pyspark-shell"
import findspark
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("doc_processor") \
    .getOrCreate()
sc = spark.sparkContext
def tfidf_embeddings(chunks_df):
    # Tokeniser le texte en mots
    tokenized = chunks_df.withColumn("tokens", split(col("text"), " "))
    
    # TF - Term Frequency
    cv = CountVectorizer(inputCol="tokens", outputCol="raw_features")
    cv_model = cv.fit(tokenized)
    featurized = cv_model.transform(tokenized)
    
    # IDF - Inverse Document Frequency
    idf = IDF(inputCol="raw_features", outputCol="tfidf_embedding")
    idf_model = idf.fit(featurized)
    tfidf_df = idf_model.transform(featurized)
    
    return tfidf_df.select("file_name", "chunk_id", "text", "tfidf_embedding")
    
def process_tfidf_embeddings():
    # Charger les chunks (fichier Parquet contenant : file_name, chunk_id, text)
    chunks_df = spark.read.text("chunked_text.txt")
    
    # Appliquer TF-IDF
    embedded = tfidf_embeddings(chunks_df)
    
    # Sauvegarder le résultat
    output_path = "embedded_chunks_tfidf"
    embedded.write.mode("overwrite").parquet(output_path)
    print(f"TF-IDF embeddings saved to {output_path}")

    return embedded

# Lancer l'embedding TF-IDF
process_tfidf_embeddings()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `text` cannot be resolved. Did you mean one of the following? [`value`].;
'Project [value#20, split('text,  , -1) AS tokens#22]
+- Relation [value#20] text


In [16]:
from pyspark.sql import SparkSession

# Initialiser Spark
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("test_lecture") \
    .getOrCreate()

# 1. Méthode 1: Si vous avez des fichiers Parquet
try:
    chunks_df = spark.read.text("chunked_text.txt")
    print("Lecture depuis Parquet réussie")
except:
    print("Échec de lecture depuis Parquet, essayez la méthode 2")

# Afficher le schéma
print("\nSchéma du DataFrame:")
chunks_df.printSchema()

# Afficher les 5 premières lignes
print("\nAperçu des données:")
chunks_df.show(5, truncate=50)

# Vérifier les colonnes disponibles
print("\nColonnes disponibles:")
print(chunks_df.columns)

# Vérification spécifique pour TF-IDF
required_columns = {'file_name', 'chunk_id', 'text'}
if not required_columns.issubset(chunks_df.columns):
    print("\nATTENTION: Colonnes manquantes pour TF-IDF")
    print(f"Colonnes requises: {required_columns}")
    print(f"Colonnes disponibles: {chunks_df.columns}")
else:
    print("\nLe DataFrame contient toutes les colonnes nécessaires pour TF-IDF")

Lecture depuis Parquet réussie

Schéma du DataFrame:
root
 |-- value: string (nullable = true)


Aperçu des données:
+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|                           file_name,chunk_id,text|
|0-prelim.pdf,0,"infoh515 big data distributed d...|
|Stream Algos.pdf,0,"chapter 4 mining data strea...|
|Stream Algos.pdf,1,"low would stress modern tec...|
|Stream Algos.pdf,2,"simple summary maximum stre...|
+--------------------------------------------------+
only showing top 5 rows


Colonnes disponibles:
['value']

ATTENTION: Colonnes manquantes pour TF-IDF
Colonnes requises: {'file_name', 'text', 'chunk_id'}
Colonnes disponibles: ['value']
