In [1]:

import pyspark
from pyspark.ml import clustering, evaluation
from pyspark.sql import SparkSession


In [2]:
import os

# Locations of the Spark and Java installations used by this notebook.
# setdefault keeps any SPARK_HOME / JAVA_HOME already exported by the
# environment (another machine, container, CI) and only falls back to these
# machine-specific paths when the variable is not set at all.
os.environ.setdefault('SPARK_HOME', "/root/spark")
os.environ.setdefault('JAVA_HOME', '/usr/lib/jvm/java-21-openjdk-amd64')

In [3]:
from pydantic import BaseModel
import yaml


class SparkSessionConfig(BaseModel):
    """Settings used to build the notebook's SparkSession."""

    # Application name handed to SparkSession.builder.appName().
    app_name: str
    # Value handed to SparkSession.builder.master() (e.g. 'local').
    deploy_mode: str
    # Value for the "spark.driver.memory" setting (e.g. '1g').
    driver_memory: str
    # Value for the "spark.executor.memory" setting (e.g. '2g').
    executor_memory: str
    
    
class PathsConfig(BaseModel):
    """Filesystem locations read and written by the notebook."""

    # Path of the input CSV that Preprocessor reads.
    data: str
    # Destination path where the fitted KMeans model is saved.
    model: str
    
    
class KMeansConfig(BaseModel):
    """Hyperparameters forwarded as keyword arguments to clustering.KMeans.

    The field names are passed through unchanged (via ``dict(config.kmeans)``),
    so they must match the KMeans keyword names exactly -- hence the camelCase
    ``maxIter``.
    """

    # Number of clusters.
    k: int
    # Iteration limit passed as KMeans(maxIter=...).
    maxIter: int
    # Random seed passed as KMeans(seed=...).
    seed: int

    
class Config(BaseModel):
    """Root configuration object built from the YAML file's top-level keys."""

    # Section with the SparkSession settings.
    spark: SparkSessionConfig
    # Section with input/output paths.
    paths: PathsConfig
    # Section with the KMeans hyperparameters.
    kmeans: KMeansConfig
    

def load_config(file_path: str) -> Config:
    """Read a YAML file and validate it into a ``Config`` instance.

    Args:
        file_path: Path to the YAML configuration file.

    Returns:
        The ``Config`` built from the file's top-level mapping.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
        TypeError: If the document is empty or its top level is not a mapping.
        pydantic.ValidationError: If a section or field is missing or has an
            incompatible value.
    """
    # Explicit encoding so parsing does not depend on the platform's locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        config_data = yaml.safe_load(file)

    # safe_load yields None for an empty document and may yield a list or a
    # scalar; unpacking those with ** fails with an opaque message, so report
    # the actual problem instead (same exception type as before).
    if not isinstance(config_data, dict):
        raise TypeError(
            f"Config file {file_path!r} must contain a top-level mapping, "
            f"got {type(config_data).__name__}"
        )
    return Config(**config_data)
    

# Single place to change which configuration file drives the notebook.
CONFIG_PATH = "../configs/config.yaml"

config = load_config(CONFIG_PATH)
print(config)

spark=SparkSessionConfig(app_name='my_kmeans', deploy_mode='local', driver_memory='1g', executor_memory='2g') paths=PathsConfig(data='../data/subset.csv', model='../model/my_kmeans') kmeans=KMeansConfig(k=10, maxIter=20, seed=42)


In [4]:
import findspark

findspark.init()

# Assemble the session step by step: name and master first, then the
# memory settings taken from the config.
session_builder = SparkSession.builder
session_builder = session_builder.appName(config.spark.app_name)
session_builder = session_builder.master(config.spark.deploy_mode)
for conf_key, conf_value in (
    ("spark.driver.memory", config.spark.driver_memory),
    ("spark.executor.memory", config.spark.executor_memory),
):
    session_builder = session_builder.config(conf_key, conf_value)

spark_session = session_builder.getOrCreate()

24/08/20 14:43:42 WARN Utils: Your hostname, fmrzlvsxry resolves to a loopback address: 127.0.1.1; using 31.128.42.197 instead (on interface eth0)
24/08/20 14:43:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/20 14:43:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
import pyspark.sql
from pyspark.ml.feature import StandardScaler, VectorAssembler



class Preprocessor:
    """Turns a CSV file into a DataFrame with assembled and scaled feature vectors."""

    def __init__(self, spark_session: pyspark.sql.SparkSession, data_path: str):
        """Store the session and the CSV path; no data is read here.

        Args:
            spark_session: Session used to read the CSV.
            data_path: Path of the CSV file to load.
        """
        self.spark_session = spark_session
        self.data_path = data_path
        self.df = None

    def load_data(self):
        """Read the CSV at ``self.data_path`` into ``self.df``.

        The first row is treated as the header and column types are inferred.
        """
        reader = self.spark_session.read
        self.df = reader.csv(self.data_path, header=True, inferSchema=True)

    def vectorize(self, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
        """Return ``df`` with a "features" column assembled from all of its columns."""
        assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
        return assembler.transform(df)

    def scale(self, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
        """Return ``df`` with a "scaled_features" column derived from "features".

        A StandardScaler with its default settings is fitted on ``df`` itself
        and then applied to it.
        """
        estimator = StandardScaler(inputCol="features", outputCol="scaled_features")
        fitted_scaler = estimator.fit(df)
        return fitted_scaler.transform(df)

    def create_df(self) -> pyspark.sql.DataFrame:
        """Run the whole pipeline: load, drop identifier columns, vectorize, scale.

        ``self.df`` is updated after every stage and the final value is returned.
        """
        self.load_data()
        # These two columns are removed before assembling the feature vector.
        self.df = self.df.drop('code', 'product_name')
        for stage in (self.vectorize, self.scale):
            self.df = stage(self.df)
        return self.df


# Build the clustering input from the CSV named in the config.
preprocessor = Preprocessor(spark_session=spark_session, data_path=config.paths.data)
df = preprocessor.create_df()

24/08/20 14:43:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [6]:
# Hyperparameters (k, maxIter, seed) come straight from the config section.
model_args = dict(config.kmeans)

# Fit on the standardized vectors; `model` ends up holding the fitted model.
model = clustering.KMeans(featuresCol='scaled_features', **model_args).fit(df)

24/08/20 14:43:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/20 14:43:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/08/20 14:43:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [7]:
# Assign each row to a cluster with the fitted model.
output = model.transform(df)

# Silhouette evaluator over the same scaled vectors the model was trained on.
evaluator = evaluation.ClusteringEvaluator(
    metricName="silhouette",
    distanceMeasure="squaredEuclidean",
    featuresCol='scaled_features',
    predictionCol="prediction",
)

output.show()

+----------+---------------+--------------+----------------+-----------+----------------+----------+------------+-------------+----------------+-----------+----------------+------------------+------------------+---------------+---------------+---------------+-----------------+------------------+-----------------------------------------------------+-----------------------+--------------------+--------------------+----------+
| created_t|last_modified_t|last_updated_t|serving_quantity|additives_n|nutriscore_score|nova_group|completeness| last_image_t|energy-kcal_100g|energy_100g|        fat_100g|saturated-fat_100g|carbohydrates_100g|    sugars_100g|     fiber_100g|  proteins_100g|        salt_100g|       sodium_100g|fruits-vegetables-nuts-estimate-from-ingredients_100g|nutrition-score-fr_100g|            features|     scaled_features|prediction|
+----------+---------------+--------------+----------------+-----------+----------------+----------+------------+-------------+----------------+

In [9]:
# Score the cluster assignments with the evaluator configured above.
silhouette_score = evaluator.evaluate(output)
print(silhouette_score)

0.20486647770752803


In [10]:
# Persist the fitted model, replacing anything already stored at that path.
model_writer = model.write().overwrite()
model_writer.save(config.paths.model)

                                                                                