<a href="https://colab.research.google.com/github/aboubacardiallo050/ODC/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import logging

# Arguments applied to every operator of the DAG. Only operator-level settings
# belong here: DAG-level options such as `catchup` are not read from
# default_args and have to be passed to the DAG constructor itself.
default_args = {
    'start_date': datetime(2023, 1, 1),
}

# Module-level logger used by the task callables.
logger = logging.getLogger(__name__)

# Seed shared by the train/test split and the model so that repeated runs of
# the DAG report the same accuracy.
RANDOM_STATE = 42

# Intermediate CSV files handed over from the split task to the training task.
# NOTE(review): a local directory only works when both tasks run on the same
# machine (or on a shared volume) - confirm against the executor in use.
DATA_DIR = '/tmp'
X_TRAIN_PATH = f'{DATA_DIR}/X_train.csv'
X_TEST_PATH = f'{DATA_DIR}/X_test.csv'
Y_TRAIN_PATH = f'{DATA_DIR}/y_train.csv'
Y_TEST_PATH = f'{DATA_DIR}/y_test.csv'


def load_and_split() -> None:
    """Load the Iris dataset, split it 80/20 and write the four parts to CSV.

    Features go to ``X_train.csv`` / ``X_test.csv`` and targets to
    ``y_train.csv`` / ``y_test.csv`` under ``DATA_DIR``. All files are written
    without the index column.
    """
    iris = load_iris(as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data,
        iris.target,
        test_size=0.2,
        random_state=RANDOM_STATE,
    )
    X_train.to_csv(X_TRAIN_PATH, index=False)
    X_test.to_csv(X_TEST_PATH, index=False)
    # Targets are Series; convert to single-column frames so the CSV layout
    # (one header line, one value per row) matches what the reader expects.
    y_train.to_frame().to_csv(Y_TRAIN_PATH, index=False)
    y_test.to_frame().to_csv(Y_TEST_PATH, index=False)


def train_and_evaluate() -> None:
    """Fit a random forest on the saved training split and log test accuracy.

    Reads the CSV files produced by ``load_and_split``, trains a
    ``RandomForestClassifier`` and logs the accuracy on the test split with
    four decimal places.
    """
    X_train = pd.read_csv(X_TRAIN_PATH)
    X_test = pd.read_csv(X_TEST_PATH)
    # The target files hold a single column; flatten it to a 1-D array.
    y_train = pd.read_csv(Y_TRAIN_PATH).to_numpy().ravel()
    y_test = pd.read_csv(Y_TEST_PATH).to_numpy().ravel()

    # Seeded so the reported accuracy is reproducible between runs.
    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    acc = accuracy_score(y_test, model.predict(X_test))
    logger.info('Accuracy: %.4f', acc)


with DAG(
    'iris_classification',
    schedule_interval=None,
    default_args=default_args,
    # `catchup` is a DAG argument; it has no effect inside default_args.
    catchup=False,
    description='Classification avec Iris et scikit-learn',
    tags=['ml'],
) as dag:

    load_task = PythonOperator(
        task_id='load_and_split_data',
        python_callable=load_and_split,
    )

    train_task = PythonOperator(
        task_id='train_model',
        python_callable=train_and_evaluate,
    )

    # Training consumes the files written by the split task.
    load_task >> train_task