In [None]:
import kfp
from kfp.v2 import dsl
from kfp.v2.dsl import (
    component,
    Input,
    Output,
    Artifact,
    Dataset,
    ClassificationMetrics,
    Metrics,
    HTML,
    Markdown
)

# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2'],
    base_image="python:3.9"
)
def preparation_data_for_clusturing():
    """Clean the raw insurance CSV and save the table used for clustering.

    Reads ``data/unsupervised-lerning-data-insurance.csv`` (``;``-separated),
    drops the listed columns, removes duplicate rows and rows containing
    missing values, encodes ``Type_fuel`` as an integer and writes the result
    to ``data/preparation_data_for_clusturing.csv``.

    NOTE(review): the relative ``data/`` directory must be visible to every
    pipeline step - confirm a shared volume is mounted in the step containers.
    """
    import pandas as pd

    insurance = pd.read_csv('data/unsupervised-lerning-data-insurance.csv', sep=';')

    columns_to_drop = [
        'ID', 'Date_start_contract', 'Date_last_renewal', 'Date_next_renewal',
        'Date_birth', 'Date_driving_licence', 'Distribution_channel',
        'Policies_in_force', 'Max_policies', 'Max_products', 'Date_lapse',
        'Lapse', 'Payment', 'Premium', 'Cost_claims_year', 'N_claims_year',
        'N_claims_history', 'R_Claims_history', 'Area', 'Second_driver',
        'Year_matriculation', 'Length', 'Type_risk',
    ]
    # `columns=` already selects the axis; no extra `axis=1` is needed.
    insurance = insurance.drop(columns=columns_to_drop)
    insurance = insurance.drop_duplicates()
    insurance = insurance.dropna()

    fuel_codes = {'P': 1, 'D': 2}
    # NOTE(review): any Type_fuel value other than 'P' or 'D' is mapped to NaN
    # here, after dropna() has already run - confirm no other codes occur.
    insurance['Type_fuel'] = insurance['Type_fuel'].map(fuel_codes)

    insurance.to_csv('data/preparation_data_for_clusturing.csv', index=False)
    print("valide")
    
    
# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2', 'scikit-learn==1.3.2'],
    base_image="python:3.9"
)
def clusturing_and_save_supervised_data():
    """Label every row with a K-Means cluster and save the labelled table.

    Reads ``data/preparation_data_for_clusturing.csv``, standardises the
    numeric vehicle/customer columns, fits K-Means with three clusters and
    stores the predicted label in a new ``Cluster`` column. The result is
    written to ``data/unbalanced_supervised-lerning-data-insurance.csv``.
    """
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    k_value = 3
    feature_columns = [
        'Seniority', 'Power', 'Cylinder_capacity',
        'Value_vehicle', 'N_doors', 'Weight',
    ]

    insurance = pd.read_csv('data/preparation_data_for_clusturing.csv')

    # Only the scaled copy is used for fitting; the saved table keeps the
    # original (unscaled) feature values plus the cluster label.
    scaled_features = StandardScaler().fit_transform(insurance[feature_columns])

    kmeans = KMeans(
        n_clusters=k_value,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,  # fixed seed for reproducible cluster assignments
    )
    insurance['Cluster'] = kmeans.fit_predict(scaled_features)

    insurance.to_csv('data/unbalanced_supervised-lerning-data-insurance.csv', index=False)
    print("valide")
    
# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2', 'scikit-learn==1.3.2'],
    base_image="python:3.9"
)
def unbalanced_data_procissing_to_balanced():
    """Down-sample every cluster to the size of the smallest one.

    Reads ``data/unbalanced_supervised-lerning-data-insurance.csv``, draws
    (without replacement) as many rows from each ``Cluster`` group as the
    smallest group contains, and writes the concatenated result to
    ``data/balanced_supervised-lerning-data-insurance.csv``.
    """
    import pandas as pd
    from sklearn.utils import resample

    insurance = pd.read_csv('data/unbalanced_supervised-lerning-data-insurance.csv')

    cluster_sizes = insurance['Cluster'].value_counts()
    print(cluster_sizes)
    min_observation_count = cluster_sizes.min()

    # One equally sized sample per cluster; the fixed seed keeps the
    # selection reproducible between runs.
    balanced_dfs = [
        resample(group, replace=False, n_samples=min_observation_count, random_state=42)
        for _, group in insurance.groupby('Cluster')
    ]
    balanced_insurance = pd.concat(balanced_dfs, ignore_index=True)

    print(balanced_insurance['Cluster'].value_counts())
    print(balanced_insurance.shape)
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would only add a stray "None" line to the log.
    balanced_insurance.info()

    balanced_insurance.to_csv('data/balanced_supervised-lerning-data-insurance.csv', index=False)

# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2', 'scikit-learn==1.3.2'],
    base_image="python:3.9"
)
def train_test_split():
    """Split the balanced data 70/30, MinMax-scale it and persist the pieces.

    Reads ``data/balanced_supervised-lerning-data-insurance.csv`` and uses
    ``Cluster`` as the target. The scaler is fitted on the training features
    only and then applied to both splits. The ALREADY SCALED feature arrays,
    the targets and the fitted scaler are written to ``data/`` as
    ``X_train.npy``, ``X_test.npy``, ``y_train.npy``, ``y_test.npy`` and
    ``minmaxscaler.pkl``.
    """
    # Aliased so the sklearn helper is not confused with this component,
    # which carries the same name.
    from sklearn.model_selection import train_test_split as split_arrays
    from sklearn.preprocessing import MinMaxScaler
    import numpy as np
    import pandas as pd
    import pickle

    balanced_insurance = pd.read_csv('data/balanced_supervised-lerning-data-insurance.csv')
    X = balanced_insurance.drop('Cluster', axis=1)  # every column except the target
    y = balanced_insurance['Cluster']

    X_train, X_test, y_train, y_test = split_arrays(X, y, test_size=0.3, random_state=42)

    # Fit on the training split only so no test-set statistics leak in.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    np.save('data/y_train.npy', y_train)
    np.save('data/y_test.npy', y_test)
    np.save('data/X_train.npy', X_train)
    np.save('data/X_test.npy', X_test)
    with open('data/minmaxscaler.pkl', 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    # Dump every split to the step log for inspection.
    for title, values in (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    ):
        print(f"\n---- {title} ----")
        print("\n")
        print(values)

# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2', 'scikit-learn==1.3.2'],
    base_image="python:3.9"
)
def evaluate_models_level0():
    """Train and score a set of candidate base classifiers.

    Loads the train/test arrays written by the ``train_test_split`` step,
    fits each candidate model with default hyper-parameters and prints
    accuracy, weighted precision/recall/F1 and the confusion matrix for it.
    A failure in one model is reported and does not stop the others.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import confusion_matrix
    import numpy as np

    models = {
        'Logistic Regression': LogisticRegression(),
        'Naive Bayes': GaussianNB(),
        'Support Vector Machine': SVC(),
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Bagging': BaggingClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        # NOTE(review): this is the single-tree ExtraTreeClassifier from
        # sklearn.tree, not the ExtraTreesClassifier ensemble - confirm intent.
        'Extra Trees': ExtraTreeClassifier(),
    }

    y_train = np.load('data/y_train.npy', allow_pickle=True)
    y_test = np.load('data/y_test.npy', allow_pickle=True)
    X_train = np.load('data/X_train.npy', allow_pickle=True)
    X_test = np.load('data/X_test.npy', allow_pickle=True)

    # The feature arrays were saved AFTER MinMax scaling in the
    # train_test_split step; applying the saved scaler again here would
    # scale them twice, so they are used as loaded.
    print(X_train)

    for name, model in models.items():
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
            recall = recall_score(y_test, y_pred, average='weighted')
            f1 = f1_score(y_test, y_pred, average='weighted')

            confusion_mat = confusion_matrix(y_test, y_pred)

            print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
            print(f"Confusion Matrix:\n{confusion_mat}\n{'-'*50}")
        except Exception as e:
            # Deliberate best-effort: keep comparing the remaining models.
            print(f"Error occurred for {name}: {e}")


            
# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2', 'scikit-learn==1.3.2'],
    base_image="python:3.9"
)
def train_model_staking():
    """Fit the stacking classifier and store the model and its predictions.

    Loads the train/test arrays written by the ``train_test_split`` step,
    fits a ``StackingClassifier`` (five base estimators, logistic-regression
    meta-learner, 5-fold CV), pickles it to ``data/model.pkl`` and saves the
    test-set predictions to ``data/y_pred.npy``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import StackingClassifier
    import numpy as np
    import pickle

    y_train = np.load('data/y_train.npy', allow_pickle=True)
    X_train = np.load('data/X_train.npy', allow_pickle=True)
    X_test = np.load('data/X_test.npy', allow_pickle=True)

    # The feature arrays were saved AFTER MinMax scaling in the
    # train_test_split step; re-applying the saved scaler here would scale
    # them twice and train the model on a distribution that a single pass
    # through the saved scaler can never reproduce.

    # Level-0 (base) estimators.
    level0 = [
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Support Vector Machine', SVC()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('Random Forest', RandomForestClassifier()),
    ]

    # Level-1 (meta) estimator combining the base predictions.
    level1 = LogisticRegression(max_iter=1000)

    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    model.fit(X_train, y_train)

    with open('data/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    # Persist the test-set predictions for the evaluation step.
    y_pred = model.predict(X_test)
    np.save('data/y_pred.npy', y_pred)
    print(y_pred)
    

# pandas 2.1.x and numpy 1.26.x are only released for Python >= 3.9, so the
# pinned versions below cannot be installed on a python:3.8 image.
@component(
    packages_to_install=['pandas == 2.1.3', 'numpy == 1.26.2', 'scikit-learn==1.3.2'],
    base_image="python:3.9"
)
def evaluate_model_staking():
    """Score the stacking classifier's saved test-set predictions.

    Loads ``data/y_test.npy`` and ``data/y_pred.npy`` and prints accuracy,
    weighted precision/recall/F1 and the confusion matrix. The model itself
    is not needed here because the predictions were already stored by the
    training step.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import confusion_matrix
    import numpy as np

    y_test = np.load('data/y_test.npy', allow_pickle=True)
    y_pred = np.load('data/y_pred.npy', allow_pickle=True)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Confusion matrix of true vs. predicted cluster labels.
    matrix = confusion_matrix(y_test, y_pred)

    print(f"Stacking Classifier - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    print(matrix)
    

    
# Pipeline definition: runs the seven components strictly one after another
# and disables result caching for each of them.
@dsl.pipeline(
   name='Insurance Recomendation deploy in kubflow',
   description='ML pipeline  de System recomendation'
)
# NOTE(review): `data_path` is declared as a pipeline parameter but no task
# below consumes it.
def systeme_recomendation_pipeline(data_path: str):
    # Components in execution order.
    step_factories = (
        preparation_data_for_clusturing,
        clusturing_and_save_supervised_data,
        unbalanced_data_procissing_to_balanced,
        train_test_split,
        evaluate_models_level0,
        train_model_staking,
        evaluate_model_staking,
    )

    # Instantiate each step and make it wait for the previous one.
    tasks = []
    for make_step in step_factories:
        task = make_step()
        if tasks:
            task.after(tasks[-1])
        tasks.append(task)

    # "P0D" = zero-day staleness: a cached result is never reused.
    for task in tasks:
        task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    

In [None]:
# Compile the pipeline function into a package file that can be submitted
# to the Kubeflow Pipelines API.
kfp.compiler.Compiler().compile(
    package_path='system_recomendation_insurance_pipeline1.yaml',
    pipeline_func=systeme_recomendation_pipeline,
)

In [None]:


# Connect to the Kubeflow Pipelines API endpoint.
client = kfp.Client(host='http://10.64.140.43.nip.io:9999')

# Create the experiment BEFORE submitting, and submit the run into it;
# creating it afterwards leaves the run outside this experiment.
experiment = client.create_experiment(name='framer1')

run = client.create_run_from_pipeline_package(
    'system_recomendation_insurance_pipeline1.yaml',
    arguments={
        # The pipeline's only parameter is `data_path` (no default), so it
        # must be supplied; 'recipient' is not a parameter of this pipeline.
        # NOTE(review): 'data' mirrors the directory hard-coded in the
        # components - confirm the intended value.
        'data_path': 'data',
    },
    experiment_name='framer1',
)