In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

def optimize_kmeans(df, k_range=(2, 10), batch_size=100):
    """
    Sweep k for Mini-Batch K-means and report clustering quality metrics.

    One MiniBatchKMeans model (random_state=42, n_init="auto") is fitted per
    candidate k. The optimal k is the one with the lowest Davies-Bouldin score.

    Parameters:
    - df: DataFrame or array of features to cluster (passed as-is to scikit-learn)
    - k_range: tuple (k_min, k_max), range of k values to try (inclusive on both ends)
    - batch_size: int, mini-batch size forwarded to MiniBatchKMeans

    Returns:
    - dict with keys 'k_values', 'inertias', 'silhouette_scores' and
      'davies_bouldin_scores' (parallel lists, one entry per k tried) and
      'optimal_k' (the k with the lowest Davies-Bouldin score)

    Raises:
    - ValueError: if k_range does not satisfy 2 <= k_min <= k_max
    """
    # Index rather than unpack so longer sequences keep working as before.
    k_min, k_max = k_range[0], k_range[1]

    # Silhouette and Davies-Bouldin scores are undefined for a single cluster,
    # and an empty sweep would otherwise fail later inside np.argmin.
    if k_min < 2 or k_max < k_min:
        raise ValueError(
            f"k_range must satisfy 2 <= k_min <= k_max, got {k_range!r}"
        )

    # Parallel lists: index i of every list describes the model fitted with k_values[i]
    k_values = []
    inertias = []
    silhouette_scores = []
    davies_bouldin_scores = []

    # "+ 1" makes the upper bound inclusive, as documented
    for k in range(k_min, k_max + 1):
        # Fixed random_state keeps the sweep reproducible between runs
        kmeans = MiniBatchKMeans(
            n_clusters=k,
            batch_size=batch_size,
            random_state=42,
            n_init="auto",
        ).fit(df)
        labels = kmeans.labels_

        k_values.append(k)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(df, labels))
        davies_bouldin_scores.append(davies_bouldin_score(df, labels))

    # Lower inertia and Davies-Bouldin score is better. Higher silhouette score is better.
    # Change this selection if you prioritize a different metric.
    optimal_k = k_values[int(np.argmin(davies_bouldin_scores))]

    return {
        'k_values': k_values,
        'inertias': inertias,
        'silhouette_scores': silhouette_scores,
        'davies_bouldin_scores': davies_bouldin_scores,
        'optimal_k': optimal_k,
    }

In [2]:
import pandas as pd
from functions import find_csv_files, load_csvs_to_dict, sort_and_classify_column, transform_to_days

# Directory handed to find_csv_files below.
# NOTE(review): absolute, machine-specific path — the notebook will not run on another
# machine as-is; consider a path relative to the notebook or a configurable setting.
DATA_PATH = "/Users/typhaine/Documents/Doc_Gorilla/OpenClassroom--Machine-Learning-Engineer/P4/data"

# find_csv_files comes from the local `functions` module; presumably it returns the
# paths of the CSV files found under DATA_PATH — confirm against that module.
csv_files = find_csv_files(DATA_PATH)

from typing import List
import pandas as pd
import os

def csv_to_gzip_pandas_and_delete(csv_files: List[str]) -> None:
    """
    Compress each CSV file to "<path>.gz" and delete the original CSV file.

    Despite the function name, the files are not parsed with pandas: the bytes
    are streamed through gzip unchanged. Round-tripping through a DataFrame
    before deleting the original is lossy (type inference drops leading zeros,
    rewrites NA-like strings, etc.) and loads every file fully into memory.

    Parameters:
        csv_files (List[str]): List of paths to the CSV files to be converted.

    Returns:
        None: The function performs file operations and does not return any value.

    Raises:
        OSError: if a file cannot be read, written or deleted. The original is
            only removed after its gzip copy has been written and closed.
    """
    # Function-scope imports keep this definition self-contained within its cell.
    import gzip
    import shutil

    for csv_file in csv_files:
        # Define the gzipped filename based on the original csv filename
        gzip_file = f"{csv_file}.gz"

        # Stream the raw bytes so the archive holds an exact copy of the CSV
        with open(csv_file, "rb") as source, gzip.open(gzip_file, "wb") as target:
            shutil.copyfileobj(source, target)

        # Reached only if the archive was written and closed without error
        os.remove(csv_file)

        print(f"Converted {csv_file} to {gzip_file} and deleted the original file.")

# Example usage with an explicit list of paths:
# csv_files = ['file1.csv', 'file2.csv']
# WARNING: destructive — every path in csv_files is replaced by "<path>.gz" and the
# original CSV is deleted, so this cell cannot be re-run on the same inputs.
csv_to_gzip_pandas_and_delete(csv_files)

Converted data/olist_sellers_dataset.csv to data/olist_sellers_dataset.csv.gz and deleted the original file.
Converted data/product_category_name_translation.csv to data/product_category_name_translation.csv.gz and deleted the original file.
Converted data/olist_orders_dataset.csv to data/olist_orders_dataset.csv.gz and deleted the original file.
Converted data/olist_order_items_dataset.csv to data/olist_order_items_dataset.csv.gz and deleted the original file.
Converted data/olist_customers_dataset.csv to data/olist_customers_dataset.csv.gz and deleted the original file.
Converted data/olist_geolocation_dataset.csv to data/olist_geolocation_dataset.csv.gz and deleted the original file.
Converted data/olist_order_payments_dataset.csv to data/olist_order_payments_dataset.csv.gz and deleted the original file.
Converted data/olist_order_reviews_dataset.csv to data/olist_order_reviews_dataset.csv.gz and deleted the original file.
Converted data/olist_products_dataset.csv to data/olist_prod

In [2]:
# Load the table that is passed to optimize_kmeans further down.
# NOTE(review): no cell shown here creates ./tmp_df.csv, and this cell shares the
# execution count In [2] with an earlier cell — confirm where the file comes from
# and that the notebook survives Restart Kernel -> Run All.
df = pd.read_csv("./tmp_df.csv")



In [3]:
# Run the k sweep with the function's default k_range and batch_size; the returned
# metrics dict is the cell's last expression.
# .values hands the frame over as a NumPy array — assumes every column of
# tmp_df.csv is numeric; TODO confirm.
optimize_kmeans(df.values)

: 