Importation

In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import os

Pipeline de traitement pour le dataset 1

In [40]:
def merge_rows_with_wheat_shift(df):
    """
    Merge each consecutive pair of rows of ``df`` into a single row.

    Rows are consumed two at a time: the first row of a pair is used as the
    base ("numeric") row and the second as the "categorical" row whose
    'month', 'crop', 'soil' and 'city' values are copied onto the base row.

    When the categorical row holds the text 'wheat' (case/whitespace
    insensitive) in its 'longitude' cell, its values are first shifted one
    column to the right: soil -> city, crop -> soil, longitude -> crop.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'longitude', 'month', 'crop', 'soil', 'city'.

    Returns
    -------
    pandas.DataFrame
        One row per input pair, with the same columns as ``df``. If ``df``
        has an odd number of rows, the last row is appended unchanged.
    """
    categorical_cols = ['month', 'crop', 'soil', 'city']
    processed_rows = []
    n_rows = len(df)

    # Step through the frame two rows at a time; an unpaired last row is
    # handled after the loop.
    for start in range(0, n_rows - 1, 2):
        numeric_row = df.iloc[start].copy()
        categorical_row = df.iloc[start + 1].copy()

        # Only strings can be the misplaced 'wheat' label. Checking the type
        # first avoids AttributeError when the cell holds a real number.
        misplaced = categorical_row['longitude']
        if isinstance(misplaced, str) and misplaced.strip().lower() == 'wheat':
            # Shift the categorical values one column to the right.
            categorical_row['city'] = categorical_row['soil']
            categorical_row['soil'] = categorical_row['crop']
            categorical_row['crop'] = misplaced
            # The label now lives in 'crop'; fall back to the base row's longitude.
            categorical_row['longitude'] = numeric_row['longitude']

        # Copy every non-missing categorical value onto the base row.
        for col in categorical_cols:
            value = categorical_row[col]
            if pd.notna(value):
                # Strip text; values that are not strings are copied as-is.
                numeric_row[col] = value.strip() if isinstance(value, str) else value

        # A non-missing longitude on the categorical row takes precedence
        # (after a wheat shift this is the base row's own value).
        if pd.notna(categorical_row['longitude']):
            numeric_row['longitude'] = categorical_row['longitude']

        processed_rows.append(numeric_row)

    # An odd row count leaves one unpaired row at the end; keep it as-is.
    if n_rows % 2 == 1:
        processed_rows.append(df.iloc[-1])

    return pd.DataFrame(processed_rows, columns=df.columns)

def clean_categorical_values(df):
    """
    Normalise the text of the categorical columns in place.

    For each of 'month', 'crop', 'soil' and 'city' whose dtype is 'object',
    surrounding whitespace is stripped and the text is upper-cased. Columns
    with any other dtype are left untouched. The same ``df`` object is returned.
    """
    for name in ('month', 'crop', 'soil', 'city'):
        column = df[name]
        if column.dtype == 'object':
            stripped = column.str.strip()
            df[name] = stripped.str.upper()
    return df

def handle_missing_values(df):
    """
    Fill missing values in place and return the same frame.

    - Every column with a numeric dtype is filled with that column's median.
    - 'month', 'crop', 'soil' and 'city' are filled with the column's first
      mode, or with the literal "UNKNOWN" when the column has no mode
      (i.e. it contains no non-missing value).
    """
    # Numeric columns: median imputation.
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median())

    # Categorical columns: most frequent value, with a sentinel fallback.
    for col in ['month', 'crop', 'soil', 'city']:
        modes = df[col].mode()
        if modes.empty:
            replacement = "UNKNOWN"
        else:
            replacement = modes.iloc[0]
        df[col] = df[col].fillna(replacement)

    return df

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, treating a zero denominator as missing
    and filling every missing ratio with the median of the computed ratios."""
    ratio = numerator / denominator.replace(0, np.nan)
    return ratio.fillna(ratio.median())


def create_features(df):
    """
    Add derived feature columns to ``df`` in place and return it.

    Columns added
    -------------
    temp_ratio : 'Max Temp' / 'Min Temp'
    humidity_rad_ratio : 'Humidity' / 'Rad'
        For both ratios a zero denominator is treated as missing and missing
        ratios are replaced by the median ratio.
    season : season name looked up from 'month' (upper-case English month
        names). Any month that is not in the lookup table -- including the
        "UNKNOWN" placeholder written by handle_missing_values -- becomes
        "UNKNOWN" rather than NaN, so this step does not reintroduce missing
        values after imputation has already run.
    """
    df['temp_ratio'] = _safe_ratio(df['Max Temp'], df['Min Temp'])
    df['humidity_rad_ratio'] = _safe_ratio(df['Humidity'], df['Rad'])

    # Season mapping
    season_mapping = {
        'DECEMBER': 'WINTER', 'JANUARY': 'WINTER', 'FEBRUARY': 'WINTER',
        'MARCH': 'SPRING', 'APRIL': 'SPRING', 'MAY': 'SPRING',
        'JUNE': 'SUMMER', 'JULY': 'SUMMER', 'AUGUST': 'SUMMER',
        'SEPTEMBER': 'AUTUMN', 'OCTOBER': 'AUTUMN', 'NOVEMBER': 'AUTUMN'
    }
    # A callable is used instead of passing the dict to .map() so that
    # unmapped months fall back to the sentinel instead of NaN.
    df['season'] = df['month'].map(
        lambda month: season_mapping.get(month, 'UNKNOWN')
    )

    return df

def normalize_and_encode(df):
    """
    Return a scaled and label-encoded copy of ``df`` plus the fitted encoders.

    The input frame is not modified. On the copy, the listed numeric columns
    are rescaled with a MinMaxScaler fitted on this data, and each listed
    categorical column is replaced by LabelEncoder integer codes.

    Returns
    -------
    (df_encoded, encoders)
        ``encoders`` maps each categorical column name to the LabelEncoder
        fitted on it. The scaler is local to this function and is not returned.
    """
    numeric_cols = ['water req', 'Min Temp', 'Max Temp', 'Humidity', 'Wind',
                    'Sun', 'Rad', 'Rain', 'altitude', 'latitude', 'longitude',
                    'temp_ratio', 'humidity_rad_ratio']
    categorical_cols = ['month', 'crop', 'soil', 'city', 'season']

    df_encoded = df.copy()

    # Scale all numeric features together with a single scaler.
    scaled_values = MinMaxScaler().fit_transform(df_encoded[numeric_cols])
    df_encoded[numeric_cols] = scaled_values

    # One independent encoder per categorical column.
    encoders = {col: LabelEncoder() for col in categorical_cols}
    for col, encoder in encoders.items():
        df_encoded[col] = encoder.fit_transform(df_encoded[col])

    return df_encoded, encoders

def process_dataset(input_file, output_dir):
    """
    Run the full preprocessing pipeline on one CSV file.

    Steps: create ``output_dir`` if needed, read ``input_file``, merge row
    pairs, clean categorical text, fill missing values, add derived features,
    then produce a normalized/encoded copy.

    Two files are written into ``output_dir``:
    'dataset_preprocessed.csv' (before scaling/encoding) and
    'dataset_normalized.csv' (after).

    Returns
    -------
    (df, df_normalized, encoders)
        The preprocessed frame, the normalized frame and the dict of fitted
        encoders returned by normalize_and_encode.
    """
    # Make sure the destination exists before doing any work.
    os.makedirs(output_dir, exist_ok=True)

    # Each stage consumes the output of the previous one.
    merged = merge_rows_with_wheat_shift(pd.read_csv(input_file))
    cleaned = clean_categorical_values(merged)
    imputed = handle_missing_values(cleaned)
    df = create_features(imputed)

    # Persist the preprocessed but not yet normalized data.
    preprocessed_path = os.path.join(output_dir, 'dataset_preprocessed.csv')
    df.to_csv(preprocessed_path, index=False)

    # Scale / encode, then persist that version as well.
    df_normalized, encoders = normalize_and_encode(df)
    normalized_path = os.path.join(output_dir, 'dataset_normalized.csv')
    df_normalized.to_csv(normalized_path, index=False)

    return df, df_normalized, encoders

In [41]:
# Run the pipeline on dataset 1
if __name__ == "__main__":

    # Build the path from its components so the separator matches the host
    # OS; a literal backslash path only resolves on Windows.
    input_file = os.path.join("Datasets2", "dataset1", "data_plants.csv")

    output_dir = "Output"

    # process_dataset returns (preprocessed frame, normalized frame, encoders)
    df_raw, df_normalized, encoders = process_dataset(input_file, output_dir)

In [42]:
# Show the encoders dict returned by process_dataset: one LabelEncoder per
# encoded categorical column, keyed by column name.
print(encoders)

{'month': LabelEncoder(), 'crop': LabelEncoder(), 'soil': LabelEncoder(), 'city': LabelEncoder(), 'season': LabelEncoder()}
