Importation

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import os

Pipeline de traitement pour dataset 1

In [None]:
# Define the processing pipeline 1
def merge_rows_with_wheat_shift(df):
    """
    Merge each consecutive pair of rows into a single record.

    The frame is read two rows at a time. The first row of a pair is the base
    record; the non-missing 'month', 'crop', 'soil' and 'city' values of the
    second row are copied onto it (strings are stripped of surrounding
    whitespace, other values are copied unchanged).

    When the second row holds the text 'wheat' (any case, surrounding
    whitespace ignored) in its 'longitude' column, its values are first
    shifted one column to the right: 'city' takes the old 'soil', 'soil'
    takes the old 'crop', and 'crop' becomes 'WHEAT'. The first row's own
    longitude is then kept. Otherwise a non-missing longitude on the second
    row replaces the longitude of the first row.

    Args:
        df: DataFrame with at least the columns 'longitude', 'month',
            'crop', 'soil' and 'city'.

    Returns:
        A new DataFrame with one row per pair, plus the unpaired last row
        when the input has an odd number of rows. 'longitude' is converted
        to numeric; unparseable or missing values are replaced by the mean
        of the column.
    """
    categorical_cols = ['month', 'crop', 'soil', 'city']
    processed_rows = []

    # Index of the first row that is not part of a complete pair.
    paired_end = len(df) - len(df) % 2

    for i in range(0, paired_end, 2):
        numeric_row = df.iloc[i].copy()
        categorical_row = df.iloc[i + 1].copy()

        second_longitude = categorical_row['longitude']
        is_wheat_row = (
            pd.notna(second_longitude)
            and str(second_longitude).strip().lower() == 'wheat'
        )

        if is_wheat_row:
            # The categorical values sit one column too far left: move them
            # right, starting from the right-most column so nothing is
            # overwritten before it has been copied.
            categorical_row['city'] = categorical_row['soil']
            categorical_row['soil'] = categorical_row['crop']
            categorical_row['crop'] = 'WHEAT'
        elif pd.notna(second_longitude):
            numeric_row['longitude'] = second_longitude

        for col in categorical_cols:
            value = categorical_row[col]
            if pd.notna(value):
                # Only strings can be stripped; a numeric cell (for example a
                # month stored as a number) is copied as it is instead of
                # raising AttributeError.
                numeric_row[col] = value.strip() if isinstance(value, str) else value

        processed_rows.append(numeric_row)

    # An odd number of rows leaves one unpaired row at the end: keep it.
    if paired_end < len(df):
        processed_rows.append(df.iloc[-1])

    result_df = pd.DataFrame(processed_rows, columns=df.columns)

    # Convert longitude to numeric, replacing any remaining NaN with the mean.
    result_df['longitude'] = pd.to_numeric(result_df['longitude'], errors='coerce')
    result_df['longitude'] = result_df['longitude'].fillna(result_df['longitude'].mean())

    return result_df


def clean_categorical_values(df):
    """
    Standardize the text columns 'month', 'crop', 'soil' and 'city'.

    Each of these columns that holds text (object dtype or a pandas string
    dtype) is stripped of surrounding whitespace and converted to uppercase.
    Columns of any other dtype are left untouched.

    Args:
        df: DataFrame containing the four columns; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    categorical_cols = ['month', 'crop', 'soil', 'city']
    for col in categorical_cols:
        column = df[col]
        # Comparing the dtype with 'object' alone misses pandas string dtypes,
        # which would silently skip the clean-up for those columns.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[col] = column.str.strip().str.upper()
    return df

def handle_missing_values(df):
    """
    Fill missing values in place and return the same DataFrame.

    Every numeric column is filled with its median. The columns 'month',
    'crop', 'soil' and 'city' are filled with their most frequent value, or
    with "UNKNOWN" when the column has no mode.
    """
    # Numeric columns: median imputation.
    for name in df.select_dtypes(include=[np.number]).columns:
        df[name] = df[name].fillna(df[name].median())

    # Categorical columns: most frequent value, with a fixed fallback.
    for name in ('month', 'crop', 'soil', 'city'):
        modes = df[name].mode()
        fill_value = "UNKNOWN" if modes.empty else modes.iloc[0]
        df[name] = df[name].fillna(fill_value)

    return df

def create_features(df):
    """
    Add the derived columns 'temp_ratio', 'humidity_rad_ratio' and 'season'.

    The two ratios treat a zero denominator as missing, and missing ratios
    are filled with the median of the computed ratio. 'season' is looked up
    from the uppercase English month name in 'month'; months that are not in
    the lookup table produce a missing season. The DataFrame is modified in
    place and returned.
    """
    def median_filled_ratio(numerator, denominator):
        # A zero denominator becomes NaN so the division yields NaN, not inf.
        ratio = df[numerator] / df[denominator].replace(0, np.nan)
        return ratio.fillna(ratio.median())

    df['temp_ratio'] = median_filled_ratio('Max Temp', 'Min Temp')
    df['humidity_rad_ratio'] = median_filled_ratio('Humidity', 'Rad')

    months_by_season = {
        'WINTER': ('DECEMBER', 'JANUARY', 'FEBRUARY'),
        'SPRING': ('MARCH', 'APRIL', 'MAY'),
        'SUMMER': ('JUNE', 'JULY', 'AUGUST'),
        'AUTUMN': ('SEPTEMBER', 'OCTOBER', 'NOVEMBER'),
    }
    season_of_month = {
        month: season
        for season, months in months_by_season.items()
        for month in months
    }
    df['season'] = df['month'].map(season_of_month)

    return df

def normalize_and_encode(df):
    """
    Return a scaled and label-encoded copy of the dataset 1 frame.

    The listed numeric columns are rescaled with a freshly fitted
    MinMaxScaler, and each categorical column is replaced by the integer
    codes of its own LabelEncoder. The input DataFrame is not modified.

    Returns:
        Tuple (encoded DataFrame, dict mapping each categorical column name
        to its fitted LabelEncoder).
    """
    scaled_columns = [
        'water req', 'Min Temp', 'Max Temp', 'Humidity', 'Wind',
        'Sun', 'Rad', 'Rain', 'altitude', 'latitude', 'longitude',
        'temp_ratio', 'humidity_rad_ratio',
    ]
    label_columns = ['month', 'crop', 'soil', 'city', 'season']

    df_encoded = df.copy()

    # Scale all numeric features together with one scaler.
    df_encoded[scaled_columns] = MinMaxScaler().fit_transform(df_encoded[scaled_columns])

    # One encoder per categorical column, kept so the codes can be decoded later.
    encoders = {}
    for name in label_columns:
        encoder = LabelEncoder()
        df_encoded[name] = encoder.fit_transform(df_encoded[name])
        encoders[name] = encoder

    return df_encoded, encoders

def process_dataset1(input_file, output_dir):
    """
    Run the full dataset 1 pipeline and write both result files.

    Args:
        input_file: Path of the CSV file to read.
        output_dir: Directory receiving 'dataset_1_preprocessed.csv' and
            'dataset_1_normalized.csv'; it is created when missing.

    Returns:
        Tuple (preprocessed DataFrame, normalized DataFrame, encoders) where
        the last two items are what normalize_and_encode returns.
    """
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(input_file)

    # Cleaning and feature steps, applied in this exact order.
    pipeline_steps = (
        merge_rows_with_wheat_shift,
        clean_categorical_values,
        handle_missing_values,
        create_features,
    )
    for step in pipeline_steps:
        df = step(df)

    # Preprocessed, non-normalized version.
    preprocessed_path = os.path.join(output_dir, 'dataset_1_preprocessed.csv')
    df.to_csv(preprocessed_path, index=False)

    # Normalized and encoded version.
    df_normalized, encoders = normalize_and_encode(df)
    normalized_path = os.path.join(output_dir, 'dataset_1_normalized.csv')
    df_normalized.to_csv(normalized_path, index=False)

    return df, df_normalized, encoders

In [None]:
# Run pipeline 1
if __name__ == "__main__":
    # Forward slashes resolve on every platform, whereas a backslash-separated
    # relative path only resolves on Windows.
    input_file = "Datasets2/dataset1/data_plants.csv"

    output_dir = "Output"

    # process_dataset1 returns the preprocessed frame, the normalized frame and
    # the fitted encoders, in that order.
    df_raw, df_normalized, encoders = process_dataset1(input_file, output_dir)

Pipeline de traitement pour dataset 2

In [3]:
# Load dataset 2.
df2 = pd.read_csv("Datasets2/dataset2/tomates.csv")

# Print the number of missing values in each column.
print(df2.isna().sum())

simulation_id    0
time             0
water            0
hour             0
dtype: int64


In [10]:
# Define the processing pipeline 2
def convert_timestamp(df):
    """
    Derive calendar features from the Unix timestamps in 'time'.

    'time' is interpreted as seconds since the epoch. The columns 'date'
    (as a string), 'month', 'day', 'year' and 'day_of_week' are added to the
    given DataFrame, and a new DataFrame without the 'time' column and the
    intermediate 'datetime' column is returned.
    """
    df['datetime'] = pd.to_datetime(df['time'], unit='s')
    stamp = df['datetime'].dt

    df['date'] = stamp.date
    # Output column name -> attribute of the datetime accessor.
    for feature, attribute in (
        ('month', 'month'),
        ('day', 'day'),
        ('year', 'year'),
        ('day_of_week', 'dayofweek'),
    ):
        df[feature] = getattr(stamp, attribute)

    # Keep the date as text instead of date objects.
    df['date'] = df['date'].astype(str)

    return df.drop(columns=['time', 'datetime'])

def categorize_hour(df):
    """
    Add a 'day_period' column derived from 'hour'.

    Hours in [5, 12) are 'MORNING', [12, 17) 'AFTERNOON', [17, 21) 'EVENING',
    and hours from 21 on or below 5 are 'NIGHT'. Rows matching none of the
    rules get 'UNKNOWN'. The DataFrame is modified in place and returned.
    """
    hour = df['hour']

    # (label, mask) pairs; np.select uses the first matching mask for each row.
    period_rules = [
        ('MORNING', hour.ge(5) & hour.lt(12)),
        ('AFTERNOON', hour.ge(12) & hour.lt(17)),
        ('EVENING', hour.ge(17) & hour.lt(21)),
        ('NIGHT', hour.ge(21) | hour.lt(5)),
    ]
    labels = [label for label, _ in period_rules]
    masks = [mask for _, mask in period_rules]

    df['day_period'] = np.select(masks, labels, default='UNKNOWN')
    return df

def handle_missing_values(df):
    """
    Fill missing values in place and return the same DataFrame.

    Numeric columns are filled with their median. Object-dtype columns are
    filled with their most frequent value, or with "UNKNOWN" when the column
    has no mode.

    NOTE: a function with this name is also defined earlier in the file for
    dataset 1; whichever definition was executed last is the one in effect.
    """
    # Numeric columns: median imputation.
    for name in df.select_dtypes(include=[np.number]).columns:
        df[name] = df[name].fillna(df[name].median())

    # Object columns (if any were created): most frequent value.
    for name in df.select_dtypes(include=['object']).columns:
        modes = df[name].mode()
        fill_value = "UNKNOWN" if modes.empty else modes.iloc[0]
        df[name] = df[name].fillna(fill_value)

    return df

def normalize_and_encode(df):
    """
    Return a scaled and label-encoded copy of the dataset 2 frame.

    'water' and 'hour' are rescaled with a freshly fitted MinMaxScaler and
    'day_period' is replaced by the integer codes of a LabelEncoder.

    The work is done on a copy so the caller's DataFrame keeps its original
    values; modifying the argument in place made the "preprocessed" and the
    "normalized" frames of the caller one and the same object.

    NOTE: a function with this name is also defined earlier in the file for
    dataset 1; whichever definition was executed last is the one in effect.

    Args:
        df: DataFrame with the columns 'water', 'hour' and 'day_period'.

    Returns:
        Tuple (encoded DataFrame, fitted MinMaxScaler).
    """
    df_encoded = df.copy()

    # Normalize numeric columns.
    numeric_cols = ['water', 'hour']
    scaler = MinMaxScaler()
    df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

    # Encode categorical columns.
    categorical_cols = ['day_period']
    for col in categorical_cols:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

    return df_encoded, scaler

def process_dataset2(input_file, output_dir):
    """
    Run the dataset 2 pipeline and write both result files.

    The first line of the file is inspected: when it contains 'time' or
    'simulation' (case-insensitive) it is treated as a header and skipped.
    In both cases the columns are named 'simulation_id', 'time', 'water'
    and 'hour'.

    Args:
        input_file: Path of the CSV file to read.
        output_dir: Directory receiving 'dataset_2_preprocessed.csv' and
            'dataset_2_normalized.csv'; it is created when missing.

    Returns:
        Tuple (preprocessed DataFrame, normalized DataFrame, scaler) where
        the last two items are what normalize_and_encode returns.

    Raises:
        Exception: any error is printed and then re-raised unchanged.
    """
    column_names = ['simulation_id', 'time', 'water', 'hour']

    try:
        os.makedirs(output_dir, exist_ok=True)

        # Peek at the first line to find out whether the file has a header.
        with open(input_file, 'r') as f:
            first_line = f.readline().strip()

        lowered = first_line.lower()
        read_options = {'names': column_names}
        if 'time' in lowered or 'simulation' in lowered:
            read_options['skiprows'] = 1
        df = pd.read_csv(input_file, **read_options)

        # Processing steps, applied in this exact order.
        for step in (convert_timestamp, categorize_hour, handle_missing_values):
            df = step(df)

        # Preprocessed, non-normalized version.
        preprocessed_path = os.path.join(output_dir, 'dataset_2_preprocessed.csv')
        df.to_csv(preprocessed_path, index=False)

        # Normalized version.
        df_normalized, scaler = normalize_and_encode(df)
        normalized_path = os.path.join(output_dir, 'dataset_2_normalized.csv')
        df_normalized.to_csv(normalized_path, index=False)

        return df, df_normalized, scaler

    except Exception as e:
        print(f"Error processing dataset: {str(e)}")
        raise


In [11]:
# Run pipeline 2
if __name__ == "__main__":
    output_dir = "Output"
    input_file = r"Datasets2/dataset2/tomates.csv"

    # The call returns the preprocessed frame, the normalized frame and the scaler.
    df_raw, df_normalized, scaler = process_dataset2(
        input_file=input_file,
        output_dir=output_dir,
    )

Pipeline de traitement pour dataset 3

In [30]:
from typing import Tuple, Optional, List
import logging

In [34]:
# Configure the root logger: INFO level, timestamped messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Column names expected in the dataset 3 source, grouped by measurement.
_CROP_COLUMN = 'Crop'
REQUIRED_COLUMNS = {
    'growing_period': [_CROP_COLUMN, 'Total growing period (days)'],
    'water_need': [_CROP_COLUMN, 'Crop water need (mm/total growing period)'],
}

def split_range(range_str: str) -> Tuple[float, float]:
    """
    Split a range string formatted as "min-max" into min and max floats.

    Spaces are ignored, and an en dash or em dash is accepted as the
    separator in addition to the plain hyphen. A single non-negative number
    is returned as both bounds.

    Args:
        range_str: Value holding the range; it is converted with str() first.

    Returns:
        Tuple (min_value, max_value) as floats. (NaN, NaN) is returned when
        the text is not a number or a two-part range, when a part cannot be
        parsed, or when min is greater than max.
    """
    text = str(range_str).strip().replace(' ', '')
    # Ranges copied from formatted documents often use typographic dashes.
    for dash in ('\u2013', '\u2014'):
        text = text.replace(dash, '-')

    try:
        # Single value: digits with optional decimal points.
        if text.replace('.', '').isdigit():
            value = float(text)
            return value, value

        # Range value: exactly two parts around one hyphen.
        if '-' in text:
            parts = text.split('-')
            if len(parts) == 2:
                min_val = float(parts[0])
                max_val = float(parts[1])
                # Only an ordered range is accepted.
                if min_val <= max_val:
                    return min_val, max_val
    except ValueError as e:
        # float() rejects the text (for example '1.2.3' or an empty part).
        logging.warning(f"Error processing range '{text}': {str(e)}")

    return np.nan, np.nan

def process_ranges(df: pd.DataFrame, col: str, new_min_col: str, new_max_col: str) -> pd.DataFrame:
    """
    Split a column of range strings into separate min and max columns.

    Args:
        df: DataFrame to extend; it is modified in place.
        col: Name of the column holding the range strings.
        new_min_col: Name of the column receiving the lower bounds.
        new_max_col: Name of the column receiving the upper bounds.

    Returns:
        The same DataFrame. When `col` is absent an error is logged and the
        DataFrame is returned unchanged.
    """
    if col in df.columns:
        # Each element becomes a (min, max) tuple.
        bounds = df[col].apply(split_range)
        for position, target in enumerate((new_min_col, new_max_col)):
            df[target] = bounds.apply(lambda pair, k=position: pair[k])
    else:
        logging.error(f"Column '{col}' not found in DataFrame")

    return df

def expand_crop_names(crop_name: str) -> List[str]:
    """
    Split a compound crop name into individual, normalized crop names.

    Args:
        crop_name: Text holding one or more crop names separated by '/'.
            A missing value (None, NaN) is accepted.

    Returns:
        List of crop names, stripped and uppercased. Empty fragments (for
        example from a trailing or doubled '/') are dropped, and a missing
        value yields an empty list.
    """
    if not isinstance(crop_name, str):
        # A missing cell has no name to expand; calling .split on it would
        # raise AttributeError.
        if pd.isna(crop_name):
            return []
        crop_name = str(crop_name)

    fragments = (fragment.strip().upper() for fragment in crop_name.split('/'))
    return [fragment for fragment in fragments if fragment]

def _expand_compound_crops(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with one row per individual crop listed in 'Crop'."""
    expanded_rows = []
    for _, row in frame.iterrows():
        for crop in expand_crop_names(row['Crop']):
            new_row = row.copy()
            new_row['Crop'] = crop
            expanded_rows.append(new_row)
    # Passing the columns explicitly keeps the column labels even when no row
    # was produced; a bare pd.DataFrame([]) has no columns at all.
    return pd.DataFrame(expanded_rows, columns=frame.columns)


def _split_and_average(frame: pd.DataFrame, source_col: str) -> pd.DataFrame:
    """Replace a range column by '<col>_min', '<col>_max' and '<col>_avg'."""
    min_col = f'{source_col}_min'
    max_col = f'{source_col}_max'
    frame = process_ranges(frame, source_col, min_col, max_col)
    frame = frame.drop(source_col, axis=1)
    frame[f'{source_col}_avg'] = frame[[min_col, max_col]].mean(axis=1)
    return frame


def process_dataset3(input_file: str, output_dir: str) -> Optional[pd.DataFrame]:
    """
    Process the FAO Website_data.xls dataset with improved crop name handling.

    The file is read as tab-delimited text first; if that raises, it is read
    as an Excel file with the xlrd engine. Growing-period and water-need
    values are handled as two separate tables: rows without a value are
    dropped, compound crop names are expanded to one row per crop, each range
    is split into min / max / average columns, and the two tables are
    outer-merged on 'Crop'. The result is sorted by crop name and written to
    'dataset_3_processed.csv' in `output_dir`.

    Args:
        input_file: Path of the source file.
        output_dir: Directory receiving the output file; created when missing.

    Returns:
        The combined DataFrame, or None when any step fails (the error is
        logged, not raised).
    """
    try:
        # Validate input file
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file not found: {input_file}")

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Try reading as tab-delimited file first
        logging.info(f"Reading input file as tab-delimited: {input_file}")
        try:
            df = pd.read_csv(input_file, sep='\t')
        except Exception as e:
            logging.warning(f"Failed to read as tab-delimited, trying Excel format: {str(e)}")
            df = pd.read_excel(input_file, engine='xlrd')

        # Split the dataframe into two parts based on non-null values
        period_source = 'Total growing period (days)'
        water_source = 'Crop water need (mm/total growing period)'
        df_period = df[['Crop', period_source]].dropna(subset=[period_source])
        df_water = df[['Crop', water_source]].dropna(subset=[water_source])

        # Rename columns for consistency
        df_period = df_period.rename(columns={period_source: 'Total_growing_period'})
        df_water = df_water.rename(columns={water_source: 'Crop_water_need'})

        # Clean crop names
        df_period['Crop'] = df_period['Crop'].str.strip().str.upper()
        df_water['Crop'] = df_water['Crop'].str.strip().str.upper()

        # Expand compound crop names into one row per crop
        df_period = _expand_compound_crops(df_period)
        df_water = _expand_compound_crops(df_water)

        # Split the ranges and compute their averages
        df_period = _split_and_average(df_period, 'Total_growing_period')
        df_water = _split_and_average(df_water, 'Crop_water_need')

        # Merge the dataframes on Crop name, keeping crops found in either table
        df_combined = pd.merge(
            df_period,
            df_water,
            on='Crop',
            how='outer'
        )

        # Sort by crop name for better readability
        df_combined = df_combined.sort_values('Crop')

        # Save processed data
        output_file = os.path.join(output_dir, 'dataset_3_processed.csv')
        df_combined.to_csv(output_file, index=False)
        logging.info(f"Processed data saved to: {output_file}")

        return df_combined

    except Exception as e:
        logging.error(f"Error processing dataset: {str(e)}")
        return None


In [35]:

if __name__ == "__main__":
    output_dir = "Output"
    input_file = r"Datasets2/dataset3/Website_data.xls"

    try:
        df_processed = process_dataset3(input_file, output_dir)
        # process_dataset3 returns None when processing failed.
        if df_processed is None:
            logging.error("Dataset processing failed")
        else:
            logging.info("Dataset processing completed successfully")
            logging.info(f"Processed {len(df_processed)} records")
    except Exception as e:
        logging.error(f"Pipeline execution failed: {str(e)}")

2025-02-21 18:02:18,848 - INFO - Reading input file as tab-delimited: Datasets2/dataset3/Website_data.xls
2025-02-21 18:02:18,891 - INFO - Processed data saved to: Output\dataset_3_processed.csv
2025-02-21 18:02:18,893 - INFO - Dataset processing completed successfully
2025-02-21 18:02:18,895 - INFO - Processed 40 records


Pipeline de traitement pour dataset 4

In [38]:

# Numeric codes of the 'CropType' column and the crop name each one stands for.
CROP_TYPE_MAPPING = {1: 'Paddy', 2: 'Ground Nuts'}

def validate_data(df: pd.DataFrame) -> bool:
    """
    Validate the input dataframe structure and content.

    Checks that every required column is present and that 'CropType' and
    'Irrigation(Y/N)' hold numeric data. Any numeric dtype is accepted
    (for example int32 or float32), not only int64 / float64; boolean
    columns are rejected.

    Args:
        df: DataFrame to check.

    Returns:
        True when all checks pass, False otherwise. Every failure is logged
        and no exception is propagated.
    """
    required_columns = [
        'CropType', 'CropDays', 'Soil Moisture', 'Soil Temperature',
        'Temperature', 'Humidity', 'Irrigation(Y/N)'
    ]
    # Column that must be numeric -> message logged when it is not.
    numeric_checks = (
        ('CropType', "CropType column should contain numeric values"),
        ('Irrigation(Y/N)', "Irrigation column should contain numeric values"),
    )

    try:
        # Check for required columns, reporting every missing one.
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            for col in missing:
                logging.error(f"Missing required column: {col}")
            return False

        # Validate data types.
        for col, message in numeric_checks:
            dtype = df[col].dtype
            is_numeric = (
                pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype)
            )
            if not is_numeric:
                logging.error(message)
                return False

        return True

    except Exception as e:
        # Safety net: an unexpected input must yield False, not an exception.
        logging.error(f"Validation error: {str(e)}")
        return False

def process_dataset4(input_file: str, output_dir: str) -> Optional[pd.DataFrame]:
    """
    Process the irrigation dataset.

    Reads the first seven columns of the Excel file, strips whitespace from
    the column names, validates the structure, replaces the numeric
    'CropType' codes by their names from CROP_TYPE_MAPPING and writes the
    result to 'dataset_4_processed.csv' in `output_dir`.

    Args:
        input_file: Path of the Excel file to read.
        output_dir: Directory receiving the output file; created when missing.

    Returns:
        The processed DataFrame, or None when any step fails (the error is
        logged, not raised).
    """
    try:
        # Validate input file
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file not found: {input_file}")

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Read only the main data part (first 7 columns)
        logging.info(f"Reading input file: {input_file}")
        df = pd.read_excel(input_file, usecols=range(7))

        # Clean column names BEFORE validating: validation looks the columns
        # up by exact name, so a header with stray whitespace would otherwise
        # be reported as missing.
        df.columns = df.columns.str.strip()

        # Validate data structure
        if not validate_data(df):
            raise ValueError("Data validation failed")

        # Convert crop types to names
        raw_codes = df['CropType']
        df['CropType'] = raw_codes.map(CROP_TYPE_MAPPING)

        # Codes without an entry in the mapping become NaN; report them
        # instead of losing them silently.
        unmapped = raw_codes[df['CropType'].isna() & raw_codes.notna()]
        if not unmapped.empty:
            logging.warning(
                f"CropType codes without a name mapping: {sorted(unmapped.unique())}"
            )

        # Save processed data
        output_file = os.path.join(output_dir, 'dataset_4_processed.csv')
        df.to_csv(output_file, index=False)
        logging.info(f"Processed data saved to: {output_file}")

        return df

    except Exception as e:
        logging.error(f"Error processing dataset: {str(e)}")
        return None


In [39]:

if __name__ == "__main__":
    output_dir = "Output"
    input_file = r"Datasets2/dataset4/Project_datasheet_2019-2020.xlsx"

    try:
        df_processed = process_dataset4(input_file, output_dir)
        # process_dataset4 returns None when processing failed.
        if df_processed is None:
            logging.error("Dataset processing failed")
        else:
            logging.info("Dataset processing completed successfully")
            logging.info(f"Processed {len(df_processed)} records")
    except Exception as e:
        logging.error(f"Pipeline execution failed: {str(e)}")

2025-02-21 22:23:17,446 - INFO - Reading input file: Datasets2/dataset4/Project_datasheet_2019-2020.xlsx
2025-02-21 22:23:17,491 - INFO - Processed data saved to: Output\dataset_4_processed.csv
2025-02-21 22:23:17,491 - INFO - Dataset processing completed successfully
2025-02-21 22:23:17,497 - INFO - Processed 150 records
