# scaling the data 

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(file_path, target_column):
    """
    Loads data from a CSV, splits it into train/validation/test sets,
    and applies StandardScaler to the features.

    Args:
        file_path (str): The path to the CSV file.
        target_column (str): The name of the column to be used as the target/label.

    Returns:
        dict: A dictionary containing the processed data splits
              (X_train, y_train, X_val, y_val, X_test, y_test)
              and the fitted scaler object.
    """
    # 1. Load the dataset from the CSV file
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return None

    print("--- Data loaded successfully. ---")
    print(f"Original dataset shape: {df.shape}")

    # 2. Separate features (X) and target (y)
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # 3. First split: 70% for training, 30% for temp (validation + test)
    # The random_state ensures that the split is the same every time you run it.
    # 💡 FIX: Removed the 'stratify' parameter here.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # 4. Second split: Split the 30% temp set into 15% validation and 15% test
    # 💡 FIX: Removed the 'stratify' parameter here as well.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    print("\n--- Data split into training, validation, and test sets. ---")
    print(f"Training set:   {X_train.shape[0]} samples (70%)")
    print(f"Validation set: {X_val.shape[0]} samples (15%)")
    print(f"Test set:       {X_test.shape[0]} samples (15%)")

    # 5. Feature Scaling
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
    X_val_scaled = pd.DataFrame(X_val_scaled, columns=X.columns)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

    print("\n--- Feature scaling applied successfully. ---")

    # 6. Store all results in a single dictionary
    preprocessed_data = {
        'X_train': X_train_scaled,
        'y_train': y_train,
        'X_val': X_val_scaled,
        'y_val': y_val,
        'X_test': X_test_scaled,
        'y_test': y_test,
        'scaler': scaler
    }
    
    print("\nPreprocessing complete. Data is ready in the returned dictionary.")
    
    return preprocessed_data

# --- EXAMPLE USAGE ---

# 1. Specify your file path and the name of your target column
csv_file = "../../data analyzing/Concrete_Data_Yeh.csv"
target = 'csMPa'

# 2. Call the function to get the processed data
processed_results = preprocess_data(csv_file, target)

# 3. Now you can easily access all the datasets from the 'processed_results' dictionary
if processed_results:
    print("\n--- Accessing the processed data ---")
    print("\nScaled training features (first 5 rows):")
    print(processed_results['X_train'].head())
    
    print("\nTraining labels (first 5 rows):")
    print(processed_results['y_train'].head())

--- Data loaded successfully. ---
Original dataset shape: (1030, 9)

--- Data split into training, validation, and test sets. ---
Training set:   721 samples (70%)
Validation set: 154 samples (15%)
Test set:       155 samples (15%)

--- Feature scaling applied successfully. ---

Preprocessing complete. Data is ready in the returned dictionary.

--- Accessing the processed data ---

Scaled training features (first 5 rows):
     cement      slag    flyash     water  superplasticizer  coarseaggregate  \
0 -0.828484 -0.855296  0.761701 -0.766488          0.227273         0.415545   
1  0.374823 -0.855296 -0.816913  0.103748         -1.013995         1.136979   
2  0.317566  1.568935 -0.816913 -1.234713          1.352690        -1.551191   
3  0.688809 -0.638541  1.397859 -1.315115          0.789982        -0.405309   
4 -1.130465  1.312252  1.507813 -0.132729          2.130551        -1.730912   

   fineaggregate       age  
0       1.676803 -0.292980  
1       0.141904 -0.633845  
2     