# Przygotowanie danych

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

## 1. Wczytanie surowych danych

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
file_path = '../data/raw/crop_yield.csv'
# Parse the CSV into a DataFrame with pandas' default settings.
df = pd.read_csv(filepath_or_buffer=file_path)

## 2. Rozdzielenie cech oraz celu

In [3]:
# Pull out the target column, then keep every other column as the feature matrix.
y = df["Yield_tons_per_hectare"]
X = df.drop(columns=["Yield_tons_per_hectare"])

### 2.1. Określenie kolumn:

In [4]:
# Columns routed to the numerical branch of the preprocessor.
numerical_features = [
    "Rainfall_mm",
    "Temperature_Celsius",
    "Days_to_Harvest",
]

# Columns routed to the categorical branch of the preprocessor.
categorical_features = [
    "Region",
    "Soil_Type",
    "Crop",
    "Weather_Condition",
]

# Columns routed to the boolean branch of the preprocessor.
boolean_features = [
    "Fertilizer_Used",
    "Irrigation_Used",
]

### 2.2. Określenie pipeline'u numerycznego:

In [5]:
# Numerical branch: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

### 2.3. Określenie pipeline'u kategorycznego:

In [6]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fitting are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

### 2.4. Określenie pipeline'u boolowskiego:

In [7]:
def _bool_to_float(x):
    """Encode boolean flags as floats (True -> 1.0, False -> 0.0), keeping NaN.

    Casting straight to ``int`` raises as soon as a value is missing, which
    means the imputation step that follows could never fill a gap. Floats can
    represent NaN, so missing entries pass through this step untouched and are
    filled by the imputer afterwards.

    Parameters
    ----------
    x : object exposing ``astype`` (e.g. a pandas DataFrame or NumPy array)
        The boolean columns selected for this branch.

    Returns
    -------
    Same container type as ``x``, with float values.
    """
    return x.astype(float)


# Boolean branch: encode flags as 0.0/1.0, then fill missing values with the
# most frequent one. A named function is used instead of a lambda so the
# pipeline object is not tied to an anonymous callable (lambdas cannot be
# pickled). The first step keeps its original name "to_int" so any existing
# reference to that step name still resolves.
boolean_transformer = Pipeline(steps=[
    ("to_int", FunctionTransformer(_bool_to_float)),
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

### 2.5. Scalenie wszystkiego w jeden preprocesor:

In [8]:
# Send each column group through its own sub-pipeline and combine the results
# into a single feature matrix.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
        ("bool", boolean_transformer, boolean_features),
    ]
)

## 3. Podział danych na zbiór treningowy oraz zbiór testowy

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 3.1. Zastosowanie preprocessingu:

In [10]:
# Fit the preprocessor on the training rows only and transform them...
X_train_processed = preprocessor.fit_transform(X=X_train)
# ...then apply the already-fitted preprocessor to the held-out rows.
X_test_processed = preprocessor.transform(X=X_test)

#### Zbiór treningowy prezentuje się następująco:

In [11]:
# Report the (rows, columns) shape of the transformed training matrix.
train_shape = X_train_processed.shape
print("Zbiór treningowy:", train_shape)

Zbiór treningowy: (800000, 24)


#### Natomiast ogólny kształt zbioru testowego wygląda tak:

In [12]:
# Report the (rows, columns) shape of the transformed test matrix.
test_shape = X_test_processed.shape
print("Zbiór testowy:", test_shape)

Zbiór testowy: (200000, 24)


### 3.2. Zapisanie zbioru treningowego oraz zbioru testowego do plików CSV:

In [14]:
def _with_target(features, target):
    """Return a DataFrame holding the processed features plus the target column.

    Parameters
    ----------
    features : array-like, or any object exposing ``toarray()``
        Output of the preprocessor. If it has a ``toarray`` method (as sparse
        matrices do), it is converted to a dense array first.
    target : object exposing ``.values`` (e.g. a pandas Series)
        Target values aligned row-by-row with ``features``.

    Returns
    -------
    pandas.DataFrame
        Feature columns with pandas' default integer labels, followed by a
        "Yield_tons_per_hectare" column.
    """
    dense = features.toarray() if hasattr(features, "toarray") else features
    frame = pd.DataFrame(dense)
    # .values drops the target's index so rows are matched by position,
    # not by the shuffled index left over from the train/test split.
    frame["Yield_tons_per_hectare"] = target.values
    return frame


train_df = _with_target(X_train_processed, y_train)
test_df = _with_target(X_test_processed, y_test)

train_path = Path("../data/processed/train.csv")
test_path = Path("../data/processed/test.csv")

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (this is a no-op when it is already there).
for out_path in (train_path, test_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)