In [11]:
import numpy as np
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [12]:
from datasets import load_dataset

ds = load_dataset("aaronjpi/fruits-vegetables-price")

In [None]:
# 1) Convert price columns from strings to numeric values (removing '$')
# 2) Convert averagespread from string percentages to decimal values
# 3) Check for missing values (none visible in this sample)
# 4) Split the data into training and testing

In [14]:
# Data cleaning/processing
df = pd.DataFrame(ds['train'])

null_count = df.isnull().sum()
print("Number of nulls: ", null_count)

# Filter for only strawberry data
strawberry_data = df[df['productname'] == 'Strawberries'].copy()

price_columns = ['farmprice', 'atlantaretail', 'chicagoretail', 'losangelesretail', 'newyorkretail']

# Remove the dollar sign from price values
for column in price_columns:
    strawberry_data[column] = strawberry_data[column].str.replace('$', '').str.strip()

# Convert averagespread from percentage to float
strawberry_data['averagespread'] = (strawberry_data['averagespread'].str.rstrip('%').str.replace(',', '').astype(float) / 100)

# Sort data by date (oldest to newest)
strawberry_data = strawberry_data.sort_values('date')

#Create extra features for the model
# This adds them right to the data
strawberry_data['month'] = strawberry_data['date'].dt.month
strawberry_data['day_of_week'] = strawberry_data['date'].dt.dayofweek
strawberry_data['is_holiday_season'] = ((strawberry_data['month'] >= 11) | (strawberry_data['month'] <= 1)).astype(int)

# Create lag features (previous weeks' prices)
# By including multiple lag periods (1, 2, and 4 weeks) 
# it gives the model with different time periods that help it see
# long term and short term trends
for lag in [1, 2, 4]:
    for column in price_columns:
        strawberry_data[f'{column}_lag_{lag}'] = strawberry_data[column].shift(lag)


Number of nulls:  productname         0
date                0
farmprice           0
atlantaretail       0
chicagoretail       0
losangelesretail    0
newyorkretail       0
averagespread       0
dtype: int64


In [15]:
# Preparing data for perceptron model

# Create binary target: 1 if price increased from previous week, 0 if not
strawberry_data['price_increased'] = (strawberry_data['farmprice'] > strawberry_data['farmprice'].shift(1)).astype(int)

# Prepare features and target
X_perceptron = X_perceptron.apply(pd.to_numeric, errors='coerce')
X_perceptron = X_perceptron.dropna()
y_perceptron = y_perceptron.iloc[X_perceptron.index]

X_perceptron = strawberry_data.drop(['productname', 'date', 'price_increased', 'averagespread'], axis=1)
y_perceptron = strawberry_data['price_increased'].iloc[len(strawberry_data) - len(X_perceptron):]

# Calculate the split point (use the last 20% of data as test set)
train_size = int(len(X_perceptron) * 0.8)

# Split the data chronologically
X_train_perceptron = X_perceptron.iloc[:train_size]
X_test_perceptron = X_perceptron.iloc[train_size:]
y_train_perceptron = y_perceptron.iloc[:train_size]
y_test_perceptron = y_perceptron.iloc[train_size:]

# Scale the features
scaler_perceptron = StandardScaler()
X_train_perceptron = scaler_perceptron.fit_transform(X_train_perceptron)
X_test_perceptron = scaler_perceptron.transform(X_test_perceptron)

ValueError: could not convert string to float: ''

In [None]:
# Preparing data for LSTM model

# Select features for LSTM
features_lstm = ['farmprice', 'atlantaretail', 'chicagoretail', 'losangelesretail', 'newyorkretail', 
                'month', 'day_of_week', 'is_holiday_season']

# Scale the data
scaler_lstm = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler_lstm.fit_transform(data_lstm)

# Create sequences (time windows)
# These are needed for LSTM so it can see change in price over time
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        # For prediction, use farm price
        y.append(data[i + seq_length, 0]) 
    return np.array(X), np.array(y)

seq_length = 4  # 4 weeks for the length of the time windows
X_lstm, y_lstm = create_sequences(scaled_data, seq_length)

# Split into train and test sets
train_size = int(len(X_lstm) * 0.8)
X_train_lstm = X_lstm[:train_size]
X_test_lstm = X_lstm[train_size:]
y_train_lstm = y_lstm[:train_size]
y_test_lstm = y_lstm[train_size:]