In [87]:
#!pip install scikit-learn

In [88]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np
import csv, sklearn, os

# Path to the wheat CSV, resolved relative to the current working directory.
# (Despite the name, `data_dir` points at the file itself, not a directory.)
data_dir = os.path.join(os.getcwd(), 'data', 'wheat.csv')

# Parse the CSV from an explicitly opened handle and relabel the 'value'
# column as 'temperature' in the same expression.
with open(data_dir, 'r') as file:
    df = pd.read_csv(file).rename(columns={'value': 'temperature'})


def normalize_column(data, column_name):
    """
    Normalizes the specified column in the DataFrame using Min-Max scaling.

    The scaled values are written to a new column named
    ``<column_name>_normalized``; the source column itself is left as-is.
    The new column is added to ``data`` directly (the frame is modified in
    place) and that same object is returned.

    A column whose values are all identical has a zero range; dividing by it
    would turn every entry into NaN, so such a column is mapped to 0.0 instead.

    Parameters:
    - data: Pandas DataFrame containing the data.
    - column_name: The name of the column to be normalized.

    Returns:
    - The same Pandas DataFrame, with the ``<column_name>_normalized`` column added.

    Raises:
    - KeyError: if ``column_name`` is not a column of ``data``.
    """
    values = data[column_name]
    min_val = values.min()
    max_val = values.max()
    value_range = max_val - min_val
    # A zero range means a constant column: divide by 1 so the result is 0.0
    # rather than 0/0 = NaN. (A NaN range compares != 0 and is passed through.)
    scale = value_range if value_range != 0 else 1
    data[column_name + '_normalized'] = (values - min_val) / scale
    return data

# Min-max normalize the 'price', 'production', and 'temperature' columns
# ('temperature' is the CSV's 'value' column, renamed when the data was loaded).
norm_cols = ['price', 'production', 'temperature']
normalized_cols = [f"{column}_normalized" for column in norm_cols]

# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run.
norm_df = df.copy()
for column in norm_cols:
    norm_df = normalize_column(norm_df, column)

# Target: the unweighted mean of the three normalized columns.
norm_df["weighted_score"] = sum(norm_df[name] for name in normalized_cols) / len(normalized_cols)

# Keep only the derived columns, selected by name. The previous positional
# drop of the first four columns silently depended on the CSV's column count.
norm_df = norm_df[normalized_cols + ["weighted_score"]]

norm_df.head()



Unnamed: 0,price_normalized,production_normalized,temperature_normalized,weighted_score
0,0.224839,0.616553,0.0,0.280464
1,0.162741,1.0,0.042947,0.401896
2,1.0,0.182375,0.297433,0.493269
3,0.794433,0.081073,0.511454,0.46232
4,0.956103,0.423607,0.747821,0.709177


In [89]:
# Fit a random forest on a single 80/20 hold-out split of the normalized data
# and report MSE, RMSE and R^2 on the held-out 20%.

# Fixed seed so the split and the forest are reproducible under Restart & Run All.
random_state = 42

# NOTE(review): 'weighted_score' is computed earlier as the mean of the three
# normalized columns, which are exactly the features used here, so the target
# is a deterministic function of the inputs. Confirm this is the intended setup.

# 1. Prepare the data
X = norm_df.drop(columns=['weighted_score'])  # Features: every column except the target
y = norm_df['weighted_score']  # Target variable

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# 3. Create the Random Forest model
rf_model = RandomForestRegressor(n_estimators=1000, random_state=random_state)

# 4. Train the model
rf_model.fit(X_train, y_train)

# 5. Evaluate the model on the held-out split
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2s = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse} \nRoot Mean Squared Error: {rmse} \nR^2 Score: {r2s}")

Mean Squared Error: 0.005422908594556093 
Root Mean Squared Error: 0.07364040055944898 
R^2 Score: 0.33074802003361214


In [90]:
# Estimate generalization with 3-fold cross-validation over the full normalized dataset
X = norm_df.drop(columns=['weighted_score'])  # Features: every column except the target
y = norm_df['weighted_score']  # Target variable

# Create a random forest regressor model (seeded so reruns reproduce the same scores)
model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Configure the cross-validation procedure; with shuffle=True the fold
# assignment is random, so it needs its own seed to be repeatable
cv = KFold(n_splits=3, shuffle=True, random_state=42)


# Define multiple scoring metrics
scoring = {'MSE': 'neg_mean_squared_error', 'R2': 'r2'}

# Execute the cross-validation procedure, scoring every fold on both metrics
scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

# 'neg_mean_squared_error' is the negated MSE (scorers follow "higher is better"),
# so flip the sign to recover the plain per-fold MSE
mse_scores = -scores['test_MSE']

# Report performance as mean and standard deviation across folds
# (RMSE is computed per fold, then averaged)
print(f'Mean Squared Error: {mse_scores.mean():.3f} (+/- {mse_scores.std():.3f})')
print(f'Root Mean Squared Error: {np.sqrt(mse_scores).mean():.3f} (+/- {np.sqrt(mse_scores).std():.3f})')
print(f"R^2 Score: {scores['test_R2'].mean():.3f} (std: {scores['test_R2'].std():.3f})")

Mean Squared Error: 0.018 (+/- 0.015)
Root Mean Squared Error: 0.122 (+/- 0.056)
R^2 Score: -0.023 (std: 0.366)
