In [83]:
#!pip install scikit-learn

In [84]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np
import csv, sklearn, os

filename = "Wheat.csv"

def load_csv(csv_file):
    data_dir = os.path.join(os.getcwd(), 'data', csv_file)

    with open(data_dir, 'r') as file:
        df = pd.read_csv(file)
        df.rename(columns={'value': 'temperature'}, inplace=True)
        return df

df=load_csv(filename)

def normalize_column(data, column_name):
    """
    Normalizes the specified column in the DataFrame using Min-Max scaling.

    Parameters:
    - data: Pandas DataFrame containing the data.
    - column_name: The name of the column to be normalized.

    Returns:
    - A Pandas DataFrame with the specified column normalized.
    """
    min_val = data[column_name].min()
    max_val = data[column_name].max()
    data[column_name + '_normalized'] = (data[column_name] - min_val) / (max_val - min_val)
    return data

# Normalize the 'price', 'production', and 'value' (temperature) columns
norm_df= df.copy()
norm_df['priceXproduction'] = norm_df['price'] * norm_df['production']
norm_df['price2'] = norm_df['price'] ** 2
norm_df['production2'] = norm_df['production'] ** 2
norm_df['temperature2'] = norm_df['temperature'] ** 2
norm_df['priceXproduction2'] = norm_df['priceXproduction'] ** 2
norm_df['price3'] = norm_df['price'] ** 3
norm_df['production3'] = norm_df['production'] ** 3
norm_df['temperature3'] = norm_df['temperature'] ** 3
norm_df['priceXproduction3'] = norm_df['priceXproduction'] ** 3
norm_cols = norm_df.columns[1:]
for column in norm_cols:
    norm_df = normalize_column(norm_df, column)
    
norm_df = norm_df.filter(like='_normalized')
norm_df['weighted_score'] = norm_df.mean(axis=1)

norm_df.head()



Unnamed: 0,temperature_normalized,production_normalized,price_normalized,priceXproduction_normalized,price2_normalized,production2_normalized,temperature2_normalized,priceXproduction2_normalized,price3_normalized,production3_normalized,temperature3_normalized,priceXproduction3_normalized,weighted_score
0,0.0,0.616553,0.224839,0.634733,0.201437,0.445617,0.002235,0.464424,0.179461,0.309059,0.0,0.326993,0.283779
1,0.042947,1.0,0.162741,1.0,0.144445,1.0,0.0,1.0,0.127467,1.0,0.000106,1.0,0.539809
2,0.297433,0.182375,1.0,0.281432,1.0,0.074561,0.069785,0.132881,1.0,0.026859,0.018541,0.056574,0.345037
3,0.511454,0.081073,0.794433,0.13984,0.772505,0.027207,0.238462,0.051482,0.74969,0.007854,0.116544,0.016481,0.292252
4,0.747821,0.423607,0.956103,0.571659,0.950467,0.247069,0.541523,0.391788,0.944442,0.133999,0.398564,0.256058,0.546925


In [85]:
# Calculate the correlation matrix
corr_matrix = norm_df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# Drop highly correlated columns
norm_df_reduced = norm_df.drop(to_drop, axis=1)

norm_df_reduced

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Unnamed: 0,temperature_normalized,production_normalized,price_normalized,weighted_score
0,0.0,0.616553,0.224839,0.283779
1,0.042947,1.0,0.162741,0.539809
2,0.297433,0.182375,1.0,0.345037
3,0.511454,0.081073,0.794433,0.292252
4,0.747821,0.423607,0.956103,0.546925
5,0.937377,0.120499,0.978587,0.504145
6,1.0,0.398637,0.109208,0.396515
7,0.990388,0.0,0.0,0.245038
8,0.856253,0.063448,0.063169,0.21141
9,0.567164,0.05292,0.388651,0.190379


In [86]:
corr_matrix

Unnamed: 0,temperature_normalized,production_normalized,price_normalized,priceXproduction_normalized,price2_normalized,production2_normalized,temperature2_normalized,priceXproduction2_normalized,price3_normalized,production3_normalized,temperature3_normalized,priceXproduction3_normalized,weighted_score
temperature_normalized,1.0,0.530007,0.158106,0.506351,0.169249,0.536337,0.963445,0.5199,0.179416,0.50966,0.920488,0.501434,0.247581
production_normalized,0.530007,1.0,0.15937,0.988079,0.16974,0.972608,0.446333,0.978231,0.178869,0.90909,0.390238,0.926599,0.518812
price_normalized,0.158106,0.15937,1.0,0.016332,0.999605,0.22309,0.005095,0.117209,0.998486,0.24198,0.083132,0.170415,0.513239
priceXproduction_normalized,0.506351,0.988079,0.016332,1.0,0.02641,0.946473,0.445973,0.972606,0.035356,0.875405,0.402228,0.908662,0.600995
price2_normalized,0.169249,0.16974,0.999605,0.02641,1.0,0.232039,0.009053,0.125849,0.999637,0.249196,0.067956,0.177492,0.513975
production2_normalized,0.536337,0.972608,0.22309,0.946473,0.232039,1.0,0.446104,0.990094,0.239853,0.980587,0.389049,0.984068,0.491842
temperature2_normalized,0.963445,0.446333,0.005095,0.445973,0.009053,0.446104,1.0,0.44698,0.02232,0.421921,0.990016,0.425424,0.235233
priceXproduction2_normalized,0.5199,0.978231,0.117209,0.972606,0.125849,0.990094,0.44698,1.0,0.133432,0.9602,0.400281,0.98018,0.563007
price3_normalized,0.179416,0.178869,0.998486,0.035356,0.999637,0.239853,0.02232,0.133432,1.0,0.25543,0.053577,0.183629,0.514683
production3_normalized,0.50966,0.90909,0.24198,0.875405,0.249196,0.980587,0.421921,0.9602,0.25543,1.0,0.36818,0.992771,0.470219


In [87]:
# 1. Prepare the data
X = norm_df_reduced.drop(columns=['weighted_score'])  # Features (remove the target column and any non-feature columns)
y = norm_df_reduced['weighted_score']  # Target variable

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 3. Create the Random Forest model
rf_model = RandomForestRegressor(n_estimators=1000) # Use RandomForestRegressor if it's a regression problem

# 4. Train the model
rf_model.fit(X_train, y_train)

# 5. Evaluate the model
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2s = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse} \nRoot Mean Squared Error: {rmse} \nR^2 Score: {r2s}")

Mean Squared Error: 0.0069009895336964834 
Root Mean Squared Error: 0.08307219470855746 
R^2 Score: -2.873823672645195


In [88]:
X = norm_df_reduced.drop(columns=['weighted_score'])  # Features (remove the target column and any non-feature columns)
y = norm_df_reduced['weighted_score']  # Target variable

# Create a random forest regressor model
model = RandomForestRegressor(n_estimators=1000)

# Configure the cross-validation procedure
cv = KFold(n_splits=3, shuffle=True)


# Define multiple scoring metrics
scoring = {'MSE': 'neg_mean_squared_error', 'R2': 'r2'}

# Execute the cross-validation procedure using mean squared error
scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

# Convert scores to positive as cross_val_score returns negative values for MSE to optimize towards zero
mse_scores = -scores['test_MSE']

# Report performance
print(f'Mean Squared Error: {mse_scores.mean():.3f} (+/- {mse_scores.std():.3f})')
print(f'Root Mean Squared Error: {np.sqrt(mse_scores).mean():.3f} (+/- {np.sqrt(mse_scores).std():.3f})')
print(f"R^2 Score: {scores['test_R2'].mean():.3f} (std: {scores['test_R2'].std():.3f})")

Mean Squared Error: 0.017 (+/- 0.010)
Root Mean Squared Error: 0.119 (+/- 0.048)
R^2 Score: 0.110 (std: 0.382)


In [89]:
X = norm_df_reduced.drop(columns=['weighted_score'])  # Features (remove the target column and any non-feature columns)
y = norm_df_reduced['weighted_score']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a random forest regressor model
rf = RandomForestRegressor()

# Define a grid of parameters to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Define multiple scoring metrics
scoring = {'MSE': 'neg_mean_squared_error', 'R2': 'r2'}

# Set up the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring="r2")

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Print out the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Get the best estimator and evaluate it on the test set
best_rf = grid_search.best_estimator_
best_rf_score = best_rf.score(X_test, y_test)
print("Test set score of best estimator: ", best_rf_score)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters found:  {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Test set score of best estimator:  -0.11685802351168029


In [90]:
predictions = best_rf.predict(X_train)
predictions, X_train, y_train, X_test, y_test

(array([0.36411701, 0.36411701, 0.36411701, 0.21560919, 0.21560919,
        0.48252838, 0.36411701, 0.48252838, 0.21560919, 0.48252838]),
     temperature_normalized  production_normalized  price_normalized
 0                 0.000000               0.616553          0.224839
 2                 0.297433               0.182375          1.000000
 12                0.050866               0.616553          0.224839
 8                 0.856253               0.063448          0.063169
 9                 0.567164               0.052920          0.388651
 6                 1.000000               0.398637          0.109208
 1                 0.042947               1.000000          0.162741
 4                 0.747821               0.423607          0.956103
 7                 0.990388               0.000000          0.000000
 5                 0.937377               0.120499          0.978587,
 0     0.283779
 2     0.345037
 12    0.287843
 8     0.211410
 9     0.190379
 6     0.396515
 1    

In [91]:
predictions = best_rf.predict(X_train)
predictions, y_train

(array([0.36411701, 0.36411701, 0.36411701, 0.21560919, 0.21560919,
        0.48252838, 0.36411701, 0.48252838, 0.21560919, 0.48252838]),
 0     0.283779
 2     0.345037
 12    0.287843
 8     0.211410
 9     0.190379
 6     0.396515
 1     0.539809
 4     0.546925
 7     0.245038
 5     0.504145
 Name: weighted_score, dtype: float64)

In [92]:
df=load_csv("Soybeans.csv")
for column in df.columns[1:]:
    df = normalize_column(df, column)
    
df = df.filter(like='_normalized')
#df['weighted_score'] = df.mean(axis=1)

df["predictions"] = best_rf.predict(df)
df['actual_weighted_score'] = df.mean(axis=1)

df

Unnamed: 0,temperature_normalized,production_normalized,price_normalized,predictions,actual_weighted_score
0,0.049843,0.668476,0.0,0.364117,0.270609
1,0.091407,1.0,0.456452,0.364117,0.477994
2,0.307906,0.037181,0.832258,0.215609,0.348239
3,0.480675,0.106476,0.725806,0.364117,0.419269
4,0.730981,0.0,0.856452,0.215609,0.45076
5,0.936296,0.132398,1.0,0.482528,0.637806
6,1.0,0.372211,0.612903,0.482528,0.616911
7,0.955764,0.118434,0.667742,0.482528,0.556117
8,0.844343,0.200646,0.509677,0.482528,0.509299
9,0.57029,0.120742,0.182258,0.482528,0.338954


In [95]:
df_soy=load_csv("Soybeans.csv")
df_corn=load_csv("Corn.csv")
df_wheat=load_csv("Wheat.csv")

train_df = pd.concat([df_soy, df_corn, df_wheat], ignore_index=True)
train_df.drop(columns=['date'], inplace=True)


train_df['priceXproduction'] = train_df['price'] * train_df['production']
train_df['price2'] = train_df['price'] ** 2
train_df['production2'] = train_df['production'] ** 2
train_df['temperature2'] = train_df['temperature'] ** 2
train_df['priceXproduction2'] = train_df['priceXproduction'] ** 2
train_df['price3'] = train_df['price'] ** 3
train_df['production3'] = train_df['production'] ** 3
train_df['temperature3'] = train_df['temperature'] ** 3
train_df['priceXproduction3'] = train_df['priceXproduction'] ** 3


for col in train_df.columns:
    train_df = normalize_column(train_df, col)
train_df = train_df.filter(like='_normalized')
train_df['weighted_score'] = train_df.mean(axis=1)


corr_matrix = train_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
train_df_reduced = train_df.drop(to_drop, axis=1)

X = train_df_reduced.drop(columns=['weighted_score'])
y = train_df_reduced['weighted_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestRegressor()

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring="r2")

grid_search.fit(X_train, y_train)


print("Best parameters found: ", grid_search.best_params_)

best_rf = grid_search.best_estimator_
best_rf_score = best_rf.score(X_test, y_test)
print("Test set score of best estimator: ", best_rf_score)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters found:  {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test set score of best estimator:  0.8655375891425501


In [98]:
predictions = best_rf.predict(X_train)
predictions, y_train

(array([0.12182239, 0.31788964, 0.32679929, 0.25168879, 0.11781617,
        0.2359521 , 0.09169025, 0.3395841 , 0.41504361, 0.11370433,
        0.26977956, 0.26088673, 0.14559418, 0.36406606, 0.23798128,
        0.29673408, 0.19903345, 0.25517023, 0.22700774, 0.09226754,
        0.16153774, 0.16938051, 0.22509411, 0.43023681, 0.18618608,
        0.42209033, 0.33545159, 0.24126648, 0.28639754, 0.28498067,
        0.11801469]),
 38    0.120314
 32    0.324580
 5     0.334993
 8     0.236328
 26    0.116936
 25    0.220847
 12    0.080469
 23    0.391516
 19    0.448858
 24    0.108711
 21    0.291412
 34    0.243414
 9     0.101870
 18    0.366589
 13    0.222450
 33    0.290798
 22    0.198683
 16    0.225731
 4     0.218912
 0     0.082072
 36    0.151949
 10    0.163710
 35    0.213326
 14    0.552197
 15    0.157683
 31    0.470666
 6     0.344441
 1     0.205654
 17    0.309286
 28    0.300228
 27    0.118073
 Name: weighted_score, dtype: float64)

In [99]:
predictions = best_rf.predict(X_test)
predictions, y_test

(array([0.32662825, 0.16471733, 0.107223  , 0.28078912, 0.13040362,
        0.15266996, 0.3121327 , 0.32116715]),
 20    0.328768
 2     0.130728
 11    0.051780
 29    0.287345
 37    0.069125
 3     0.148400
 30    0.398314
 7     0.299953
 Name: weighted_score, dtype: float64)

In [103]:
df=load_csv("Wheat.csv")
for column in df.columns[1:]:
    df = normalize_column(df, column)
    
df = df.filter(like='_normalized')

df["predictions"] = best_rf.predict(df)
df['actual_weighted_score'] = df.mean(axis=1)

df

Unnamed: 0,temperature_normalized,production_normalized,price_normalized,predictions,actual_weighted_score
0,0.0,0.616553,0.224839,0.244837,0.271557
1,0.042947,1.0,0.162741,0.425609,0.407824
2,0.297433,0.182375,1.0,0.269432,0.43731
3,0.511454,0.081073,0.794433,0.262832,0.412448
4,0.747821,0.423607,0.956103,0.328542,0.614018
5,0.937377,0.120499,0.978587,0.428585,0.616262
6,1.0,0.398637,0.109208,0.325114,0.45824
7,0.990388,0.0,0.0,0.28695,0.319335
8,0.856253,0.063448,0.063169,0.241059,0.305982
9,0.567164,0.05292,0.388651,0.154374,0.290777


In [120]:
AvgWeighted = {}
for csv in ["Animal_Products.csv", "Corn.csv", "Fish.csv", "Fruit.csv", "Nuts.csv", "Soybeans.csv", "Vegetables.csv", "Wheat.csv"]:
    df=load_csv(csv)
    for column in df.columns[1:]:
        df = normalize_column(df, column)
    df = df.filter(like='_normalized')
    df['weighted_score'] = df.mean(axis=1)
    AvgWeighted[csv[:-4]] = df['weighted_score'].mean()
AvgWeighted

{'Animal_Products': 0.5892150655668456,
 'Corn': 0.42830679530894844,
 'Fish': 0.4565246793396595,
 'Fruit': 0.5408447482798284,
 'Nuts': 0.4809110728779785,
 'Soybeans': 0.4392538025775841,
 'Vegetables': 0.3721052483282629,
 'Wheat': 0.40048696565071135}