## Coffee Production Prediction

Given *data about coffee*, let's try to predict the **average production** of coffee in a given country. 

We will use a random forest regression model to make our predictions. 

Data source: https://www.kaggle.com/datasets/yamaerenay/ico-coffee-dataset-worldwide

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor

In [2]:
df_paths = [
    "domestic-consumption.csv",
    "exports-calendar-year.csv",
    "exports-crop-year.csv",
    "gross-opening-stocks.csv",
    "total-production.csv"
]

In [3]:
dfs = [pd.read_csv(df_path) for df_path in df_paths]

### Preprocessing

In [12]:
def get_means(df):
    df = df.copy()
    countries = df[df.columns[0]]
    means = df.mean(axis=1, numeric_only=True)
    df = pd.concat([countries, means], axis=1)
    df.columns = ['country', countries.name]
    return df

In [13]:
get_means(dfs[0])

Unnamed: 0,country,domestic_consumption
0,Angola,25.689655
1,Bolivia (Plurinational State of),41.103448
2,Brazil,15234.310345
3,Burundi,1.891966
4,Ecuador,214.137931
5,Indonesia,2662.137931
6,Madagascar,325.405724
7,Malawi,1.310345
8,Papua New Guinea,2.004828
9,Paraguay,19.482759


In [16]:
def make_df(dfs):
    # Process all DataFrames into one DataFrame
    processed_dfs = []
    for df in dfs:
        processed_dfs.append(get_means(df))
    
    # Merge DataFrames
    df = processed_dfs[0]

    for i in range(1, len(processed_dfs)):
        df = df.merge(processed_dfs[i], on='country')

    return df

In [17]:
data = make_df(dfs)
data

Unnamed: 0,country,domestic_consumption,exports,exports_crop_year,gross_opening_stocks,total_production
0,Angola,25.689655,24.115531,23.187083,42.068966,45.256048
1,Bolivia (Plurinational State of),41.103448,78.384152,76.904748,7.237931,116.146128
2,Brazil,15234.310345,25706.195606,25919.128803,23213.206897,41067.783976
3,Burundi,1.891966,363.186423,356.056321,48.97969,350.172424
4,Ecuador,214.137931,1115.800914,1105.43411,108.592069,1076.504352
5,Indonesia,2662.137931,5878.047357,5879.061059,690.114655,8452.302438
6,Madagascar,325.405724,289.048949,280.822603,87.587,587.719424
7,Malawi,1.310345,47.247465,46.095669,4.896552,46.716359
8,Papua New Guinea,2.004828,1015.455512,1015.019583,55.757928,1010.334755
9,Paraguay,19.482759,25.757542,20.850776,36.37931,34.678362


In [15]:
get_means(dfs[0]).merge(get_means(dfs[1]), on='country')

Unnamed: 0,country,domestic_consumption,exports
0,Angola,25.689655,24.115531
1,Bolivia (Plurinational State of),41.103448,78.384152
2,Brazil,15234.310345,25706.195606
3,Burundi,1.891966,363.186423
4,Ecuador,214.137931,1115.800914
5,Indonesia,2662.137931,5878.047357
6,Madagascar,325.405724,289.048949
7,Malawi,1.310345,47.247465
8,Papua New Guinea,2.004828,1015.455512
9,Paraguay,19.482759,25.757542


In [18]:
df = data.copy()

In [19]:
# Drop country column
df = df.drop("country", axis=1)

In [20]:
# Split df into X and y
y = df["total_production"]
X = df.drop('total_production', axis=1)

In [21]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

In [22]:
X_train

Unnamed: 0,domestic_consumption,exports,exports_crop_year,gross_opening_stocks
52,154.928966,3080.789345,3132.876862,1049.68031
35,4.978759,1.305947,1.318552,0.0
26,224.850655,2640.989406,2593.019476,1027.788241
45,39.275862,6.434295,6.366579,0.275862
54,1016.050759,13048.102878,13701.263769,1483.241414
27,200.0,438.641994,393.192238,34.143379
34,48.12069,225.830006,226.816866,26.413828
13,4.802276,78.112883,76.360955,18.070241
22,79.787897,795.942577,729.091645,181.943069
47,5.62069,50.098006,46.814148,14.310345


In [23]:
y_train

52     3211.943759
35        5.872314
26     2658.525303
45       45.642441
54    15156.584548
27      589.295686
34      274.420314
13       75.645990
22      770.235855
47       48.227941
30     4880.789417
17      389.662476
51       14.417234
31        1.539900
23      104.515079
4      1076.504352
14        3.847172
29        0.580517
28     1598.384334
50      161.011238
40      300.162076
18      501.272379
55      107.300966
20       43.185197
25     2016.203052
6       587.719424
7        46.716359
53     1121.038724
1       116.146128
16      508.823210
0        45.256048
15      201.634176
5      8452.302438
11      309.893407
9        34.678362
8      1010.334755
12       35.874466
43        0.856041
37     4552.609690
Name: total_production, dtype: float64

In [25]:
# Scale X
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [26]:
X_train

Unnamed: 0,domestic_consumption,exports,exports_crop_year,gross_opening_stocks
52,-0.288283,0.892275,0.8733,1.747068
35,-0.52158,-0.437099,-0.426357,-0.515553
26,-0.179496,0.702419,0.649249,1.699879
45,-0.46822,-0.434885,-0.424262,-0.514958
54,1.05148,5.195041,5.259387,2.681623
27,-0.21816,-0.248307,-0.263722,-0.441956
34,-0.454459,-0.340175,-0.332771,-0.458617
13,-0.521855,-0.403943,-0.395213,-0.476602
22,-0.40519,-0.094064,-0.124317,-0.123369
47,-0.520582,-0.416036,-0.407476,-0.484707


### Training/Results

In [27]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
print("Model trained.")

Model trained.


In [28]:
y_pred = model.predict(X_test)
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("RMSE: {:.2f}".format(rmse))

RMSE: 7423.55


In [29]:
np.sum((y_test - y_test.mean())**2)

1579133979.5953665

In [30]:
np.sum((y_test - y_pred)**2)

936854270.5261929

In [33]:
r2 = 1 - (np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2))

In [34]:
print("R^2: {:.5f}".format(r2))

R^2: 0.40673


In [35]:
model.score(X_test, y_test)

0.40672907895614396