### Prerequisites for running notebook

In [1]:
# kora is imported by the environment-setup cell below (`from kora import drive as drives`).
!pip install kora -q

In [2]:
##Standard imports for project
import pandas as pd
import numpy as np
from datetime import datetime

#Functions for use
from sklearn.base import clone

#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

#Evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error

#Feature selection
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

#Hyperparameter optimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Required execution for notebook

The cell below asks whether the notebook is being executed in a local environment (e.g. Jupyter via Anaconda) or is being hosted (Google Colab with Google Drive), and sets certain variables accordingly. `cwd` is the reference to the project directory; the notebook is always assumed to be executed at the root (the highest level) of the project folder.

In [3]:
#Red pill or blue pill
# Ask where the notebook is running and set `is_drive` / `cwd` accordingly.
#   is_drive -- True only when Google Drive has been mounted (online run).
#   cwd      -- "<parent of the working directory>/Datasets/".

import os

is_drive = False
cwd = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/Datasets/"

while True:
  offon = input("Is this being run offline? (Y = offline (i.e. Jupyter notebook), N = online (i.e. Google Colab notebook)): ")
  answer = offon.strip().lower()

  # Keep prompting until a recognised answer is given.
  if answer not in ("y", "n"):
    print("Error! Please only type one of the following: Y, y, N, n")
    continue

  if answer == "n":
    # Colab-only packages are imported here, not at the top of the cell, so
    # that an offline run does not fail with
    # "ModuleNotFoundError: No module named 'google.colab'".
    from google.colab import drive
    from kora import drive as drives

    drive.mount('/content/drive')
    is_drive = True
    # Called for its effect on the working directory; the dataset path is
    # recomputed from os.getcwd() right after.
    drives.chdir_notebook()
    cwd = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/Datasets/"
  break

ModuleNotFoundError: No module named 'google.colab'

In [5]:
import os

# Datasets are expected in a "Datasets" folder beside the current working directory.
cwd = "{}/Datasets/".format(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [17]:
# Read the three source tables from the dataset directory, in the same order
# as before: milk dataset 1, milk dataset 2, Netherlands output.
df1, df2, dfn = (
    pd.read_csv(cwd + csv_name)
    for csv_name in (
        "milk dataset 1.csv",
        "milk dataset 2.csv",
        "netherlands dataset output.csv",
    )
)

df1

In [18]:
# Display df1 as loaded from "milk dataset 1.csv".
df1

Unnamed: 0.1,Unnamed: 0,Year,Month,Average price of raw milk from Ireland (Euro per 100kg),Butter (Thousand tonnes),Calf nuts and cubes (16-18% protein) (Euro per Tonne),Cheese (Thousand tonnes),Dairy meal (16-18% protein) (Euro per Tonne),Dairy nuts and cubes (16-18% protein) (Euro per Tonne),Domestic milk intake (Million litres),Fat content (Percent),Imported milk intake (Million litres),Maize meal (Euro per Tonne),Skimmed & semi-skimmed milk sales (Million litres),Skimmed milk powder (Thousand tonnes),Whole milk sales (Million litres)
0,0,2014,1,42.34,4.4,329.0,2.3,300.0,302.0,132.0,4.11,38.8,252.0,16.9,0.0,23.5
1,1,2014,2,41.76,6.2,329.0,6.5,303.0,303.0,214.0,4.11,37.6,248.0,15.4,0.8,21.3
2,2,2014,3,39.04,14.4,305.0,18.4,290.0,288.0,470.7,4.08,40.8,244.0,18.7,3.9,25.4
3,3,2014,4,38.55,17.3,313.0,22.8,276.0,289.0,697.0,3.87,43.6,233.0,15.0,6.5,21.9
4,4,2014,5,37.10,21.7,315.0,24.8,284.0,292.0,785.5,3.75,51.1,240.0,17.4,11.9,24.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,91,2021,8,39.23,28.8,355.0,29.3,316.0,324.0,917.4,4.19,0.0,293.0,15.8,16.4,29.0
92,92,2021,9,42.44,26.5,360.0,33.2,323.0,332.0,776.7,4.43,0.0,299.0,15.1,11.1,26.6
93,93,2021,10,46.52,21.6,365.0,27.5,328.0,339.0,652.8,4.77,0.0,309.0,15.8,5.5,26.1
94,94,2021,11,48.65,17.8,370.0,20.9,333.0,344.0,460.6,4.90,0.0,314.0,15.4,0.0,25.5


In [19]:
# Display df2 as loaded from "milk dataset 2.csv".
df2

Unnamed: 0.1,Unnamed: 0,Year,Month,Average price of raw milk from Ireland (Euro per 100kg),Butter (Thousand tonnes),Calf nuts and cubes (16-18% protein) (Euro per Tonne),Cheese (Thousand tonnes),Dairy meal (16-18% protein) (Euro per Tonne),Dairy nuts and cubes (16-18% protein) (Euro per Tonne),Domestic milk intake (Million litres),Fat content (Percent),Imported milk intake (Million litres),Maize meal (Euro per Tonne),Skimmed & semi-skimmed milk sales (Million litres),Skimmed milk powder (Thousand tonnes),Whole milk sales (Million litres)
0,0,2007,1,28.30,3.8,0.0,1.3,0.0,0.0,123.0,3.87,54.6,0.0,12.3,2.4,32.1
1,1,2007,2,27.18,5.0,0.0,2.4,0.0,0.0,185.4,3.87,38.0,0.0,12.6,1.9,29.3
2,2,2007,3,25.64,10.1,0.0,10.2,0.0,0.0,386.5,3.80,35.8,0.0,12.8,5.0,32.4
3,3,2007,4,27.29,14.8,0.0,17.6,0.0,0.0,581.0,3.59,38.7,0.0,12.1,10.1,30.4
4,4,2007,5,29.76,19.1,0.0,19.3,0.0,0.0,688.3,3.62,46.9,0.0,12.8,13.2,31.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,175,2021,8,39.23,28.8,355.0,29.3,316.0,324.0,917.4,4.19,0.0,293.0,15.8,16.4,29.0
176,176,2021,9,42.44,26.5,360.0,33.2,323.0,332.0,776.7,4.43,0.0,299.0,15.1,11.1,26.6
177,177,2021,10,46.52,21.6,365.0,27.5,328.0,339.0,652.8,4.77,0.0,309.0,15.8,5.5,26.1
178,178,2021,11,48.65,17.8,370.0,20.9,333.0,344.0,460.6,4.90,0.0,314.0,15.4,0.0,25.5


In [20]:
# Display dfn as loaded from "netherlands dataset output.csv".
dfn

Unnamed: 0.1,Unnamed: 0,Year,Month,Volume (Thousand tonnes),Fat content (Percent),Butter (Thousand tonnes),Cheese (Thousand tonnes),Skimmed-milk powder (Thousand tonnes),Concentrated milk (Thousand tonnes)
0,0,2011,1,996.942,4.56,12.130,65.490,6.402,28.106
1,1,2011,2,913.932,4.52,10.194,58.800,4.254,29.770
2,2,2011,3,1006.645,4.52,10.427,66.373,4.245,30.161
3,3,2011,4,1009.174,4.39,10.274,63.096,5.559,31.287
4,4,2011,5,1029.062,4.28,11.098,65.061,6.264,32.218
...,...,...,...,...,...,...,...,...,...
127,127,2021,8,1123.391,4.29,10.956,76.877,5.583,32.654
128,128,2021,9,1064.517,4.34,9.105,74.810,6.392,31.881
129,129,2021,10,1087.524,4.47,11.243,77.558,5.545,31.831
130,130,2021,11,1059.600,4.53,9.579,75.359,8.460,31.664


In [23]:
# Drop the leftover "Unnamed: ..." column(s) that read_csv produced for the
# header-less leading CSV column.
# Selecting by name (not by position) keeps this cell idempotent: running it
# again no longer strips a real data column such as "Year".
df1 = df1.loc[:, ~df1.columns.astype(str).str.startswith("Unnamed")]
df2 = df2.loc[:, ~df2.columns.astype(str).str.startswith("Unnamed")]
dfn = dfn.loc[:, ~dfn.columns.astype(str).str.startswith("Unnamed")]

In [24]:
# Display df1 after the leading column has been removed.
df1

Unnamed: 0,Year,Month,Average price of raw milk from Ireland (Euro per 100kg),Butter (Thousand tonnes),Calf nuts and cubes (16-18% protein) (Euro per Tonne),Cheese (Thousand tonnes),Dairy meal (16-18% protein) (Euro per Tonne),Dairy nuts and cubes (16-18% protein) (Euro per Tonne),Domestic milk intake (Million litres),Fat content (Percent),Imported milk intake (Million litres),Maize meal (Euro per Tonne),Skimmed & semi-skimmed milk sales (Million litres),Skimmed milk powder (Thousand tonnes),Whole milk sales (Million litres)
0,2014,1,42.34,4.4,329.0,2.3,300.0,302.0,132.0,4.11,38.8,252.0,16.9,0.0,23.5
1,2014,2,41.76,6.2,329.0,6.5,303.0,303.0,214.0,4.11,37.6,248.0,15.4,0.8,21.3
2,2014,3,39.04,14.4,305.0,18.4,290.0,288.0,470.7,4.08,40.8,244.0,18.7,3.9,25.4
3,2014,4,38.55,17.3,313.0,22.8,276.0,289.0,697.0,3.87,43.6,233.0,15.0,6.5,21.9
4,2014,5,37.10,21.7,315.0,24.8,284.0,292.0,785.5,3.75,51.1,240.0,17.4,11.9,24.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,2021,8,39.23,28.8,355.0,29.3,316.0,324.0,917.4,4.19,0.0,293.0,15.8,16.4,29.0
92,2021,9,42.44,26.5,360.0,33.2,323.0,332.0,776.7,4.43,0.0,299.0,15.1,11.1,26.6
93,2021,10,46.52,21.6,365.0,27.5,328.0,339.0,652.8,4.77,0.0,309.0,15.8,5.5,26.1
94,2021,11,48.65,17.8,370.0,20.9,333.0,344.0,460.6,4.90,0.0,314.0,15.4,0.0,25.5


In [25]:
# List df1's column names; these are the keys used in the renaming map below.
df1.columns

Index(['Year', 'Month',
       'Average price of raw milk from Ireland (Euro per 100kg)',
       'Butter (Thousand tonnes)',
       'Calf nuts and cubes (16-18% protein) (Euro per Tonne)',
       'Cheese (Thousand tonnes)',
       'Dairy meal (16-18% protein) (Euro per Tonne)',
       'Dairy nuts and cubes (16-18% protein) (Euro per Tonne)',
       'Domestic milk intake (Million litres)', 'Fat content (Percent)',
       'Imported milk intake (Million litres)', 'Maize meal (Euro per Tonne)',
       'Skimmed & semi-skimmed milk sales (Million litres)',
       'Skimmed milk powder (Thousand tonnes)',
       'Whole milk sales (Million litres)'],
      dtype='object')

In [40]:
# Maps the verbose source column headers to short names shared by all frames.
# Headers that are not listed here are left unchanged by the rename step.
translations = {
    # Price and processed products
    "Average price of raw milk from Ireland (Euro per 100kg)": "Raw milk price",
    "Butter (Thousand tonnes)": "Butter",
    "Cheese (Thousand tonnes)": "Cheese",
    # Feed prices
    "Calf nuts and cubes (16-18% protein) (Euro per Tonne)": "Calf nuts value",
    "Dairy meal (16-18% protein) (Euro per Tonne)": "Dairy meal value",
    "Dairy nuts and cubes (16-18% protein) (Euro per Tonne)": "Dairy nuts value",
    # Milk intake
    "Domestic milk intake (Million litres)": "Domestic milk intake",
    "Imported milk intake (Million litres)": "Imported milk intake",
    "Maize meal (Euro per Tonne)": "Maize meal value",
    # Sales and powder
    "Skimmed & semi-skimmed milk sales (Million litres)": "Skimmed milk sales",
    "Skimmed milk powder (Thousand tonnes)": "Skimmed milk powder",
    "Whole milk sales (Million litres)": "Whole milk sales",
    # Headers that appear in the Netherlands table
    "Skimmed-milk powder (Thousand tonnes)": "Skimmed milk powder",
    "Volume (Thousand tonnes)": "Milk production volume",
    # NOTE(review): concentrated milk is relabelled as "Whole milk sales" --
    # confirm this equivalence is intentional.
    "Concentrated milk (Thousand tonnes)": "Whole milk sales",
}

In [41]:
# Apply the short column names to df1; headers missing from the map stay as they are.
df1 = df1.rename(columns=translations)

In [42]:
# Display df1 with the shortened column names.
df1

Unnamed: 0,Year,Month,Raw milk price,Butter,Calf nuts value,Cheese,Dairy meal value,Dairy nuts value,Domestic milk intake,Fat content (Percent),Imported milk intake,Maize meal value,Skimmed milk sales,Skimmed milk powder,Whole milk sales
0,2014,1,42.34,4.4,329.0,2.3,300.0,302.0,132.0,4.11,38.8,252.0,16.9,0.0,23.5
1,2014,2,41.76,6.2,329.0,6.5,303.0,303.0,214.0,4.11,37.6,248.0,15.4,0.8,21.3
2,2014,3,39.04,14.4,305.0,18.4,290.0,288.0,470.7,4.08,40.8,244.0,18.7,3.9,25.4
3,2014,4,38.55,17.3,313.0,22.8,276.0,289.0,697.0,3.87,43.6,233.0,15.0,6.5,21.9
4,2014,5,37.10,21.7,315.0,24.8,284.0,292.0,785.5,3.75,51.1,240.0,17.4,11.9,24.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,2021,8,39.23,28.8,355.0,29.3,316.0,324.0,917.4,4.19,0.0,293.0,15.8,16.4,29.0
92,2021,9,42.44,26.5,360.0,33.2,323.0,332.0,776.7,4.43,0.0,299.0,15.1,11.1,26.6
93,2021,10,46.52,21.6,365.0,27.5,328.0,339.0,652.8,4.77,0.0,309.0,15.8,5.5,26.1
94,2021,11,48.65,17.8,370.0,20.9,333.0,344.0,460.6,4.90,0.0,314.0,15.4,0.0,25.5


In [43]:
# Apply the same short column names to the remaining two frames.
df2 = df2.rename(columns=translations)
dfn = dfn.rename(columns=translations)

In [44]:
# Display df2 with the shortened column names.
df2

Unnamed: 0,Year,Month,Raw milk price,Butter,Calf nuts value,Cheese,Dairy meal value,Dairy nuts value,Domestic milk intake,Fat content (Percent),Imported milk intake,Maize meal value,Skimmed milk sales,Skimmed milk powder,Whole milk sales
0,2007,1,28.30,3.8,0.0,1.3,0.0,0.0,123.0,3.87,54.6,0.0,12.3,2.4,32.1
1,2007,2,27.18,5.0,0.0,2.4,0.0,0.0,185.4,3.87,38.0,0.0,12.6,1.9,29.3
2,2007,3,25.64,10.1,0.0,10.2,0.0,0.0,386.5,3.80,35.8,0.0,12.8,5.0,32.4
3,2007,4,27.29,14.8,0.0,17.6,0.0,0.0,581.0,3.59,38.7,0.0,12.1,10.1,30.4
4,2007,5,29.76,19.1,0.0,19.3,0.0,0.0,688.3,3.62,46.9,0.0,12.8,13.2,31.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,2021,8,39.23,28.8,355.0,29.3,316.0,324.0,917.4,4.19,0.0,293.0,15.8,16.4,29.0
176,2021,9,42.44,26.5,360.0,33.2,323.0,332.0,776.7,4.43,0.0,299.0,15.1,11.1,26.6
177,2021,10,46.52,21.6,365.0,27.5,328.0,339.0,652.8,4.77,0.0,309.0,15.8,5.5,26.1
178,2021,11,48.65,17.8,370.0,20.9,333.0,344.0,460.6,4.90,0.0,314.0,15.4,0.0,25.5


In [45]:
# Display dfn with the shortened column names.
dfn

Unnamed: 0,Year,Month,Milk production volume,Fat content (Percent),Butter,Cheese,Skimmed milk powder,Whole milk sales
0,2011,1,996.942,4.56,12.130,65.490,6.402,28.106
1,2011,2,913.932,4.52,10.194,58.800,4.254,29.770
2,2011,3,1006.645,4.52,10.427,66.373,4.245,30.161
3,2011,4,1009.174,4.39,10.274,63.096,5.559,31.287
4,2011,5,1029.062,4.28,11.098,65.061,6.264,32.218
...,...,...,...,...,...,...,...,...
127,2021,8,1123.391,4.29,10.956,76.877,5.583,32.654
128,2021,9,1064.517,4.34,9.105,74.810,6.392,31.881
129,2021,10,1087.524,4.47,11.243,77.558,5.545,31.831
130,2021,11,1059.600,4.53,9.579,75.359,8.460,31.664


# Milk production volume

In [47]:
# Milk production volume = domestic intake + imported intake, row by row.
# Computed as a vectorised Series addition instead of a Python-level zip loop.
df1["Milk production volume"] = (
    df1["Domestic milk intake"] + df1["Imported milk intake"]
)

In [48]:
# Inspect the newly added column on df1.
df1["Milk production volume"]

0     170.8
1     251.6
2     511.5
3     740.6
4     836.6
      ...  
91    917.4
92    776.7
93    652.8
94    460.6
95    256.3
Name: Milk production value, Length: 96, dtype: float64

In [49]:
# Milk production volume = domestic intake + imported intake, row by row.
# Computed as a vectorised Series addition instead of a Python-level zip loop.
df2["Milk production volume"] = (
    df2["Domestic milk intake"] + df2["Imported milk intake"]
)

In [51]:
# NOTE(review): 0.971164 presumably converts thousand tonnes of milk into
# million litres (about 1 / 1.0297 kg per litre) so the column is comparable
# with the Irish frames -- confirm the source of this factor.
MILK_VOLUME_SCALE = 0.971164

# Caution: the column is rescaled in place, so running this cell a second
# time without reloading dfn applies the factor again.
dfn["Milk production volume"] = dfn["Milk production volume"] * MILK_VOLUME_SCALE

In [52]:
# Inspect the rescaled column on dfn.
dfn["Milk production volume"]

0       968.194180
1       887.577857
2       977.617385
3       980.073459
4       999.387968
          ...     
127    1090.996897
128    1033.820588
129    1056.164158
130    1029.045374
131    1089.158484
Name: Milk production volume, Length: 132, dtype: float64