In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Relative data folders: generated files go to `output`, raw inputs are read from `original`
output = Path("../output/")
original = Path("../original/")

# Duplicate each row once per unique ManagementAreaId

In [3]:
# Load the measurements (with grouped targets) and list the distinct farm ids
measurements_csv = output / "MilkMeasurementsPlusGroupedTarget.csv"
MilkMeasurementsPlusTarget = pd.read_csv(measurements_csv)
ManagementAreaIds = MilkMeasurementsPlusTarget.loc[:, "ManagementAreaId"].unique()
ManagementAreaIds

array([ 750, 1524, 1894, 1867, 1892, 1893, 1898, 1901, 1903, 1902, 1905,
       1906, 1908, 1907, 1909, 1910, 1911, 1912, 1917], dtype=int64)

In [4]:
# Set the animal description of each farm aside (one row per distinct
# farm/breed/species combination), then keep only the sample columns:
# the animal columns and the existing Target are removed from the measurements.
animal_columns = ["Breed", "Species"]
MilkMeasurementsPlusTargetAnimals = MilkMeasurementsPlusTarget.loc[
    :, ["ManagementAreaId", *animal_columns]
].drop_duplicates()
MilkMeasurementsPlusTarget = MilkMeasurementsPlusTarget.drop(
    columns=[*animal_columns, "Target"]
)

In [5]:
# Duplicate every measurement once per farm with a single cross join.
# The previous row-by-row loop relied on the private `DataFrame._append`
# and re-copied the whole frame on every iteration (quadratic time).
farms = pd.DataFrame(
    {
        "ManagementAreaId": ManagementAreaIds,
        # Position of the farm in ManagementAreaIds, used as the Id suffix
        "_FarmPosition": range(len(ManagementAreaIds)),
    }
)

# The cross join keeps the sample order and, within each sample, the order of
# ManagementAreaIds; the sample's own farm id is replaced by each farm id in turn.
MilkMeasurementsDuplicatedSamples = MilkMeasurementsPlusTarget.drop(
    columns="ManagementAreaId"
).merge(farms, how="cross")

# New Id = "<original Id>-<farm position>", e.g. "1485-0"
MilkMeasurementsDuplicatedSamples["Id"] = (
    MilkMeasurementsDuplicatedSamples["Id"].astype(str)
    + "-"
    + MilkMeasurementsDuplicatedSamples["_FarmPosition"].astype(str)
)

# Restore the original column order (this also discards the helper column)
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples[
    list(MilkMeasurementsPlusTarget.columns)
]

# Display the result
MilkMeasurementsDuplicatedSamples

Unnamed: 0,Id,ManagementAreaId,SampleDate,Fat,Protein,ES,EQ,Bacteria,SomaticCellCount,Urea,FreezingPoint
0,1485-0,750,2022-05-14,4.07,3.68,7.75,7.75,0,1217,431.0,0.568
1,1485-1,1524,2022-05-14,4.07,3.68,7.75,7.75,0,1217,431.0,0.568
2,1485-2,1894,2022-05-14,4.07,3.68,7.75,7.75,0,1217,431.0,0.568
3,1485-3,1867,2022-05-14,4.07,3.68,7.75,7.75,0,1217,431.0,0.568
4,1485-4,1892,2022-05-14,4.07,3.68,7.75,7.75,0,1217,431.0,0.568
...,...,...,...,...,...,...,...,...,...,...,...
52454,4727-14,1909,2023-11-08,5.10,4.19,14.57,9.29,343,1667,591.0,0.560
52455,4727-15,1910,2023-11-08,5.10,4.19,14.57,9.29,343,1667,591.0,0.560
52456,4727-16,1911,2023-11-08,5.10,4.19,14.57,9.29,343,1667,591.0,0.560
52457,4727-17,1912,2023-11-08,5.10,4.19,14.57,9.29,343,1667,591.0,0.560


In [6]:
# Persist the duplicated samples (the row index is not written)
duplicated_samples_csv = output / "MilkMeasurementsDuplicatedSamples.csv"
MilkMeasurementsDuplicatedSamples.to_csv(duplicated_samples_csv, index=False)

# Melt milk measurements and merge with measurement types

In [7]:
# Measurement types: the sheet holds the milk characteristics labels in Spanish.
# Built as one chain:
#   1. add English names matching the MilkMeasurements column names (assigned
#      positionally, so the list must follow the row order of the sheet);
#   2. drop quantity and number of animals because their values are unavailable,
#      casein and lactose because they're unused in the recommendation algorithm;
#   3. rename Id to MeasurementTypeId to match other tables' columns;
#   4. drop the Spanish measurement names.
MeasurementType = (
    pd.read_excel(original / "MeasurementType.xlsx")
    .assign(
        Measurement=[
            "Quantity",
            "EQ",
            "SomaticCellCount",
            "Fat",
            "Protein",
            "Bacteria",
            "Urea",
            "ES",
            "Casein",
            "Lactose",
            "FreezingPoint",
            "NumberOfAnimals",
        ]
    )
    .loc[
        lambda types: ~types["Measurement"].isin(
            ["Quantity", "NumberOfAnimals", "Casein", "Lactose"]
        )
    ]
    .rename(columns={"Id": "MeasurementTypeId"})
    .drop(columns="Name")
)

# Overview
MeasurementType

Unnamed: 0,MeasurementTypeId,Measurement
1,2,EQ
2,3,SomaticCellCount
3,4,Fat
4,5,Protein
5,6,Bacteria
6,7,Urea
7,8,ES
10,11,FreezingPoint


In [8]:
# Reshape to long format: one row per (sample, measurement) pair, then attach
# the MeasurementTypeId of each measurement name.
measurement_types = MeasurementType["Measurement"].tolist()
id_vars = ["Id", "ManagementAreaId"]
MeltedMilkMeasurementsDuplicatedSamples = pd.melt(
    MilkMeasurementsDuplicatedSamples,
    id_vars=id_vars,
    value_vars=measurement_types,
    var_name="Measurement",
    value_name="Value",
).merge(MeasurementType, on="Measurement")

# Overview
MeltedMilkMeasurementsDuplicatedSamples.head(10)

Unnamed: 0,Id,ManagementAreaId,Measurement,Value,MeasurementTypeId
0,1485-0,750,EQ,7.75,2
1,1485-1,1524,EQ,7.75,2
2,1485-2,1894,EQ,7.75,2
3,1485-3,1867,EQ,7.75,2
4,1485-4,1892,EQ,7.75,2
5,1485-5,1893,EQ,7.75,2
6,1485-6,1898,EQ,7.75,2
7,1485-7,1901,EQ,7.75,2
8,1485-8,1903,EQ,7.75,2
9,1485-9,1902,EQ,7.75,2


# Measurement ranges

In [9]:
# Per-farm reference ranges, used to decide whether a collected sample needs
# an intervention (botanic pill) or not
measurement_range_file = original / "MeasurementRange.xlsx"
MeasurementRange = pd.read_excel(measurement_range_file)
MeasurementRange.head()

Unnamed: 0,Id,ManagementAreaId,TargetValue,DangerZoneLow,LowValue,HighValue,DangerZoneHigh,Rank,MeasurementTypeId
0,1029,750,1.5,0.52,1.2,1.8,2.48,10,1
1,1031,1917,200.0,40.0,100.0,250.0,600.0,10,3
2,949,1898,850.0,170.0,425.0,1062.5,2550.0,8,3
3,950,1898,4.9,2.94,4.41,5.39,6.86,7,4
4,1032,1917,250.0,200.0,225.0,275.0,300.0,9,7


In [10]:
# Rescale the FreezingPoint boundaries (MeasurementTypeId 11) by 1/1000.
# NOTE: the frame is updated in place, so re-running this cell alone divides again.
columns_to_divide = [
    "TargetValue",
    "DangerZoneLow",
    "LowValue",
    "HighValue",
    "DangerZoneHigh",
]
is_freezing_point = MeasurementRange["MeasurementTypeId"] == 11
MeasurementRange.loc[is_freezing_point, columns_to_divide] = MeasurementRange.loc[
    is_freezing_point, columns_to_divide
].div(1000)

# Merge measurement ranges with (melted) milk measurements and assign the corresponding trigger value

In [11]:
# Sanity check: the melted table should still hold one group per duplicated sample
print(f"MilkMeasurements: {len(MilkMeasurementsDuplicatedSamples)} rows")
print(
    f"MeltedMilkMeasurements (initial state): {MeltedMilkMeasurementsDuplicatedSamples['Id'].nunique()} unique ids"
)

# Attach the farm-specific range of every measurement. This is an inner join:
# (farm, measurement type) pairs without a row in MeasurementRange are dropped.
MeltedMilkMeasurementsDuplicatedSamples = pd.merge(
    MeltedMilkMeasurementsDuplicatedSamples,
    MeasurementRange.drop(columns="Id"),
    on=["ManagementAreaId", "MeasurementTypeId"],
)
print(
    f"MeltedMilkMeasurements (after merging): {MeltedMilkMeasurementsDuplicatedSamples['Id'].nunique()} unique ids"
)

# Parameters with value = 0 are not taken into account in the recommendation system.
# `.copy()` makes the filtered frame independent, so that adding columns to it
# later does not raise SettingWithCopyWarning.
MeltedMilkMeasurementsDuplicatedSamples = MeltedMilkMeasurementsDuplicatedSamples[
    MeltedMilkMeasurementsDuplicatedSamples["Value"] != 0
].copy()
print(
    f"MeltedMilkMeasurements (after filtering): {MeltedMilkMeasurementsDuplicatedSamples['Id'].nunique()} unique ids"
)

MilkMeasurements: 52459 rows
MeltedMilkMeasurements (initial state): 52459 unique ids
MeltedMilkMeasurements (after merging): 52459 unique ids
MeltedMilkMeasurements (after filtering): 52459 unique ids


In [12]:
# Trigger values and what they imply:
# - Adequate: nothing to do, everything is okay
# - Low / High: recommend a botanic pill (BP)
# - Danger Zone Low / High: notification -> a human needs to look more carefully
def assign_TriggerValue(Value, DangerZoneLow, LowValue, HighValue, DangerZoneHigh):
    """Classify a measured value against its range boundaries.

    Parameters
    ----------
    Value : measured value to classify.
    DangerZoneLow, LowValue, HighValue, DangerZoneHigh : boundaries of the range.

    Returns
    -------
    "Danger Zone Low" if Value < DangerZoneLow,
    "Low" if DangerZoneLow <= Value < LowValue,
    "Adequate" if LowValue <= Value <= HighValue,
    "High" if HighValue < Value < DangerZoneHigh,
    "Danger Zone High" if Value >= DangerZoneHigh,
    and None when no comparison holds (e.g. Value is NaN).
    """
    # The checks are ordered from the lowest band to the highest one;
    # the first one that holds decides the label.
    if Value < DangerZoneLow:
        return "Danger Zone Low"
    if Value < LowValue:
        return "Low"
    if Value <= HighValue:
        return "Adequate"
    if Value < DangerZoneHigh:
        return "High"
    if Value >= DangerZoneHigh:
        return "Danger Zone High"
    return None


# Classify every (sample, measurement) row against its own boundaries.
# The column order below matches the positional parameters of assign_TriggerValue.
trigger_inputs = ["Value", "DangerZoneLow", "LowValue", "HighValue", "DangerZoneHigh"]
MeltedMilkMeasurementsDuplicatedSamples["TriggerValue"] = (
    MeltedMilkMeasurementsDuplicatedSamples[trigger_inputs].apply(
        lambda bounds: assign_TriggerValue(*bounds), axis=1
    )
)

In [13]:
# Keep only the columns needed downstream, ordered by sample then measurement type
useful_columns = [
    "Id",
    "ManagementAreaId",
    "MeasurementTypeId",
    "Measurement",
    "Value",
    "Rank",
    "TriggerValue",
]
MeltedMilkMeasurementsDuplicatedSamples = MeltedMilkMeasurementsDuplicatedSamples.loc[
    :, useful_columns
].sort_values(by=["Id", "MeasurementTypeId"], ignore_index=True)
MeltedMilkMeasurementsDuplicatedSamples.head()

Unnamed: 0,Id,ManagementAreaId,MeasurementTypeId,Measurement,Value,Rank,TriggerValue
0,1401-0,750,3,SomaticCellCount,1273.0,9,High
1,1401-0,750,4,Fat,4.47,7,Low
2,1401-0,750,8,ES,8.28,8,Low
3,1401-1,1524,2,EQ,8.28,9,Low
4,1401-1,1524,3,SomaticCellCount,1273.0,10,Adequate


# Export

In [14]:
# Write the melted measurements with their trigger values (row index omitted)
melted_samples_csv = output / "MeltedMilkMeasurementsDuplicatedSamples.csv"
MeltedMilkMeasurementsDuplicatedSamples.to_csv(melted_samples_csv, index=False)

# Data labelling

In [15]:
# Based on the set of trigger values related to a milk measurement, we are going to assign different labels to our data
MilkMeasurementsDuplicatedSamples["Target"] = None

# Data to assign labels to: only the samples that still have at least one
# measurement in the melted table. `.copy()` makes the filtered frame
# independent, so the `.loc[...] = ...` label assignments in the next cells
# do not raise SettingWithCopyWarning.
MeasurementIds = MeltedMilkMeasurementsDuplicatedSamples["Id"].unique()
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples[
    MilkMeasurementsDuplicatedSamples["Id"].isin(MeasurementIds)
].copy()

# Set of trigger values per id
TriggerValuesSet = MeltedMilkMeasurementsDuplicatedSamples.groupby("Id").agg(
    {"TriggerValue": set}
)
print(TriggerValuesSet.sample(5))

                              TriggerValue
Id                                        
4274-16       {Adequate, Danger Zone High}
2401-10  {High, Adequate, Danger Zone Low}
2209-0                    {High, Adequate}
2657-17              {Low, High, Adequate}
4721-16           {High, Danger Zone High}


## 1 No treatment if all params are OK

In [16]:
# Samples whose only trigger value is "Adequate" get the "No treatment" label
is_all_adequate = TriggerValuesSet["TriggerValue"] == {"Adequate"}
NoTreatmentIds = TriggerValuesSet[is_all_adequate].index
needs_no_treatment = MilkMeasurementsDuplicatedSamples["Id"].isin(NoTreatmentIds)
MilkMeasurementsDuplicatedSamples.loc[needs_no_treatment, "Target"] = "No treatment"

## 2 Notif if at least one param is in danger zone

In [17]:
# A sample triggers a notification as soon as one of its trigger values is a
# danger zone ("Danger Zone Low" or "Danger Zone High"); `any` stops at the
# first match instead of counting all of them.
in_danger_zone = TriggerValuesSet["TriggerValue"].apply(
    lambda TriggerValues: any(
        "Danger" in TriggerValue for TriggerValue in TriggerValues
    )
)
NotifIds = TriggerValuesSet[in_danger_zone].index
MilkMeasurementsDuplicatedSamples.loc[
    MilkMeasurementsDuplicatedSamples["Id"].isin(NotifIds), "Target"
] = "Notif"

## 3 Find the right BP if no param in danger zone and at least one inadequate param

### Supplement matrix value: Botanic pill (BP) recommendations

In [18]:
# Botanic pill (BP) recommendations table
supplement_matrix_file = original / "SupplementMatrixValue.xlsx"
SupplementMatrixValue = pd.read_excel(supplement_matrix_file)

In [19]:
# Samples that are neither fully adequate nor in a danger zone
InadequateIds = set(MeasurementIds) - set(NoTreatmentIds) - set(NotifIds)

# Candidate botanic pills for every measurement of those samples.
# The measurement's own Value and Rank are dropped before the join, so the
# "Rank" summed below presumably comes from SupplementMatrixValue.
inadequate_measurements = MeltedMilkMeasurementsDuplicatedSamples.loc[
    MeltedMilkMeasurementsDuplicatedSamples["Id"].isin(InadequateIds)
].drop(columns=["Value", "Rank"])
BotanicPillsList = inadequate_measurements.merge(
    SupplementMatrixValue.drop(columns="Id"),
    on=["MeasurementTypeId", "TriggerValue"],
).sort_values(["Id", "MeasurementTypeId", "SupplementId"], ignore_index=True)

# Total rank of each candidate pill within a sample
BotanicPillsSummedList = (
    BotanicPillsList.groupby(["Id", "SupplementId"])["Rank"]
    .sum()
    .rename("RankSum")
    .reset_index()
)

# Highest total rank per sample (several pills of one sample may reach it)
BotanicPillsRank = BotanicPillsSummedList.groupby("Id", as_index=False)["RankSum"].max()

In [20]:
# Keep, for each sample, the pill(s) whose rank sum equals the sample's maximum;
# the pill id becomes the candidate Target
BotanicPills = pd.merge(
    BotanicPillsSummedList, BotanicPillsRank, on=["Id", "RankSum"]
).rename(columns={"SupplementId": "Target"})
BotanicPills.head()

Unnamed: 0,Id,Target,RankSum
0,1401-0,1,18
1,1401-1,1,28
2,1401-11,38,17
3,1401-11,42,17
4,1401-12,38,22


In [21]:
# Number of top-ranked pills per sample (1 means the recommendation is unique)
BotanicPillsCount = BotanicPills.groupby(by="Id").size()

In [22]:
# Samples with exactly one top-ranked pill: that pill is the label
UniqueBPIds = BotanicPillsCount.loc[BotanicPillsCount.eq(1)].index
unique_bp = BotanicPills.loc[BotanicPills["Id"].isin(UniqueBPIds), ["Id", "Target"]]

# Left-join the new labels, prefer them over the existing Target where present,
# then remove the two temporary suffixed columns
merged = MilkMeasurementsDuplicatedSamples.merge(
    unique_bp,
    on="Id",
    how="left",
    suffixes=("_original", "_updated"),
)
MilkMeasurementsDuplicatedSamples = merged.assign(
    Target=merged["Target_updated"].fillna(merged["Target_original"])
).drop(columns=["Target_original", "Target_updated"])

In [23]:
# Otherwise, several pills share the maximum rank sum for the sample
ManyBPIds = BotanicPillsCount.loc[BotanicPillsCount.ge(2)].index

In [24]:
# For the tied samples, count how many params each top-ranked pill contributes to:
# rows of BotanicPillsList are restricted to the tied pills (join with
# BotanicPills on Id/Target) and counted per (sample, pill).
tied_candidates = BotanicPillsList.loc[
    BotanicPillsList["Id"].isin(ManyBPIds)
].rename(columns={"SupplementId": "Target"})
NumberOfParams = (
    tied_candidates.merge(BotanicPills, on=["Id", "Target"])
    .groupby(["Id", "Target"])
    .size()
    .reset_index(name="NumberOfParams")
)

# The chosen BP is the one that covers the most params; several pills of a
# sample may reach that maximum, in which case all of them are kept here
top_param_counts = NumberOfParams.groupby("Id", as_index=False)["NumberOfParams"].max()
MaxNumberOfParams = NumberOfParams.merge(top_param_counts, on=["Id", "NumberOfParams"])

# Number of distinct BPs reaching the maximum, per sample
DistinctMaxNumberOfParams = MaxNumberOfParams.groupby("Id").size()

In [25]:
# If MaxNumberOfParams contains only one record for an id, then there's a BP that
# covers strictly more params than the other tied ones; this is the one chosen
UniqueMaxIds = DistinctMaxNumberOfParams[
    DistinctMaxNumberOfParams == 1
].index.sort_values()

# Merge data, update subset values, and drop unnecessary columns.
# The winning pill is read from MaxNumberOfParams (exactly one row per id here).
# BotanicPills must not be used for this: it still lists every rank-tied pill
# of these ids, so joining it would duplicate the sample rows and would not
# select the chosen pill.
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples.merge(
    MaxNumberOfParams.loc[
        MaxNumberOfParams["Id"].isin(UniqueMaxIds), ["Id", "Target"]
    ],
    on="Id",
    how="left",
    suffixes=("_original", "_updated"),
)
MilkMeasurementsDuplicatedSamples["Target"] = MilkMeasurementsDuplicatedSamples[
    "Target_updated"
].fillna(MilkMeasurementsDuplicatedSamples["Target_original"])
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples.drop(
    columns=["Target_original", "Target_updated"]
)

In [26]:
# If several BPs reach the maximum number of params, the label is the list of all of them
ManyMaxIds = DistinctMaxNumberOfParams[
    DistinctMaxNumberOfParams > 1
].index.sort_values()

# Merge data, update subset values, and drop unnecessary columns.
# The list is built from MaxNumberOfParams so that it only contains the pills
# covering the maximal number of params. BotanicPills would also include
# rank-tied pills that cover fewer params, which the rule above excludes.
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples.merge(
    MaxNumberOfParams.loc[
        MaxNumberOfParams["Id"].isin(ManyMaxIds), ["Id", "Target"]
    ]
    .groupby("Id")["Target"]
    .apply(list)
    .reset_index(),
    on="Id",
    how="left",
    suffixes=("_original", "_updated"),
)
MilkMeasurementsDuplicatedSamples["Target"] = MilkMeasurementsDuplicatedSamples[
    "Target_updated"
].fillna(MilkMeasurementsDuplicatedSamples["Target_original"])
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples.drop(
    columns=["Target_original", "Target_updated"]
)

# Export final dataset (with target)

In [27]:
# Attach Breed and Species of the farm each duplicated sample is assigned to
MilkMeasurementsDuplicatedSamples = pd.merge(
    MilkMeasurementsDuplicatedSamples,
    MilkMeasurementsPlusTargetAnimals,
    on="ManagementAreaId",
)

In [28]:
# Final column layout: identifiers, milk parameters, animal info, then the label
final_columns = [
    "Id",
    "ManagementAreaId",
    "SampleDate",
    "EQ",
    "SomaticCellCount",
    "Fat",
    "Protein",
    "Bacteria",
    "Urea",
    "ES",
    "FreezingPoint",
    "Breed",
    "Species",
    "Target",
]
MilkMeasurementsDuplicatedSamples = MilkMeasurementsDuplicatedSamples.loc[
    :, final_columns
]

In [29]:
# Write the final labelled dataset (row index omitted)
final_dataset_csv = output / "MilkMeasurementsPlusTargetDuplicateSample.csv"
MilkMeasurementsDuplicatedSamples.to_csv(final_dataset_csv, index=False)