# Data Preparation  
This notebook covers three stages: **Data Cleaning**, **Data Exploration**, and **Feature Engineering**.


## Part 1: Data Cleaning

In [20]:
# imports
import pandas as pd
import numpy as np

In [21]:
# Relative paths of the two CSV splits read by this notebook.
TRAIN_PATH = "cattle_data_train.csv"
TEST_PATH = "cattle_data_test.csv"

# Read both splits, then report the (rows, columns) shape of each.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
for label, frame in (("Train Shape:", train), ("Test Shape:", test)):
    print(label, frame.shape)

Train Shape: (210000, 36)
Test Shape: (40000, 35)


In [22]:
# Print column names, non-null counts and dtypes, then display the first rows.
# `train.head()` must remain the last expression so the notebook renders it.
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210000 entries, 0 to 209999
Data columns (total 36 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Cattle_ID                210000 non-null  object 
 1   Breed                    210000 non-null  object 
 2   Climate_Zone             210000 non-null  object 
 3   Management_System        210000 non-null  object 
 4   Age_Months               210000 non-null  int64  
 5   Weight_kg                210000 non-null  float64
 6   Parity                   210000 non-null  int64  
 7   Lactation_Stage          210000 non-null  object 
 8   Days_in_Milk             210000 non-null  int64  
 9   Feed_Type                210000 non-null  object 
 10  Feed_Quantity_kg         199519 non-null  float64
 11  Feeding_Frequency        210000 non-null  int64  
 12  Water_Intake_L           210000 non-null  float64
 13  Walking_Distance_km      210000 non-null  float64
 14  Graz

Unnamed: 0,Cattle_ID,Breed,Climate_Zone,Management_System,Age_Months,Weight_kg,Parity,Lactation_Stage,Days_in_Milk,Feed_Type,...,BVD_Vaccine,Rabies_Vaccine,Previous_Week_Avg_Yield,Body_Condition_Score,Milking_Interval_hrs,Date,Farm_ID,Feed_Quantity_lb,Mastitis,Milk_Yield_L
0,CATTLE_133713,Holstein,Tropical,Intensive,114,544.8,4,Mid,62,Concentrates,...,0,1,6.31,3.0,12,2024-01-15,FARM_0301,36.8235,1,12.192634
1,CATTLE_027003,Holstein,Arid,Mixed,136,298.9,4,Mid,213,Crop_Residues,...,0,0,17.16,4.0,12,2023-10-31,FARM_0219,,0,14.717031
2,CATTLE_122459,Holstein,Tropical,Semi_Intensive,64,336.6,4,Late,16,Hay,...,1,0,4.07,3.5,12,2024-05-20,FARM_0802,16.0965,0,14.006142
3,CATTLE_213419,Jersey,Mediterranean,Intensive,58,370.5,1,Early,339,Crop_Residues,...,0,0,10.23,3.0,24,2024-07-22,FARM_0034,40.7925,0,24.324325
4,CATTLE_106260,Guernsey,Subtropical,Intensive,84,641.5,6,Early,125,Mixed_Feed,...,1,1,20.68,3.0,12,2023-01-03,FARM_0695,33.7365,1,12.023074


In [23]:
# Per-column null counts, largest first; only the top 20 columns are printed.
train_missing = train.isna().sum().sort_values(ascending=False)
test_missing = test.isna().sum().sort_values(ascending=False)

print("Missing values in training data")
print(train_missing.head(20))
print()
print("Missing values in testing data")
print(test_missing.head(20))

Missing values in training data
Feed_Quantity_kg           10481
Feed_Quantity_lb           10481
Housing_Score               6279
Cattle_ID                      0
Brucellosis_Vaccine            0
HS_Vaccine                     0
BQ_Vaccine                     0
Anthrax_Vaccine                0
IBR_Vaccine                    0
BVD_Vaccine                    0
Rabies_Vaccine                 0
Previous_Week_Avg_Yield        0
Body_Condition_Score           0
Milking_Interval_hrs           0
Date                           0
Farm_ID                        0
Mastitis                       0
FMD_Vaccine                    0
Humidity_percent               0
Breed                          0
dtype: int64

Missing values in testing data
Feed_Quantity_kg           2015
Feed_Quantity_lb           2015
Housing_Score              1221
FMD_Vaccine                   0
Brucellosis_Vaccine           0
HS_Vaccine                    0
BQ_Vaccine                    0
Anthrax_Vaccine               0
IBR_Vac

In [24]:
def cast_types(df):
    """
    Cast the Date column to datetime and the categorical features to the
    pandas ``category`` dtype.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or in an assignment.

    Categorical columns: Breed, Climate_Zone, Management_System,
    Lactation_Stage, Feed_Type and Farm_ID.
    """
    df["Date"] = pd.to_datetime(df["Date"])

    # These columns are one-hot encoded in a later step.
    category_columns = (
        "Breed",
        "Climate_Zone",
        "Management_System",
        "Lactation_Stage",
        "Feed_Type",
        "Farm_ID",
    )
    for name in category_columns:
        df[name] = df[name].astype("category")

    return df

def unify_feed_quantity(df):
    """
    Placeholder that has not been implemented yet: ``df`` is left untouched
    and None is returned.

    NOTE(review): presumably meant to reconcile Feed_Quantity_kg with
    Feed_Quantity_lb -- confirm the intended behavior before relying on it.
    """
    return None

In [25]:
# Columns that hold physical measurements, so a negative entry in any of them
# is a data-quality problem. Each count starts at 0 and is filled in below.
non_negatives = {
    "Age_Months": 0,
    "Weight_kg": 0,
    "Parity": 0,
    "Days_in_Milk": 0,
    "Feed_Quantity_kg": 0,
    "Water_Intake_L": 0,
    "Walking_Distance_km": 0,
    "Grazing_Duration_hrs": 0,
    "Rumination_Time_hrs": 0,
    "Resting_Hours": 0,
    "Humidity_percent": 0,
    "Previous_Week_Avg_Yield": 0,
    "Milking_Interval_hrs": 0,
    "Feed_Quantity_lb": 0,
    "Milk_Yield_L": 0,
}

# Count negatives with one vectorized comparison per column instead of testing
# every row in a Python loop. NaN < 0 evaluates to False, so missing values are
# not counted. int() keeps plain Python ints in the printed dict.
for category in non_negatives:
    non_negatives[category] = int((train[category] < 0).sum())
print(non_negatives)

# Oddly enough, two columns contain negative values:
#   - Rumination_Time_hrs (hours spent chewing cud per day) is negative in more
#     than half of the training rows.
#   - Milk_Yield_L (the total volume of milk produced by the cow, in liters,
#     during the recorded milking period; this is the target variable to be
#     predicted) is negative in a small number of rows.

{'Age_Months': 0, 'Weight_kg': 0, 'Parity': 0, 'Days_in_Milk': 0, 'Feed_Quantity_kg': 0, 'Water_Intake_L': 0, 'Walking_Distance_km': 0, 'Grazing_Duration_hrs': 0, 'Rumination_Time_hrs': 115627, 'Resting_Hours': 0, 'Humidity_percent': 0, 'Previous_Week_Avg_Yield': 0, 'Milking_Interval_hrs': 0, 'Feed_Quantity_lb': 0, 'Milk_Yield_L': 74}
