In [1]:
# import libraries
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.preprocessing as sk_preprocessing
import sklearn.model_selection as sk_modelselect
from sklearn.tree import DecisionTreeClassifier # Decision tree classifier
from sklearn import metrics # scikit-learn metrics module for computing accuracy

import warnings
warnings.filterwarnings('ignore') # Ignore warning messages

In [2]:
# Load the dataset; the first row of the CSV supplies the column names.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
dataset = pd.read_csv("../data/dataset_1.csv", header=0)

# Preview the first ten rows, then report the (rows, columns) shape.
display(dataset.head(10))
print(dataset.shape)

Unnamed: 0,Age,Mortality female,Mortality male,sex ratio,Total Eggs,Egg Weight,Feed female,Feed male,Hatching Eggs,total hatched eggs
0,26.0,5,2,10.29,6850,53.2,133.0,130,6405,3343.41
1,26.1,2,2,10.3,7630,53.4,133.0,130,7145,3729.69
2,26.2,2,1,10.312,8130,53.5,136.0,130,7630,3982.86
3,26.3,3,-,10.318,8540,53.6,136.0,130,7990,4170.78
4,26.4,2,-,10.316,9560,53.7,136.0,130,8915,4653.63
5,26.5,3,-,10.315,9490,53.8,140.0,130,8880,4635.36
6,26.6,3,1,10.313,9780,54.0,140.0,130,9180,4791.96
7,27.0,2,2,10.317,10450,54.1,140.0,130,9826,6317.135
8,27.1,2,2,10.329,10800,54.3,144.0,130,10110,6499.719
9,27.2,5,2,10.342,10860,54.5,144.0,130,10170,6538.293


(302, 10)


In [3]:
# Count NaN entries per column. Placeholder strings such as '-' or '.' are
# ordinary values as far as isna() is concerned, so they are not counted here.
missing_counts = dataset.isna().sum()
print(missing_counts)

Age                   0
Mortality female      0
Mortality male        0
sex ratio             0
Total Eggs            0
Egg Weight            0
Feed female           0
Feed male             0
Hatching Eggs         0
total hatched eggs    0
dtype: int64


In [4]:
# Collect the names of the columns whose dtype is object or bool.
column_dtypes = dataset.dtypes
non_numeric_mask = (column_dtypes == object) | (column_dtypes == bool)
categorical_cols = dataset.columns[non_numeric_mask].tolist()
print(categorical_cols)

['Mortality female', 'Mortality male']


In [5]:
# Print the distinct values held by each non-numeric column, which makes stray
# placeholder entries easy to spot.
for col in categorical_cols:
    print(f"{col}: {dataset[col].unique()}")

Mortality female: ['5' '2' '3' '4' '6' '58' '67' '32' '8' '11' '17' '12' '10' '.' '7']
Mortality male: ['2' '1' '-' '75' '3' '4' '0']


In [6]:
# '-' and '.' appear as placeholder entries in the mortality columns; treat
# them as a count of zero.
dataset = dataset.replace(['-', '.'], 0)

# The affected columns were read as strings, so after the replacement they
# still have object dtype (digit strings mixed with the integer 0). Convert
# them to a real numeric dtype so downstream code receives numbers.
# pd.to_numeric raises on any value it cannot parse, which surfaces any other
# unexpected placeholder instead of silently passing it on.
# The loop is a no-op when no object columns remain, so re-running the cell is safe.
for col in dataset.columns[dataset.dtypes == object]:
    dataset[col] = pd.to_numeric(dataset[col])

display(dataset.head())

Unnamed: 0,Age,Mortality female,Mortality male,sex ratio,Total Eggs,Egg Weight,Feed female,Feed male,Hatching Eggs,total hatched eggs
0,26.0,5,2,10.29,6850,53.2,133.0,130,6405,3343.41
1,26.1,2,2,10.3,7630,53.4,133.0,130,7145,3729.69
2,26.2,2,1,10.312,8130,53.5,136.0,130,7630,3982.86
3,26.3,3,0,10.318,8540,53.6,136.0,130,7990,4170.78
4,26.4,2,0,10.316,9560,53.7,136.0,130,8915,4653.63


In [7]:
# file_path = '/home/new_dataset2.csv'

# # Use the to_csv() method to save the DataFrame as a CSV file
# dataset.to_csv(file_path, index=False)

In [8]:
# Features are every column except the target and the two columns excluded
# from modelling; the target is 'total hatched eggs'.
excluded_cols = ['total hatched eggs', 'Hatching Eggs', 'Feed male']
x_data = dataset.drop(columns=excluded_cols)
y_data = dataset['total hatched eggs']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sk_modelselect.train_test_split(
    x_data,
    y_data,
    test_size=0.20,
    random_state=0,
)

# Preview the training features and the matching target values.
display(X_train.head())
display(y_train.head())

Unnamed: 0,Age,Mortality female,Mortality male,sex ratio,Total Eggs,Egg Weight,Feed female
74,36.4,11,2,10.515,12680,62.1,166.0
153,47.6,3,0,10.894,10690,66.6,158.0
64,35.1,6,2,10.467,12990,61.1,168.0
295,68.1,5,3,11.924,6900,70.1,150.0
287,67.0,4,2,11.784,6870,69.8,152.0


74     10537.52
153     9145.15
64     10838.96
295     5270.00
287     5262.50
Name: total hatched eggs, dtype: float64

In [9]:
# Fit a decision-tree *regressor* on the training split and report its mean
# absolute error on the held-out split. The target is a float column, so this
# is a regression tree - no classifier or Gini criterion is involved.
from sklearn.tree import DecisionTreeRegressor

# Tree depth is capped at 10 and the seed is fixed so repeated runs build the
# same tree. fit() returns the estimator itself, so it can be chained here.
regressor = DecisionTreeRegressor(max_depth=10, random_state=0).fit(X_train, y_train)

y_pred = regressor.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 123.45993306010926


In [10]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn import metrics

# for depth in range(5, 16):
#     regressor = DecisionTreeRegressor(max_depth=depth, random_state=0)
#     regressor.fit(X_train, y_train)

#     y_pred = regressor.predict(X_test)

#     mae = metrics.mean_absolute_error(y_test, y_pred)
#     print("Depth:", depth, "Mean Absolute Error:", mae)


In [11]:
# !pip install graphviz
# !pip install scikit-learn

# from sklearn import tree
# import graphviz

# dot_data = tree.export_graphviz(regressor, out_file=None, filled=True, rounded=True)

# graph = graphviz.Source(dot_data)
# graph.render("decision_tree")  # Save the visualization as a file
# graph.view()  # Open a window to display the visualization

In [12]:
# Predict the target for the held-out rows and show the raw values.
predictions = regressor.predict(X_test)
print(predictions)

# Pair each prediction with its actual target value. 'Actual' is a Series, so
# the frame takes its index from y_test and the prediction array is laid
# against it positionally.
results_df = pd.DataFrame(
    {
        'Predictions': predictions.flatten(),
        'Actual': y_test,
    }
)

# Persist the comparison table; the row index is not written to the file.
results_df.to_csv('predictions_decisiontree.csv', index=False)

[ 6723.          9190.2         7138.          7493.8
 10967.68633333  6142.5         8102.4         8504.1
  7138.          7208.          9819.7         9271.725
  7208.          6911.175       8525.538      10838.96
 10196.4         9819.7         7830.6        10196.4
  8689.56        7428.75        6720.          5990.46
  9350.83666667  7005.6         5888.          5373.27875
  8361.4        10967.68633333  7830.6         5990.46
  9603.075       6538.293       8504.1         6322.68
  6240.78       10735.8         7428.75        8504.1
  4791.96       11177.719      10561.8         6090.4
  9975.095       9383.22        9819.7         7856.8
  5787.6         6538.293      10869.864       9603.075
  9730.6        10660.2         9730.6         6889.
  9383.22        4816.24        8839.38        9655.9
 10440.        ]
