In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd 
from joblib import dump 

In [11]:
# Read the two input tables; both paths are relative to the working directory.
heuristics = pd.read_csv('heuristics.csv')

# Only the two listed columns of the counts file are loaded.
seals_count = pd.read_csv(
    'seals_count.csv',
    usecols=['clump', 'Total Number of Seals'],
)

In [3]:
# Discard rows of the counts table that contain any missing value.
seals_count = seals_count.dropna()

# Inner-join heuristics.key against seals_count.clump, then remove both
# join columns so they are not carried into the modelling frame.
df = (
    heuristics
    .merge(seals_count, left_on='key', right_on='clump', how='inner')
    .drop(columns=['key', 'clump'])
)

In [4]:
# Show the first five rows of the merged frame as a quick sanity check.
df.head(5)

Unnamed: 0,width,height,avg_r,sd_r,avg_g,sd_g,avg_b,sd_b,Total Number of Seals
0,222,160,148.968468,8.861153,150.972917,10.472847,144.783333,35.448822,2.0
1,400,244,128.905,16.282485,133.304645,25.476204,125.685543,27.45159,8.0
2,324,314,129.433128,18.470493,148.5,10.585889,136.547643,32.810271,9.0
3,248,160,138.185484,11.64301,132.254167,17.457388,139.239592,29.105456,2.0
4,162,238,127.631687,9.237278,120.865546,9.84168,127.14724,25.30867,2.0


In [13]:
# Sum of the 'width' column for each distinct 'Total Number of Seals' value,
# returned as a {label: summed_width} dict.
# NOTE(review): if the goal is to inspect how many samples each label has,
# a row count per group would show that directly; this sums widths — confirm intent.
(
    df
    .groupby('Total Number of Seals')['width']
    .sum()
    .to_dict()
)

{0.0: 792,
 1.0: 6408,
 2.0: 77154,
 3.0: 24062,
 4.0: 25152,
 5.0: 9446,
 6.0: 5762,
 7.0: 1914,
 8.0: 2500,
 9.0: 1246,
 11.0: 300,
 13.0: 360}

In [14]:
# The seal count is the regression target; every other column of df is a feature.
y = df['Total Number of Seals']
X = df.drop(columns=['Total Number of Seals'])

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=451,
)

In [15]:
# Random forest with 100 trees, max_features=6, and a fixed seed.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_features=6,
    random_state=451,
)

# Fit on the training split only; the held-out split is kept for evaluation.
rf_regressor.fit(X_train, y_train)

In [16]:
# Predict on the held-out split.
y_pred = rf_regressor.predict(X_test)

# Hold-out error metrics, reported to two decimals.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")

# 5-fold cross-validation over the full X / y. The scorer is the *negated*
# MAE, hence the leading minus to turn the scores back into positive errors.
mae_scores = -cross_val_score(
    rf_regressor,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print(f'Mean Absolute Error (Cross-Validation): {mae_scores.mean()}')

Mean Squared Error: 0.89
Mean Absolute Error: 0.58
Mean Absolute Error (Cross-Validation): 0.601788247213779


In [17]:
# Persist the fitted regressor (trained on the training split) to a joblib
# file in the working directory.
dump(rf_regressor, filename='random_forest_mod1.joblib')

['random_forest_mod1.joblib']