<a href="https://colab.research.google.com/github/carlosalzate74/14-ML-challenge/blob/master/data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold, cross_val_score
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

## Data Load

In [2]:
df = pd.read_csv("fish_participant.csv")
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,430.0,26.5,29.0,34.0,12.444,5.134
1,Perch,110.0,20.0,22.0,23.5,5.5225,3.995
2,Roach,160.0,20.5,22.5,25.3,7.0334,3.8203
3,Parkki,60.0,14.3,15.5,17.4,6.5772,2.3142
4,Bream,700.0,30.4,33.0,38.3,14.8604,5.2854


## Data Analysis

In [4]:
df.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,111.0,111.0,111.0,111.0,111.0,111.0
mean,401.676577,26.447748,28.615315,31.422523,9.015701,4.480407
std,338.510755,9.795155,10.498781,11.306311,4.225369,1.69624
min,5.9,7.5,8.4,8.8,1.7388,1.0476
25%,142.5,20.0,22.0,23.5,6.13885,3.5514
50%,300.0,25.4,27.5,30.1,8.1454,4.335
75%,682.5,33.75,36.25,40.15,12.1434,5.6583
max,1550.0,56.0,60.0,64.0,18.957,8.142


In [29]:
fig = make_subplots(rows=3, cols=3)

weight = go.Histogram(x=df["Weight"].values, name="Weight")
lenght1 = go.Histogram(x=df["Length1"].values, name="Lenght1")
lenght2 = go.Histogram(x=df["Length2"].values, name="Lenght2")
lenght3 = go.Histogram(x=df["Length3"].values, name="Lenght3")
lenght2 = go.Histogram(x=df["Height"].values, name="Height")
lenght3 = go.Histogram(x=df["Width"].values, name="Width")

fig.append_trace(weight, 1, 1)
fig.append_trace(lenght1, 1, 2)
fig.append_trace(lenght2, 2, 1)
fig.append_trace(lenght3, 2, 2)
fig.append_trace(lenght2, 3, 1)
fig.append_trace(lenght3, 3, 2)

fig.show()

In [0]:
p = 1.225
df = df.assign(Lmax=df[["Length1", "Length2", "Length3"]].max(axis=1))
df = df.assign(Mass=p * df["Height"] * df["Width"] * df["Lmax"])
df = df.assign(Volume=df["Height"] * df["Width"] * df["Lmax"])
df = df.assign(W=df["Weight"] * (df["Lmax"]))
df = df.round(decimals=3)

df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,Lmax,Mass,Volume,W
0,Bream,430.0,26.5,29.0,34.0,12.444,5.134,34.0,2660.914,2172.175,14620.0
1,Perch,110.0,20.0,22.0,23.5,5.522,3.995,23.5,635.121,518.466,2585.0
2,Roach,160.0,20.5,22.5,25.3,7.033,3.82,25.3,832.759,679.803,4048.0
3,Parkki,60.0,14.3,15.5,17.4,6.577,2.314,17.4,324.435,264.845,1044.0
4,Bream,700.0,30.4,33.0,38.3,14.86,5.285,38.3,3685.049,3008.203,26810.0


In [0]:
cor = df.corr()
cor

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Lmax,Mass,Volume,W
Weight,1.0,0.900963,0.90409,0.909553,0.738314,0.893368,0.909553,0.935133,0.935133,0.974951
Length1,0.900963,1.0,0.999468,0.991363,0.621647,0.862604,0.991363,0.789689,0.789689,0.888694
Length2,0.90409,0.999468,1.0,0.993695,0.637352,0.869211,0.993695,0.79888,0.79888,0.889827
Length3,0.909553,0.991363,0.993695,1.0,0.702566,0.872667,1.0,0.830369,0.830369,0.887565
Height,0.738314,0.621647,0.637352,0.702566,1.0,0.782941,0.702566,0.882972,0.882972,0.621674
Width,0.893368,0.862604,0.869211,0.872667,0.782941,1.0,0.872667,0.883331,0.883331,0.799377
Lmax,0.909553,0.991363,0.993695,1.0,0.702566,0.872667,1.0,0.830369,0.830369,0.887565
Mass,0.935133,0.789689,0.79888,0.830369,0.882972,0.883331,0.830369,1.0,1.0,0.867568
Volume,0.935133,0.789689,0.79888,0.830369,0.882972,0.883331,0.830369,1.0,1.0,0.867568
W,0.974951,0.888694,0.889827,0.887565,0.621674,0.799377,0.887565,0.867568,0.867568,1.0


In [0]:
cor_target = abs(cor["Weight"])
relevant_features = cor_target[cor_target>0.905]
relevant_features

Weight     1.000000
Length3    0.909553
Lmax       0.909553
Mass       0.935133
Volume     0.935133
W          0.974951
Name: Weight, dtype: float64

In [0]:
X = df[["Length3", "Lmax", "Mass", "Volume", "W"]].values
y = df["Weight"].values

X

In [0]:

fig = px.scatter(x=df["Weight"].values, y=df["Length3"].values)
fig.show()

In [0]:
fig = px.scatter(x=df["Weight"].values, y=df["Lmax"].values)
fig.show()

In [0]:
fig = px.scatter(x=df["Weight"].values, y=df["Mass"].values)
fig.show()

In [0]:
fig = px.scatter(x=df["Weight"].values, y=df["Volume"].values)
fig.show()

In [0]:
# Train

In [0]:
fig = px.scatter(x=df["Weight"].values, y=df["W"].values)
fig.show()

In [0]:
n_folds = 5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    print(type(model).__name__ + " score: {:.4f}\n".format(rmse.mean()))

In [0]:
ETR = ExtraTreesRegressor()
score = rmsle_cv(ETR)

ExtraTreesRegressor score: 41.1508

