In [3]:
%load_ext lab_black
%load_ext autoreload

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from pyprojroot import here  # nicked from Simon, cheers man!
import sys
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import variation
import scipy.stats as stats

# Machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old names. Prefer the new name when this
# installation provides it and fall back to the legacy name otherwise, so the
# cell works on both old and new Matplotlib versions.
for legacy_style in ("seaborn", "seaborn-poster"):
    modern_style = legacy_style.replace("seaborn", "seaborn-v0_8", 1)
    plt.style.use(
        modern_style if modern_style in plt.style.available else legacy_style
    )

# Add the project directory to the list of locations the Python interpreter
# searches when importing modules, so the project's own packages resolve.
# The membership check stops repeated runs of this cell from appending the
# same directory again.
project_root = str(here())
if project_root not in sys.path:
    sys.path.append(project_root)

# Custom module imports
from src import df_cleaner
from src import plot_functions as pf
from src import plots

# Path to the raw abalone CSV, built relative to the project directory.
raw_data_path = here().joinpath("data", "abalone.csv")

In [10]:
# Read the raw CSV, pass it through the project's snake_case cleaner
# (presumably a column rename -- confirm in src/df_cleaner), then one-hot
# encode the non-numeric columns with pandas' get_dummies.
df = (
    pd.read_csv(raw_data_path)
    .pipe(df_cleaner.snake_case)
    .pipe(pd.get_dummies)
)
df.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,sex_F,sex_I,sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


## Binning rings into quartile classes

In [22]:
# Cut the ring counts at the 25th, 50th and 75th percentiles and label the
# resulting bins "1" (lowest) to "4" (highest).
bin_labels = [str(tier) for tier in range(1, 5)]
df["quantile"] = pd.qcut(
    x=df["rings"],
    q=[0, 0.25, 0.5, 0.75, 1],
    labels=bin_labels,
)
df.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,sex_F,sex_I,sex_M,quantile
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1,4
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0,2
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1,3
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0,1


In [23]:
# Repeat the quartile cut with retbins=True to get the bin edges back as well.
results, bin_edges = pd.qcut(
    x=df["rings"],
    q=[0, 0.25, 0.5, 0.75, 1],
    labels=bin_labels,
    retbins=True,
)

# There is one more edge than there are labels: each tier is listed with its
# lower edge, and the final (upper) edge is left out of the table.
results_table = pd.DataFrame({"Threshold": bin_edges[:-1], "Tier": bin_labels})
results_table

Unnamed: 0,Threshold,Tier
0,1.0,1
1,8.0,2
2,9.0,3
3,11.0,4


# Predicting Groups

In [36]:
# Quick look at the first three rows, now including the "quantile" column.
df.iloc[:3]

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,sex_F,sex_I,sex_M,quantile
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1,4
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0,2


In [37]:
# Predict the quantile tier from every column except the tier itself and the
# raw ring count, which gives an exact indication of the tier.
target = "quantile"
cols = df.columns.to_list()
features = df.columns.drop([target, "rings"]).to_list()

X = df.loc[:, features].copy()
y = df.loc[:, target].copy()

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [38]:
# Seed the forest so that cross-validation scores and predictions are
# repeatable from run to run; an unseeded forest gives different numbers
# every time the notebook is executed.
model = RandomForestClassifier(random_state=42)

In [39]:
def cross_val(model_name, model, X_train, y_train, cv):
    """Cross-validate a model and print each fold's score plus the mean.

    Keyword arguments:
    model_name -- display name used in the printed report
    model -- estimator to evaluate, e.g. xgb or a random forest
    X_train -- feature data, without the target column
    y_train -- target values
    cv -- number of cross-validation folds
    """
    fold_scores = cross_val_score(model, X_train, y_train, cv=cv)

    # Assemble the whole report first, then emit it in one go: a header line,
    # one line per fold (rounded to 2 d.p.), and the rounded mean.
    report = [f"{model_name} Scores:"]
    report.extend(str(round(fold_score, 2)) for fold_score in fold_scores)
    report.append(f"Average {model_name} score: {round(fold_scores.mean(),2)}")
    print("\n".join(report))

In [40]:
# 4-fold cross-validation of the random forest on the training split.
cross_val(
    model_name="Random Forest",
    model=model,
    X_train=X_train,
    y_train=y_train,
    cv=4,
)

Random Forest Scores:
0.58
0.55
0.55
0.58
Average Random Forest score: 0.57


In [41]:
# Train on the full training split, then predict the held-out rows.
# (fit returns the fitted estimator, so the two steps can be chained.)
preds = model.fit(X_train, y_train).predict(X_test)

In [42]:
# Put the test features, the true tier (joined on the row index) and the
# model's predictions side by side in one frame.
result_df = X_test.merge(y_test, left_index=True, right_index=True).assign(
    **{"Predicted Values": preds}
)

In [43]:
# Display the held-out rows with their true "quantile" tier next to the
# "Predicted Values" column for a visual comparison.
result_df

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M,quantile,Predicted Values
866,0.605,0.455,0.160,1.1035,0.4210,0.3015,0.3250,0,0,1,2,4
1483,0.590,0.440,0.150,0.8725,0.3870,0.2150,0.2450,0,0,1,1,3
599,0.560,0.445,0.195,0.9810,0.3050,0.2245,0.3350,1,0,0,4,4
1702,0.635,0.490,0.170,1.2615,0.5385,0.2665,0.3800,1,0,0,2,3
670,0.475,0.385,0.145,0.6175,0.2350,0.1080,0.2150,0,0,1,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
601,0.385,0.315,0.110,0.2860,0.1225,0.0635,0.0835,1,0,0,3,1
233,0.275,0.215,0.075,0.1155,0.0485,0.0290,0.0350,0,1,0,1,1
2644,0.490,0.375,0.125,0.5445,0.2790,0.1150,0.1300,0,1,0,1,1
931,0.445,0.335,0.110,0.4355,0.2025,0.1095,0.1195,1,0,0,1,3
