In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
abalone_df = pd.read_csv('abalone.csv')
abalone_df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
abalone_df= pd.get_dummies(abalone_df)
abalone_df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


In [4]:
# standardization

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = abalone_df.drop('Rings', axis =1)
y = abalone_df['Rings']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

#Standardize
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)


In [5]:
# Create bins and bin labels for the Rings column

ring_bins = [0, 8, 10, 30]
ring_labels = ['under 9', '9-10', 'over 10']

# Bin the Rings column
# cut() returns a Pandas Series containing each of the binned column's values translated into their corresponding bins
#pd.cut(abalone_df["Rings"], ring_bins, labels=ring_labels)
# We can append our bins to abalone_df

abalone_df["Ring Range"] = pd.cut(abalone_df["Rings"], ring_bins, labels=ring_labels)
abalone_df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M,Ring Range
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1,over 10
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1,under 9
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0,9-10
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1,9-10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0,under 9


In [6]:
abalone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Length          4177 non-null   float64 
 1   Diameter        4177 non-null   float64 
 2   Height          4177 non-null   float64 
 3   Whole weight    4177 non-null   float64 
 4   Shucked weight  4177 non-null   float64 
 5   Viscera weight  4177 non-null   float64 
 6   Shell weight    4177 non-null   float64 
 7   Rings           4177 non-null   int64   
 8   Sex_F           4177 non-null   uint8   
 9   Sex_I           4177 non-null   uint8   
 10  Sex_M           4177 non-null   uint8   
 11  Ring Range      4177 non-null   category
dtypes: category(1), float64(7), int64(1), uint8(3)
memory usage: 277.6 KB


In [7]:
#nominal values
from sklearn.preprocessing import LabelEncoder

ring_labels = LabelEncoder()
abalone_df['Ring Range'] = ring_labels.fit_transform(abalone_df['Ring Range'].values)
abalone_df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M,Ring Range
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1,2
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0,2


In [8]:
abalone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Length          4177 non-null   float64
 1   Diameter        4177 non-null   float64
 2   Height          4177 non-null   float64
 3   Whole weight    4177 non-null   float64
 4   Shucked weight  4177 non-null   float64
 5   Viscera weight  4177 non-null   float64
 6   Shell weight    4177 non-null   float64
 7   Rings           4177 non-null   int64  
 8   Sex_F           4177 non-null   uint8  
 9   Sex_I           4177 non-null   uint8  
 10  Sex_M           4177 non-null   uint8  
 11  Ring Range      4177 non-null   int64  
dtypes: float64(7), int64(2), uint8(3)
memory usage: 306.1 KB


In [14]:
# import methods for initial decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

to_drop = abalone_df[['Rings', 'Ring Range']]
X = abalone_df.drop(to_drop, axis =1)
y = abalone_df['Ring Range']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)


# decision tree classifier
dtc = DecisionTreeClassifier(random_state=42)

dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

print('Baseline output')
print(classification_report(y_test, y_pred))

Baseline output
              precision    recall  f1-score   support

           0       0.43      0.42      0.42       403
           1       0.58      0.59      0.59       414
           2       0.70      0.71      0.70       437

    accuracy                           0.58      1254
   macro avg       0.57      0.57      0.57      1254
weighted avg       0.57      0.58      0.58      1254



#### Feature selection and tuning

In [15]:
from sklearn.model_selection import GridSearchCV

to_drop = abalone_df[['Rings', 'Ring Range']]
X = abalone_df.drop(to_drop, axis =1)
y = abalone_df['Ring Range']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)


# decision tree classifier
dtc = DecisionTreeClassifier(random_state=42)

dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

#feature selection
feature_imp = pd.Series(dtc.feature_importances_,index=abalone_df.drop(to_drop, axis =1).columns).sort_values(ascending=False)
print(feature_imp)
print(dtc.n_features_)

# define the grid of hyperparameters 'params_dt'
params_dt = {
    'max_depth': [2, 5, 8, 10],
    'min_samples_leaf': [4,8,10,12],
    'max_features': [1,2,3,4],
    'min_impurity_decrease': [0.0, 0.01,0.001, 0.0001]
}
# instantiate a 10-fold CV grid search object
grid_dt = GridSearchCV(estimator =dtc,
                      param_grid=params_dt,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
# fit to the training data
grid_dt.fit(X_train, y_train)
#extract best hyperparameters
best_hyperparams = grid_dt.best_params_
print('Best hyperparameters: ', best_hyperparams)

Shell weight      0.374844
Shucked weight    0.149362
Viscera weight    0.110080
Whole weight      0.098142
Length            0.083516
Height            0.069092
Diameter          0.064647
Sex_I             0.030250
Sex_M             0.011817
Sex_F             0.008250
dtype: float64
10
Best hyperparameters:  {'max_depth': 10, 'max_features': 3, 'min_impurity_decrease': 0.001, 'min_samples_leaf': 10}


In [16]:
# extract best model
best_model = grid_dt.best_estimator_
# fit best model
best_model.fit(X_train,  y_train)
y_pred = best_model.predict(X_test)

print('Best fit:\n', classification_report(y_test, y_pred))

Best fit:
               precision    recall  f1-score   support

           0       0.48      0.38      0.42       403
           1       0.56      0.70      0.62       414
           2       0.75      0.72      0.74       437

    accuracy                           0.60      1254
   macro avg       0.60      0.60      0.59      1254
weighted avg       0.60      0.60      0.60      1254



In [17]:

X = abalone_df[['Shell weight', 'Shucked weight','Viscera weight']]
y = abalone_df['Ring Range']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)


# decision tree classifier
dtc = DecisionTreeClassifier(random_state=42)

# define the grid of hyperparameters 'params_dt'
params_dt = {
    'max_depth': [2, 5, 8, 10],
    'min_samples_leaf': [4,8,10,12],
    'max_features': [1,2,3,4],
    'min_impurity_decrease': [0.0, 0.01,0.001, 0.0001]
}
# instantiate a 10-fold CV grid search object
grid_dt = GridSearchCV(estimator =dtc,
                      param_grid=params_dt,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
# fit to the training data
grid_dt.fit(X_train, y_train)
#extract best hyperparameters
best_hyperparams = grid_dt.best_params_
print('Best hyperparameters: ', best_hyperparams)

Best hyperparameters:  {'max_depth': 5, 'max_features': 3, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4}


 0.57340923 0.57340923 0.57340923 0.57340923 0.57340923 0.57340923
 0.57340923 0.57340923 0.57340923 0.57340923 0.58672682 0.58672682
 0.58672682 0.58672682 0.58672682 0.58672682 0.58672682 0.58672682
 0.58672682 0.58672682 0.58672682 0.58672682 0.58672682 0.58672682
 0.58672682 0.58672682 0.59973117 0.59973117 0.59973117 0.59973117
 0.59973117 0.59973117 0.59973117 0.59973117 0.59973117 0.59973117
 0.59973117 0.59973117 0.59973117 0.59973117 0.59973117 0.59973117
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan 0.60486582 0.60589322
 0.60075623 0.61751601 0.58228061 0.58228061 0.58228061 0.58228061
 0.59356094 0.59492496 0.59458366 0.59424237 0.60486582 0.60589322
 0.60075623 0.61717355 0.60794684 0.61478213 0.60896138 0.61342513
 0.57644701 0.57644701 0.57644701 0.57644701 0.61273669 0.61615083
 0.6123989  0.61718991 0.60520712 0.61478213 0.60896138 0.6134

In [18]:
# extract best model
best_model = grid_dt.best_estimator_
# fit best model
best_model.fit(X_train,  y_train)
y_pred = best_model.predict(X_test)

print('Best fit:\n', classification_report(y_test, y_pred))

Best fit:
               precision    recall  f1-score   support

           0       0.48      0.49      0.49       403
           1       0.59      0.61      0.60       414
           2       0.77      0.73      0.75       437

    accuracy                           0.61      1254
   macro avg       0.61      0.61      0.61      1254
weighted avg       0.62      0.61      0.61      1254



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

to_drop = abalone_df[['Rings', 'Ring Range']]
X = abalone_df.drop(to_drop, axis =1)
y = abalone_df['Ring Range']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state=42)

#fit model no training data
model = XGBClassifier(use_label_encoder=False)
model.fit(X_train, y_train)




#### My kernel dies every time I try to run xgboost :( 

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

to_drop = abalone_df[['Rings', 'Ring Range']]
X = abalone_df.drop(to_drop, axis =1)
y = abalone_df['Ring Range']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

# instantiate gradientboostingclassifier

sgbt = GradientBoostingClassifier(max_depth=1,
                                 subsample=0.8,
                                 max_features=0.2,
                                 n_estimators=300,
                                 random_state=42)
sgbt.fit(X_train, y_train)
y_pred = sgbt.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.44      0.47       403
           1       0.65      0.69      0.67       414
           2       0.75      0.78      0.77       437

    accuracy                           0.64      1254
   macro avg       0.63      0.64      0.63      1254
weighted avg       0.64      0.64      0.64      1254



In [10]:

to_drop = abalone_df[['Rings', 'Ring Range']]
X = abalone_df.drop(to_drop, axis =1)
y = abalone_df['Ring Range']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

# instantiate gradientboostingclassifier

sgbt = GradientBoostingClassifier(max_depth=1,
                                 max_features=0.2,
                                 n_estimators=300,
                                 random_state=42)
sgbt.fit(X_train, y_train)
y_pred = sgbt.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.44      0.47       403
           1       0.64      0.68      0.66       414
           2       0.74      0.78      0.76       437

    accuracy                           0.64      1254
   macro avg       0.63      0.64      0.63      1254
weighted avg       0.63      0.64      0.63      1254

