Predicting the probability of a lifespan of more than 11 years in abalone oysters with a decision tree algorithm

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
from sklearn import metrics
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Load the abalone dataset and discard every row that has a missing value.
data = pd.read_csv('abalone.csv').dropna()
print(data.shape)
data.head()

In [None]:
# Binarize the target in a single pass: 1 when age is greater than 11,
# otherwise 0. One comparison replaces two chained np.where rewrites, the
# second of which was evaluated on values the first had already recoded.
# NOTE: run this once per load -- re-running it on already-binarized labels
# maps every row to 0, because neither 0 nor 1 is greater than 11.
data['age'] = (data['age'] > 11).astype(int)
data.head()

In [None]:
# Class balance check: number of samples per value of the 'age' column.
age_labels = data['age']
age_labels.value_counts()

In [None]:
# Bar chart of the number of samples for each value of 'age'.
sns.countplot(data=data, x='age', palette='hls')
plt.show()

In [None]:
# Mean of every numeric feature per value of 'age'.
# numeric_only=True keeps the categorical 'Sex' column out of the
# aggregation; without it pandas >= 2.0 raises a TypeError when it tries to
# average strings (older versions silently dropped such columns).
data.groupby('age').mean(numeric_only=True)

In [None]:
# One-hot encode the categorical 'Sex' column and build a feature table
# that no longer contains the raw categorical column.
cat_vars = ['Sex']

# Indicator columns are prefixed with 'Sex' and joined onto the original
# rows by index. (The former placeholder string assigned to cat_list was
# overwritten immediately and has been removed.)
cat_list = pd.get_dummies(data['Sex'], prefix='Sex')
data1 = data.join(cat_list)
data = data1

# Keep every column except the raw categorical ones.
data_vars = data.columns.values.tolist()
to_keep = [column for column in data_vars if column not in cat_vars]
data_final = data[to_keep]

In [17]:
# Full feature set: the physical measurements plus the one-hot 'Sex' columns.
cols = [
    'Length', 'Diameter', 'Height',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
    'Sex_F', 'Sex_I', 'Sex_M',
]
x = data_final[cols]
y = data_final['age']
# Features and target side by side in one frame.
final_data = x.join(y)
# 70/30 train/test split. A fixed random_state makes the partition (and
# every metric derived from it) reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [None]:
# Rank the training features by random-forest importance, highest first.
# Labels come from the columns the forest is actually fitted on, instead of
# a hard-coded slice of another frame.
feat_labels = x_train.columns
# Fixed seed so the ranking is reproducible between runs.
forest = RandomForestClassifier(random_state=42)
forest.fit(x_train, y_train)
importances = forest.feature_importances_
# Feature positions sorted by descending importance.
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices, start=1):
    # Look up the name and the score with the SAME sorted position; using
    # the loop counter for the name pairs labels with the wrong scores.
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

In [22]:
# Reduced feature set used for the decision tree.
# NOTE(review): these three are simply the first three columns of the full
# feature list -- confirm they are the features the importance ranking
# actually favours.
cols = ['Length', 'Diameter', 'Height']
x = data_final[cols]
y = data_final['age']
# 70/30 train/test split. A fixed random_state makes the partition (and
# the metrics reported for the tree) reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [None]:
# Fit a depth-3 decision tree (entropy split criterion) on the training
# split and predict the held-out test split.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Evaluation: confusion matrix plus the precision / recall / F-score /
# support tuple returned by precision_recall_fscore_support.
cm = confusion_matrix(y_test, y_pred)
prfs = precision_recall_fscore_support(y_test, y_pred)
precision, recall, f_score, support = prfs

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print('Confusion matrix: \n', cm)
print('Precision: \n', precision)
print('Recall: \n', recall)
print('F-Score: \n', f_score)
print('Support: \n', support)