# CS 235 - Pet Adoption Speed - Random Forest Model

# Preprocessing
This is the same preprocessing used by the rest of the group. I used Ashley Pang's preprocessing code, since decision trees are the model most similar to mine.

In [1]:
import pandas as pd

# Load the label lookup tables and the training table from the local
# petfinder-adoption-prediction directory.
df_stateLabels = pd.read_csv("petfinder-adoption-prediction/PetFinder-StateLabels.csv")
df_colorLabels = pd.read_csv("petfinder-adoption-prediction/PetFinder-ColorLabels.csv")
df_breedLabels = pd.read_csv("petfinder-adoption-prediction/PetFinder-BreedLabels.csv")
df_train = pd.read_csv("petfinder-adoption-prediction/train/train.csv")
# df_test = pd.read_csv('petfinder-adoption-prediction/test/test.csv')

# Alternative loader for Google Colab (reads the same files from a mounted Drive).
# Kept commented out; uncomment it and comment out the reads above to use it.
#
# import pandas as pd
# from google.colab import drive
#
# drive.mount('/content/drive/', force_remount=True)
# df_train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/petfinder-adoption-prediction/train/train.csv')
# df_breedLabels = pd.read_csv('/content/drive/My Drive/Colab Notebooks/petfinder-adoption-prediction/PetFinder-BreedLabels.csv')
# df_colorLabels = pd.read_csv('/content/drive/My Drive/Colab Notebooks/petfinder-adoption-prediction/PetFinder-ColorLabels.csv')
# df_stateLabels = pd.read_csv('/content/drive/My Drive/Colab Notebooks/petfinder-adoption-prediction/PetFinder-StateLabels.csv')
# # df_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/petfinder-adoption-prediction/test/test.csv')

# Show the raw training frame as loaded.
print(df_train)

       Type            Name  Age  Breed1  Breed2  Gender  Color1  Color2  \
0         2          Nibble    3     299       0       1       1       7   
1         2     No Name Yet    1     265       0       1       1       2   
2         1          Brisco    1     307       0       1       2       7   
3         1            Miko    4     307       0       2       1       2   
4         1          Hunter    1     307       0       1       1       0   
...     ...             ...  ...     ...     ...     ...     ...     ...   
14988     2             NaN    2     266       0       3       1       0   
14989     2  Serato & Eddie   60     265     264       3       1       4   
14990     2         Monkies    2     265     266       3       5       6   
14991     2         Ms Daym    9     266       0       2       4       7   
14992     1            Fili    1     307     307       1       2       0   

       Color3  MaturitySize  ...  Health  Quantity  Fee  State  \
0           0        

In [2]:
# preprocessing
import numpy as np
from scipy import stats

# Rows whose |z-score| in Fee or Age exceeds this many standard deviations
# are treated as outliers and removed.
OUTLIER_Z_THRESHOLD = 5

# Drop the name, ID and description columns; they are not used as features below.
df_train = df_train.drop(['Name', 'RescuerID', 'PetID', 'Description'], axis=1)
# Fixed: the result was previously assigned to the misspelled name
# `df_breedLables`, which left `df_breedLabels` unchanged and created a stray variable.
df_breedLabels = df_breedLabels.drop(['Type'], axis=1)

# Drop rows with missing information
df_train = df_train.dropna()

# Calculate the absolute z-score for the two numerical columns that are screened
z_scores_fee = np.abs(stats.zscore(df_train['Fee']))
z_scores_age = np.abs(stats.zscore(df_train['Age']))

# A row is an outlier if either column exceeds the threshold
outlier_rows_fee = z_scores_fee > OUTLIER_Z_THRESHOLD
outlier_rows_age = z_scores_age > OUTLIER_Z_THRESHOLD
combined_outlier_rows = outlier_rows_fee | outlier_rows_age

# Remove rows with outliers
df_train = df_train[~combined_outlier_rows]
print(f"Removed {combined_outlier_rows.sum()} rows due to outliers. There are now {df_train.shape[0]} rows in our training data.")
# print(df_train.head())

# Separate the target column from the training data (pop removes it from df_train)
adoption_speed_train = df_train.pop('AdoptionSpeed')

Removed 197 rows due to outliers. There are now 14796 rows in our training data.


In [3]:
# Separating categorical features
binary_categorical_columns = ['Gender','Vaccinated','Dewormed', 'Sterilized']
ordinal_categorical_columns = ['MaturitySize', 'FurLength', 'Health', 'Color1', 'Color2', 'Color3']
nonordinal_categorical_columns = ['Breed1', 'Breed2']

from sklearn.preprocessing import LabelEncoder
# Encoding: one-hot for the binary group, integer label codes for the ordinal group
label_encoder = LabelEncoder()
df_train = pd.get_dummies(df_train, columns=binary_categorical_columns)
for column in ordinal_categorical_columns:
  df_train[column] = label_encoder.fit_transform(df_train[column])

# Split non-ordinal categorical columns into n features (similar to Benjamin's preprocessing)
n = 10

for feature in nonordinal_categorical_columns:
  top_N_values = df_train[feature].value_counts().head(n)
  print(f'Top {n} values for {feature}:\n{top_N_values}\n')

  top_N_value_names = top_N_values.index
  # Keep values that are among the top N by frequency; replace every other
  # value with -1 (other). This is a vectorized replacement for the previous
  # row-by-row iterrows()/.at loop and gives the same result.
  is_top_N = df_train[feature].isin(top_N_value_names)
  df_train[feature] = df_train[feature].where(is_top_N, -1)

  # One-hot encode the reduced column (top N values plus the -1 bucket)
  df_train = pd.get_dummies(df_train, columns=[feature])

Top 10 values for Breed1:
307    5903
266    3623
265    1257
299     342
264     291
292     262
285     202
141     199
205     175
218     161
Name: Breed1, dtype: int64

Top 10 values for Breed2:
0      10613
307     1723
266      597
265      321
299      137
264      116
292      104
218       91
141       85
285       76
Name: Breed2, dtype: int64



In [4]:
from sklearn.model_selection import train_test_split
# Two-stage stratified split: 85% training, 10% test, 5% validation.
# Stage 1: hold out 15% of the rows as a temporary set; the other 85% is the training set.
df_train_85, X_temp, adoption_speed_train_85, y_temp = train_test_split(
    df_train,
    adoption_speed_train,
    test_size=0.15,
    random_state=42,
    stratify=adoption_speed_train,
)
# Stage 2: split the temporary set 2:1, giving the 10% test set and the 5% validation set.
df_test_10, df_val_5, adoption_speed_test_10, adoption_speed_val_5 = train_test_split(
    X_temp,
    y_temp,
    test_size=1/3,
    random_state=42,
    stratify=y_temp,
)

End of pre-processing.

# Random Forest Model

Used for understanding Random Forest Classification:  
https://www.analyticsvidhya.com/blog/2021/06/understanding-random-forest/  
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 

How the Random Forest Model works:
* Create n decision trees, each trained on a random subset of the training samples (controlled by the hyperparameters).  
* Pass test data into each decision tree and choose the average/majority result from those.  

In [5]:
# Short aliases for the 85% training features (X) and their labels (Y).
X, Y = df_train_85, adoption_speed_train_85

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification

# Baseline forest with default hyperparameters. random_state=2 is fixed for
# demonstration, and oob_score=True makes the out-of-bag score available after fitting.
classifier = RandomForestClassifier(
    random_state=2,
    oob_score=True,
    n_jobs=-1,
)
classifier.fit(X, Y)
print('OOB score:', classifier.oob_score_)

OOB score: 0.3928912213740458


# Default Parameters Validation

In [16]:
# method taken from Benjamin Denzler
# Fits a default-parameter forest on the 85% training split and reports accuracy on
# the train/test/validation splits, plus weighted and per-class precision/recall on
# the test split.

from sklearn.metrics import precision_score, recall_score
classifier = RandomForestClassifier(n_jobs=-1, oob_score=True)		# removed random_state
classifier.fit(df_train_85, adoption_speed_train_85)
test_predictions = classifier.predict(df_test_10)

# Percentages use the `.1%` format spec. The previous `round(x, 3) * 100` printed
# float artifacts such as "33.900000000000006%".
accuracy = classifier.score(df_train_85, adoption_speed_train_85)
print(f'Accuracy on training: {accuracy:.1%}')

accuracy = classifier.score(df_test_10, adoption_speed_test_10)
print(f'Accuracy on test: {accuracy:.1%}')

accuracy = classifier.score(df_val_5, adoption_speed_val_5)
print(f'Accuracy on validation: {accuracy:.1%}')

# zero_division=0 returns 0 precision/recall if no positive samples for a class
avg_precision = precision_score(
    adoption_speed_test_10, test_predictions, average='weighted', zero_division=0
)
avg_recall = recall_score(
    adoption_speed_test_10, test_predictions, average='weighted', zero_division=0
)
# average=None returns one score per class instead of a single aggregate
precision_per_class = precision_score(
    adoption_speed_test_10, test_predictions, average=None, zero_division=0
)
recall_per_class = recall_score(
    adoption_speed_test_10, test_predictions, average=None, zero_division=0
)

print(f'Average weighted precision: {avg_precision:.1%}')
print(f'Average weighted recall: {avg_recall:.1%}')
# Classes are labelled by their position in the per-class score arrays
for class_label, (precision, recall) in enumerate(zip(precision_per_class, recall_per_class)):
    print(f'Class {class_label}: Precision = {precision:.1%}, Recall = {recall:.1%}')

Accuracy on training: 98.4%
Accuracy on test: 38.4%
Accuracy on validation: 40.5%
Average weighted precision: 37.1%
Average weighted recall: 38.4%
Class 0: Precision = 15.4%, Recall = 5.0%
Class 1: Precision = 33.900000000000006%, Recall = 34.1%
Class 2: Precision = 35.8%, Recall = 37.8%
Class 3: Precision = 32.5%, Recall = 23.200000000000003%
Class 4: Precision = 46.5%, Recall = 57.099999999999994%


In [18]:
# validation method taken from Benjamin Denzler
# Runs k-fold cross-validation of `classifier` on the 85% training split and
# prints the mean accuracy and weighted precision/recall across folds.
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score

num_folds = 10

# zero_division=0 makes precision/recall 0 (instead of warning) for a class
# with no positive samples in a fold
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(
        precision_score, average='weighted', zero_division=0
    ),
    'recall': make_scorer(
        recall_score, average='weighted', zero_division=0
    )
}

scores = cross_validate(
    classifier, df_train_85, adoption_speed_train_85,
    cv=num_folds, scoring=scoring_metrics
)

# Percentages use the `.1%` format spec; `round(x, 3) * 100` can print float
# artifacts such as "40.699999999999996%".
print(f'\n{num_folds}-fold validation accuracy mean: {scores["test_accuracy"].mean():.1%}')
print(f'{num_folds}-fold validation precision mean: {scores["test_precision"].mean():.1%}')
print(f'{num_folds}-fold validation recall mean: {scores["test_recall"].mean():.1%}')


10-fold validation accuracy mean: 39.5%
10-fold validation precision mean: 38.4%
10-fold validation recall mean: 39.5%


# Hyperparameters:  
There are a few parameters that we can adjust to try to get closer to our goal.  
* n_estimators - the number of decision trees created
* max_features - the maximum number of features considered when looking for the best split
* min_samples_leaf - the minimum number of samples required to be at a leaf node
* criterion - the method used to split nodes (Entropy, Gini impurity, Log Loss)
* max_leaf_nodes - maximum leaf nodes in a tree
* max_depth - the maximum depth of each tree, which limits the number of decisions made along any path

In [8]:
# Try to improve accuracy with hand-picked hyperparameters: 200 trees and
# 8 candidate features per split (random_state=2 again), then report the out-of-bag score.
classifier = RandomForestClassifier(
    n_estimators=200,
    max_features=8,
    random_state=2,
    oob_score=True,
    n_jobs=-1,
)
classifier.fit(X, Y)
print('OOB score:', classifier.oob_score_)

OOB score: 0.3955947837150127


Using GridSearch, all combinations of predetermined hyperparameters are tested and the highest scoring classifier is used and returned.  
n_estimators, max_features, and min_samples_leaf are iterated through. These parameters help scale the amount of information that is used when creating the decision trees.  
Through testing, entropy was found to have the most success. max_leaf_nodes and max_depth both help limit the tree so that it does not overfit. However with the parameters and dataset given, making these values infinite (=None) increased the accuracy score.  

In [9]:
# Base estimator for the search; the split criterion is fixed to entropy.
classifier = RandomForestClassifier(criterion='entropy', n_jobs=-1)

n_features = len(df_train_85.columns)

# Hyperparameter grid to search. The trailing comment on each row is the
# default value noted by the author; commented-out rows are not searched.
params = {
    'n_estimators': [100, 150, 200, 250, 300],        # default: 100
    'max_features': [5, 10, 15, 20, 30],              # default: 'sqrt' = ~5
    'min_samples_leaf': [1, 2, 5, 10, 20],            # default: 1
    # 'criterion': ['gini','entropy','log_loss'],     # default: 'gini'
    # 'max_leaf_nodes': [50,100,150,200],             # default: None
    # 'max_depth': [10,20,25,30]                      # default: None
}

from sklearn.model_selection import GridSearchCV
# Exhaustive 5-fold search over the grid, scored by accuracy.
gs = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
gs.fit(X, Y)

print(gs.best_score_, gs.best_estimator_)

Fitting 5 folds for each of 125 candidates, totalling 625 fits
0.41149867093148573 RandomForestClassifier(criterion='entropy', max_features=10, min_samples_leaf=5,
                       n_estimators=300, n_jobs=-1)


# Accuracy and Precision Metrics  
These methods are used to see the accuracy, precision, and recall of our classifier. We also used a 10-fold cross validation as our validation statistic.  
The methods used were written by Benjamin Denzler.  

In [12]:
# accuracy and precision method taken from Benjamin Denzler
# Evaluates the best estimator found by the grid search: accuracy on the
# train/test/validation splits, plus weighted and per-class precision/recall on
# the test split.

from sklearn.metrics import precision_score, recall_score
best_rfm = gs.best_estimator_
# NOTE(review): GridSearchCV refits the best estimator by default (refit=True), so this
# fit presumably just retrains the same configuration on the same data. Confirm whether it is needed.
best_rfm.fit(df_train_85, adoption_speed_train_85)
test_predictions = best_rfm.predict(df_test_10)

# Percentages use the `.1%` format spec. The previous `round(x, 3) * 100` printed
# float artifacts such as "69.19999999999999%".
accuracy = best_rfm.score(df_train_85, adoption_speed_train_85)
print(f'Accuracy on training: {accuracy:.1%}')

accuracy = best_rfm.score(df_test_10, adoption_speed_test_10)
print(f'Accuracy on test: {accuracy:.1%}')

accuracy = best_rfm.score(df_val_5, adoption_speed_val_5)
print(f'Accuracy on validation: {accuracy:.1%}')

# zero_division=0 returns 0 precision/recall if no positive samples for a class
avg_precision = precision_score(
    adoption_speed_test_10, test_predictions, average='weighted', zero_division=0
)
avg_recall = recall_score(
    adoption_speed_test_10, test_predictions, average='weighted', zero_division=0
)
# average=None returns one score per class instead of a single aggregate
precision_per_class = precision_score(
    adoption_speed_test_10, test_predictions, average=None, zero_division=0
)
recall_per_class = recall_score(
    adoption_speed_test_10, test_predictions, average=None, zero_division=0
)

print(f'Average weighted precision: {avg_precision:.1%}')
print(f'Average weighted recall: {avg_recall:.1%}')
# Classes are labelled by their position in the per-class score arrays
for class_label, (precision, recall) in enumerate(zip(precision_per_class, recall_per_class)):
    print(f'Class {class_label}: Precision = {precision:.1%}, Recall = {recall:.1%}')

Accuracy on training: 69.19999999999999%
Accuracy on test: 40.1%
Accuracy on validation: 41.4%
Average weighted precision: 40.5%
Average weighted recall: 40.1%
Class 0: Precision = 100.0%, Recall = 2.5%
Class 1: Precision = 35.0%, Recall = 31.8%
Class 2: Precision = 36.0%, Recall = 41.4%
Class 3: Precision = 35.8%, Recall = 17.599999999999998%
Class 4: Precision = 46.800000000000004%, Recall = 66.3%


In [14]:
# validation method taken from Benjamin Denzler
# Runs k-fold cross-validation of `best_rfm` on the 85% training split and
# prints the mean accuracy and weighted precision/recall across folds.
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score

num_folds = 10

# zero_division=0 makes precision/recall 0 (instead of warning) for a class
# with no positive samples in a fold
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(
        precision_score, average='weighted', zero_division=0
    ),
    'recall': make_scorer(
        recall_score, average='weighted', zero_division=0
    )
}

scores = cross_validate(
    best_rfm, df_train_85, adoption_speed_train_85,
    cv=num_folds, scoring=scoring_metrics
)

# Percentages use the `.1%` format spec. The previous `round(x, 3) * 100` printed
# float artifacts such as "40.699999999999996%".
print(f'\n{num_folds}-fold validation accuracy mean: {scores["test_accuracy"].mean():.1%}')
print(f'{num_folds}-fold validation precision mean: {scores["test_precision"].mean():.1%}')
print(f'{num_folds}-fold validation recall mean: {scores["test_recall"].mean():.1%}')


10-fold validation accuracy mean: 41.3%
10-fold validation precision mean: 40.699999999999996%
10-fold validation recall mean: 41.3%
