In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.model_selection import GridSearchCV
from threading import *
import sklearn.ensemble as ensemble

In [2]:
import pydotplus
import time

In [3]:
# Load the merged data set, keep the target plus the seven predictor columns,
# and binarize the target: 0 stays 0, every other value becomes 1.
selected_columns = [
    "Average_bleaching",
    "ClimSST",
    "Temperature_Kelvin",
    "Temperature_Kelvin_Standard_Deviation",
    "SSTA_Frequency",
    "SSTA_Frequency_Standard_Deviation",
    "TSA_Frequency_Standard_Deviation",
    "mean_cur",
]
df = pd.read_csv("ml_version_merged.csv")[selected_columns].assign(
    Average_bleaching=lambda frame: frame["Average_bleaching"].ne(0).astype("int64")
)
# X holds the predictors, Y the binary target.
Y = df["Average_bleaching"]
X = df.drop(columns="Average_bleaching")

In [4]:
# Summarize the prepared frame: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14096 entries, 0 to 14095
Data columns (total 8 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Average_bleaching                      14096 non-null  int64  
 1   ClimSST                                14096 non-null  float64
 2   Temperature_Kelvin                     14096 non-null  float64
 3   Temperature_Kelvin_Standard_Deviation  14096 non-null  float64
 4   SSTA_Frequency                         14096 non-null  float64
 5   SSTA_Frequency_Standard_Deviation      14096 non-null  float64
 6   TSA_Frequency_Standard_Deviation       14096 non-null  float64
 7   mean_cur                               14096 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 881.1 KB


In [None]:
# Hyper-parameter search that was used to choose the values hard-coded in the
# training cell. Running it is not recommended: the grid below contains
# 25 * 10 * 10 * 9 = 22,500 combinations and each one is fitted 5 times (cv=5).
# NOTE(review): the search is fitted on the full X, Y, so it also sees the rows
# that are later held out as the test set -- confirm this is acceptable.
params_grid = {
    "n_estimators": np.arange(2, 52, 2),
    "max_depth": np.arange(1, 11, 1),
    "min_samples_leaf": np.arange(1, 11, 1),
    "min_samples_split": np.arange(2, 11, 1),
}
# A fixed random_state makes every candidate forest deterministic, so repeated
# searches rank the candidates the same way.
model = RandomForestClassifier(random_state=1)
GS = GridSearchCV(model, param_grid=params_grid, cv=5, n_jobs=-1)
GS.fit(X, Y)
best_params = GS.best_params_
best_score = GS.best_score_
print(best_params, best_score)

In [5]:
# Train the model.
# test_size=0.2 holds out 1/5 of the rows for testing; random_state=1 makes the
# split repeatable.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=1)
# Fit the forest with the chosen hyper-parameters. random_state is fixed so the
# fitted trees, the scores printed below and the saved model are reproducible
# after a kernel restart.
rf = RandomForestClassifier(
    n_estimators=22,
    max_depth=3,
    min_samples_leaf=7,
    min_samples_split=6,
    random_state=1,
)
rf.fit(train_X, train_Y)
# Accuracy on the held-out rows first, then on the training rows.
print(rf.score(test_X, test_Y))
print(rf.score(train_X, train_Y))

0.8773049645390071
0.8709648811635332


In [16]:
# Preview the first rows of the prepared frame (binary target plus the predictors).
df.head()

Unnamed: 0,Average_bleaching,ClimSST,Temperature_Kelvin,Temperature_Kelvin_Standard_Deviation,SSTA_Frequency,SSTA_Frequency_Standard_Deviation,TSA_Frequency_Standard_Deviation,mean_cur
0,0,297.28,295.6,2.15,12.0,3.34,1.1,0.185999
1,0,297.28,295.6,2.15,12.0,3.34,1.1,0.279372
2,0,297.28,295.6,2.15,12.0,3.34,1.1,0.52045
3,0,297.28,295.6,2.15,12.0,3.34,1.1,0.362635
4,0,297.28,295.6,2.15,12.0,3.34,1.1,0.462817


In [21]:
# Predict the class of one hand-written sample.
# The frame is built in a single step with float values; growing an empty
# DataFrame row by row starts from object-dtype columns and is the slower,
# less predictable way to create a one-row frame.
sample_columns = [
    "ClimSST",
    "Temperature_Kelvin",
    "Temperature_Kelvin_Standard_Deviation",
    "SSTA_Frequency",
    "SSTA_Frequency_Standard_Deviation",
    "TSA_Frequency_Standard_Deviation",
    "mean_cur",
]
test_df = pd.DataFrame(
    [[297.28, 295.6, 2.15, 12.0, 3.34, 1.1, 0.18]],
    columns=sample_columns,
)
# predicted is an array with one entry per row of test_df.
predicted = rf.predict(test_df)

In [22]:

# Show the predicted class for the single sample; with the target encoding used
# at load time, 0 means Average_bleaching == 0 and 1 means any non-zero value.
predicted[0]

1

In [6]:
# Save the fitted forest to disk with pickle.
import pickle
model_path = './random_forest.pickle'
# The context manager closes (and flushes) the file even if pickling fails;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open(model_path, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Visualize the first trees of the forest, one PNG file per tree.
# At most six trees are exported; the min() guard keeps the loop valid if the
# forest is refitted with fewer than six estimators.
n_trees_to_plot = min(6, len(rf.estimators_))
tree_feature_names = [
    "ClimSST",
    "Temperature_Kelvin",
    "Temperature_Kelvin_Standard_Deviation",
    "SSTA_Frequency",
    "SSTA_Frequency_Standard_Deviation",
    "TSA_Frequency_Standard_Deviation",
    "mean_cur",
]
# export_graphviz reads class_names in ascending order of the class label.
# The target is encoded as 0 = no bleaching and 1 = bleaching, so the name for
# class 0 ("Not bleached") has to come first.
tree_class_names = ["Not bleached", "Bleached"]
for n in range(n_trees_to_plot):
    export_tree = rf.estimators_[n]
    dot_tree = export_graphviz(
        export_tree,
        out_file=None,  # return the DOT source as a string instead of writing a file
        rounded=True,
        filled=True,
        feature_names=tree_feature_names,
        class_names=tree_class_names,
    )
    graph = pydotplus.graph_from_dot_data(dot_tree)
    # The f-string already interpolates n, so no extra .format() call is needed.
    graph.write_png(f"tree {n} graph.png")

In [None]:
# Print the predictors ranked from most to least important according to the forest.
importances = rf.feature_importances_
# Column 0 of df is the target, so the remaining columns are the predictor names.
feat_labels = df.columns[1:]
# argsort is ascending; reversing it yields the most important feature first.
indices = np.argsort(importances)[::-1]
for rank, feature_idx in enumerate(indices[:train_X.shape[1]], start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[feature_idx], importances[feature_idx]))


In [None]:
# Time rf.predict on a 10% sample of the rows.
# The sample is drawn before the clock starts so only prediction is measured,
# and it is kept under its own name so the train/test split used to fit rf is
# not overwritten.
sample_X = train_test_split(X, Y, train_size=0.1, random_state=1)[0]
# perf_counter is a monotonic, high-resolution clock meant for short intervals.
t0 = time.perf_counter()
test_est = rf.predict(sample_X)
t1 = time.perf_counter()
total1 = t1 - t0
print(total1)

In [None]:
# Time rf.predict on a 50% sample of the rows.
# The sample is drawn before the clock starts so only prediction is measured,
# and it is kept under its own name so the train/test split used to fit rf is
# not overwritten.
sample_X = train_test_split(X, Y, train_size=0.5, random_state=1)[0]
# perf_counter is a monotonic, high-resolution clock meant for short intervals.
t0 = time.perf_counter()
test_est = rf.predict(sample_X)
t1 = time.perf_counter()
total2 = t1 - t0
print(total2)

In [None]:
# Time rf.predict on a 70% sample of the rows.
# The sample is drawn before the clock starts so only prediction is measured,
# and it is kept under its own name so the train/test split used to fit rf is
# not overwritten.
sample_X = train_test_split(X, Y, train_size=0.7, random_state=1)[0]
# perf_counter is a monotonic, high-resolution clock meant for short intervals.
t0 = time.perf_counter()
test_est = rf.predict(sample_X)
t1 = time.perf_counter()
total3 = t1 - t0
print(total3)

In [None]:
# Plot the three measured durations against the sampled fraction of the data.
li_size = [0.1, 0.5, 0.7]
li_t = [total1, total2, total3]
fig, ax = plt.subplots()
# Red dashed line through the three (fraction, duration) points.
line_rf = ax.plot(li_size, li_t, 'r--', label='type1')
ax.set_title('scalability of models')
ax.set_xlabel('train_size')
ax.set_ylabel('time diff')
ax.legend()
plt.show()

In [None]:
# 5-fold cross-validated accuracy of the forest configuration on the full data set.
scores = cross_val_score(rf, X, Y, scoring='accuracy', cv=5)
print(scores)

# Spread of the fold accuracies (population standard deviation, ddof=0).
acc_std = scores.std()
print("The standard deviation of accuracies is " + str(acc_std))

In [None]:
# Box plot of the cross-validation fold accuracies.
fig, ax = plt.subplots()
ax.boxplot(scores)
# Faint dashed grid to make the accuracy levels easier to read.
ax.grid(linestyle="--", alpha=0.3)
ax.set_xlabel("cv5")
ax.set_ylabel("accuracies")
ax.set_title("the boxplot of cv5 accuracies")
plt.show()