In [1]:
import pandas as pd 
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

In [12]:
# If PyCaret is not installed, run this once in its own cell: %pip install pycaret
# Tutorial source: https://github.com/pycaret/pycaret/blob/master/tutorials/Multiclass%20Classification%20Tutorial%20Level%20Beginner%20-%20MCLF101.ipynb

In [2]:
# Import CSV
# Reads the table from a path relative to the notebook's working directory;
# the modeling and hold-out frames used later are both derived from `dataset`.
dataset = pd.read_csv('CEDdata_smol.csv')

In [3]:
# Size check: (number of rows, number of columns) of the loaded table.
dataset.shape

(33303, 12)

In [4]:
# Preview the first rows to confirm the columns were read as expected.
dataset.head()

Unnamed: 0,trans_name,st_name,SPEC,segspec_ct,segdist_sums,area_sums,density,log_density,dec_lat,dec_long,distance,date
0,CR,CR20,POJA,0,8.117538,2.435261,0.0,0.0,46.163916,-124.453083,20,5/21/2003
1,CR,CR15,POJA,0,26.83953,8.051859,0.0,0.0,46.159334,-124.341831,15,5/21/2003
2,GH,GH06,POJA,0,10.340468,3.10214,0.0,0.0,47.004917,-124.329082,6,5/22/2003
3,GH,GH16,POJA,0,10.415742,3.124723,0.0,0.0,46.994583,-124.557667,16,5/22/2003
4,GH,GH21,POJA,0,9.75025,2.925075,0.0,0.0,46.991833,-124.695415,21,5/22/2003


#### Multiclass classification

In [5]:
# Keep 10% of the rows out of the experiment entirely so the finalized models
# can later be scored on data they never saw. The fixed random_state makes the
# 90/10 split repeatable.
data = dataset.sample(frac=0.9, random_state=786)

# Rows that were not sampled form the unseen set; this must use the sampled
# frame's original index labels, so it is computed before `data` is re-indexed.
data_unseen = dataset.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (29973, 12)
Unseen Data For Predictions: (3330, 12)


In [6]:
# Prepare environment and data for modeling and deployment.
# The PyCaret functions are imported by name (instead of `import *`) so it is
# clear where every call in this notebook comes from and no unrelated names
# leak into the namespace. Add further names here if more PyCaret helpers are
# needed.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    predict_model,
    setup,
    tune_model,
)

# 'SPEC' is the column to predict. session_id pins the experiment's random
# seed (the estimators printed later report random_state=123).
exp_mclf101 = setup(data = data, target = 'SPEC', session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,SPEC
2,Target Type,Multiclass
3,Label Encoded,"ANMU: 0, ARTE: 1, BFAL: 2, BLBR: 3, BOGU: 4, BRAC: 5, BRAN: 6, BRPE: 7, CAAU: 8, CAGU: 9, CATE: 10, COLO: 11, COMU: 12, COTE: 13, DCCO: 14, FTSP: 15, GBHE: 16, GRSC: 17, GWGU: 18, HADU: 19, HEEG: 20, LESP: 21, MAMU: 22, NOFU: 23, PAAU: 24, PAJA: 25, PALO: 26, PECO: 27, PFSH: 28, PIGU: 29, POJA: 30, RBGU: 31, RBME: 32, REPH: 33, RHAU: 34, RNPH: 35, RTLO: 36, SAGU: 37, SHSP: 38, SOSH: 39, SPSK: 40, SUSC: 41, TOWA: 42, TUPU: 43, UNDO: 44, UNDU: 45, UNGO: 46, UNSP: 47, WEGR: 48, WEGU: 49, WWSC: 50"
4,Original Data,"(29973, 12)"
5,Missing Values,False
6,Numeric Features,8
7,Categorical Features,2
8,Ordinal Features,False
9,High Cardinality Features,False


In [7]:
# Compare models to evaluate performance
# Displays a leaderboard of candidate classifiers; the return value is kept in `best`.
# NOTE(review): `best` is not referenced again in the visible cells -- the models
# created below are selected by ID instead.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.0269,0.5131,0.0273,0.0132,0.0129,0.0073,0.0076,1.173
svm,SVM - Linear Kernel,0.0264,0.0,0.0267,0.0116,0.0094,0.007,0.0163,18.845
lda,Linear Discriminant Analysis,0.0204,0.4418,0.0208,0.0214,0.0198,0.0007,0.0007,0.238
ridge,Ridge Classifier,0.0202,0.0,0.0207,0.0129,0.0143,0.0006,0.0006,0.039
nb,Naive Bayes,0.019,0.4676,0.0192,0.0154,0.0107,-0.0008,-0.0008,0.203
qda,Quadratic Discriminant Analysis,0.019,0.5002,0.02,0.0007,0.0014,0.0004,0.0006,0.169
lr,Logistic Regression,0.0182,0.4621,0.0185,0.0209,0.0179,-0.0016,-0.0016,32.536
gbc,Gradient Boosting Classifier,0.0125,0.3618,0.013,0.0208,0.0142,-0.0073,-0.0073,87.653
knn,K Neighbors Classifier,0.0111,0.4645,0.0116,0.0152,0.0112,-0.0089,-0.009,0.779
rf,Random Forest Classifier,0.0109,0.2705,0.0114,0.0101,0.0104,-0.0089,-0.0089,1.257


In [9]:
# Create models for ADA, SVM, LDA (one cell per model)
# ADA: 'ada' is the leaderboard ID of the Ada Boost Classifier; the call displays per-fold metrics.
ada = create_model('ada')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.0281,0.5089,0.0293,0.0234,0.0146,0.0085,0.0088
1,0.0229,0.5098,0.0232,0.0098,0.0108,0.0031,0.0032
2,0.0257,0.5132,0.0258,0.0136,0.0119,0.0059,0.0064
3,0.0234,0.5138,0.0238,0.0142,0.0132,0.0038,0.0039
4,0.0243,0.5198,0.024,0.0066,0.0096,0.0044,0.0047
5,0.0281,0.5102,0.0289,0.0146,0.0151,0.0086,0.0088
6,0.0281,0.5229,0.0285,0.0122,0.0141,0.0086,0.009
7,0.0296,0.5076,0.0297,0.0166,0.0149,0.0099,0.0104
8,0.0305,0.5168,0.0308,0.01,0.0139,0.011,0.0117
9,0.0281,0.5077,0.0293,0.0106,0.0113,0.0087,0.0092


In [11]:
# Show the untuned AdaBoost estimator and its hyperparameters.
print(ada)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=123)


In [8]:
# SVM
# 'svm' is the leaderboard ID of "SVM - Linear Kernel"; the call displays per-fold metrics.
svm = create_model('svm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.0281,0.0,0.0287,0.0124,0.0136,0.0091,0.0113
1,0.0176,0.0,0.019,0.0074,0.003,-0.0005,-0.0046
2,0.0281,0.0,0.0289,0.0043,0.0062,0.0088,0.0316
3,0.0305,0.0,0.0307,0.0128,0.0145,0.0113,0.013
4,0.0267,0.0,0.0263,0.0066,0.0073,0.0068,0.0228
5,0.0286,0.0,0.0281,0.0126,0.0078,0.0087,0.0246
6,0.0267,0.0,0.0263,0.0063,0.0065,0.0068,0.0187
7,0.0219,0.0,0.0219,0.0231,0.0098,0.0021,0.0032
8,0.0272,0.0,0.0278,0.0121,0.0131,0.0077,0.0108
9,0.0281,0.0,0.0293,0.0185,0.0124,0.0092,0.0316


In [12]:
# Show the untuned SVM estimator and its hyperparameters.
print(svm)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
              power_t=0.5, random_state=123, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [10]:
# LDA
# 'lda' is the leaderboard ID of Linear Discriminant Analysis; the call displays per-fold metrics.
lda = create_model('lda')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.0191,0.4374,0.0195,0.021,0.0188,-0.0006,-0.0006
1,0.0186,0.4387,0.019,0.0196,0.0177,-0.0011,-0.0011
2,0.0162,0.43,0.0166,0.018,0.0166,-0.0035,-0.0035
3,0.0224,0.4508,0.0228,0.0214,0.0212,0.0028,0.0028
4,0.0205,0.4444,0.0207,0.0223,0.0206,0.0008,0.0008
5,0.0186,0.4444,0.0189,0.0216,0.018,-0.0011,-0.0011
6,0.0191,0.4368,0.0196,0.0185,0.0178,-0.0006,-0.0006
7,0.0257,0.4471,0.0264,0.0267,0.0256,0.0062,0.0063
8,0.0214,0.439,0.022,0.0234,0.0211,0.0018,0.0019
9,0.0219,0.4493,0.0227,0.0214,0.0208,0.0024,0.0024


In [13]:
# Show the untuned LDA estimator and its hyperparameters.
print(lda)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)


In [14]:
# Tune models
# ADA
# tune_model() takes the estimator created above and returns one with adjusted
# hyperparameters; print both to compare the settings.

tuned_ada = tune_model(ada)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.0348,0.5398,0.0343,0.0114,0.0079,0.0147,0.0409
1,0.0329,0.5476,0.0319,0.0036,0.0059,0.0123,0.0327
2,0.0353,0.54,0.0343,0.0052,0.008,0.0147,0.044
3,0.0357,0.5457,0.0349,0.005,0.0079,0.0156,0.0444
4,0.0343,0.5439,0.0335,0.0043,0.0069,0.0142,0.0387
5,0.0324,0.5431,0.0318,0.0141,0.007,0.0123,0.0346
6,0.0348,0.5457,0.0343,0.0044,0.0071,0.0147,0.0403
7,0.0367,0.5449,0.0363,0.0047,0.0077,0.0166,0.0449
8,0.0343,0.5403,0.0338,0.0048,0.0075,0.0142,0.0406
9,0.0334,0.545,0.0328,0.004,0.0065,0.0132,0.0366


In [17]:
# Show the hyperparameters selected for AdaBoost by tuning.
print(tuned_ada)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                   learning_rate=0.0005, n_estimators=260, random_state=123)


In [15]:
# SVM
# Tune the linear SVM created above; per-fold metrics for the tuned model are displayed.

tuned_svm = tune_model(svm)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.031,0.0,0.0312,0.0067,0.0089,0.0109,0.0207
1,0.0243,0.0,0.024,0.0141,0.0097,0.0043,0.006
2,0.0248,0.0,0.025,0.0328,0.0094,0.0054,0.0326
3,0.0281,0.0,0.0273,0.0227,0.0114,0.0078,0.0213
4,0.0253,0.0,0.025,0.0168,0.0087,0.0054,0.0345
5,0.0238,0.0,0.0234,0.0085,0.0071,0.0037,0.0048
6,0.0253,0.0,0.0252,0.0118,0.008,0.0054,0.0274
7,0.03,0.0,0.0295,0.0303,0.0135,0.0098,0.0574
8,0.0229,0.0,0.0235,0.0303,0.0071,0.0039,0.0193
9,0.03,0.0,0.031,0.0203,0.0113,0.0112,0.0433


In [18]:
# Show the hyperparameters selected for the SVM by tuning.
print(tuned_svm)

SGDClassifier(alpha=0.05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001,
              fit_intercept=False, l1_ratio=0.8900000001,
              learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=123, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [16]:
# LDA
# Tune the LDA model created above; per-fold metrics for the tuned model are displayed.
tuned_lda = tune_model(lda)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.0276,0.5223,0.0277,0.0191,0.0141,0.0078,0.0094
1,0.0229,0.5081,0.0229,0.0204,0.0132,0.0032,0.0036
2,0.0276,0.5045,0.0276,0.0292,0.0161,0.008,0.0088
3,0.0296,0.5186,0.0295,0.0332,0.0197,0.01,0.0112
4,0.0272,0.5086,0.0271,0.0362,0.0168,0.0076,0.0083
5,0.0291,0.5167,0.029,0.022,0.0161,0.0093,0.0108
6,0.0291,0.5144,0.0291,0.0227,0.0174,0.0095,0.0106
7,0.0296,0.5255,0.0293,0.0286,0.017,0.0098,0.0111
8,0.0272,0.5133,0.027,0.0223,0.0171,0.0075,0.0091
9,0.0296,0.5267,0.0297,0.0272,0.0185,0.0101,0.0112


In [19]:
# Show the hyperparameters selected for LDA by tuning.
print(tuned_lda)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=1,
                           solver='lsqr', store_covariance=False, tol=0.0001)


In [20]:
# Evaluate models
# ADA
# Renders an interactive widget with a "Plot Type" selector, so the output is
# only meaningful in a live notebook session.
evaluate_model(tuned_ada)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [22]:
# SVM
# Interactive evaluation widget for the tuned SVM (requires a live notebook session).
evaluate_model(tuned_svm)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [23]:
# LDA
# Interactive evaluation widget for the tuned LDA model (requires a live notebook session).
evaluate_model(tuned_lda)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [24]:
# Predict model on hold-out sample
# ADA
# Called without `data`, so the metrics table is the only result kept; the
# trailing `;` suppresses the returned frame from the cell output.
predict_model(tuned_ada);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.0355,0.5505,0.0349,0.0118,0.0094,0.017,0.0447


In [26]:
# SVM
# Hold-out metrics for the tuned SVM; the trailing `;` suppresses the returned frame.
predict_model(tuned_svm);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.0257,0,0.0258,0.0247,0.0111,0.007,0.0309


In [27]:
# LDA
# Hold-out metrics for the tuned LDA model; the trailing `;` suppresses the returned frame.
predict_model(tuned_lda);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.0299,0.521,0.0294,0.0311,0.0194,0.0107,0.0124


In [28]:
# Finalize model for deployment
# ADA
# NOTE(review): PyCaret documents finalize_model as refitting on the full
# setup() dataset, hold-out included -- confirm for the installed version.
final_ada = finalize_model(tuned_ada)

In [32]:
# Show the finalized AdaBoost estimator and its hyperparameters.
print(final_ada)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                   learning_rate=0.0005, n_estimators=260, random_state=123)


In [30]:
# SVM
# Finalize the tuned SVM for deployment.
final_svm = finalize_model(tuned_svm)

In [33]:
# Show the finalized SVM estimator and its hyperparameters.
print(final_svm)

SGDClassifier(alpha=0.05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001,
              fit_intercept=False, l1_ratio=0.8900000001,
              learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=123, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [31]:
# LDA
# Finalize the tuned LDA model for deployment.
final_lda = finalize_model(tuned_lda)

In [34]:
# Show the finalized LDA estimator and its hyperparameters.
print(final_lda)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=1,
                           solver='lsqr', store_covariance=False, tol=0.0001)


In [35]:
# Predict on unseen data
# ADA
# Scores the 10% of rows withheld before setup(); the result is the input frame
# with the prediction columns appended.
unseen_pred_ada = predict_model(final_ada, data = data_unseen)

In [38]:
# Compare the true `SPEC` with the predicted `Label` (and its `Score`) for the first rows.
unseen_pred_ada.head()

Unnamed: 0,trans_name,st_name,SPEC,segspec_ct,segdist_sums,area_sums,density,log_density,dec_lat,dec_long,distance,date,Label,Score
0,CR,CR15,POJA,0,26.83953,8.051859,0.0,0.0,46.159334,-124.341831,15,5/21/2003,COTE,0.0213
1,GH,GH10,POJA,0,8.715432,2.61463,0.0,0.0,47.005083,-124.422333,10,5/22/2003,COTE,0.0213
2,CM,CM20,POJA,0,13.341302,4.00239,0.0,0.0,45.490833,-124.450916,20,5/23/2003,COTE,0.0213
3,CR,CR15,POJA,0,3.979261,1.193778,0.0,0.0,46.158476,-124.313351,15,5/23/2004,COTE,0.0213
4,GH,GH10,POJA,0,9.347516,2.804255,0.0,0.0,47.009666,-124.424667,10,5/28/2006,COTE,0.0213


In [36]:
# SVM
# Score the withheld rows with the finalized SVM.
unseen_pred_svm = predict_model(final_svm, data = data_unseen)

In [39]:
# Compare the true `SPEC` with the predicted `Label`; in the recorded run this
# frame has no `Score` column, unlike the ADA and LDA results.
unseen_pred_svm.head()

Unnamed: 0,trans_name,st_name,SPEC,segspec_ct,segdist_sums,area_sums,density,log_density,dec_lat,dec_long,distance,date,Label
0,CR,CR15,POJA,0,26.83953,8.051859,0.0,0.0,46.159334,-124.341831,15,5/21/2003,BFAL
1,GH,GH10,POJA,0,8.715432,2.61463,0.0,0.0,47.005083,-124.422333,10,5/22/2003,WEGR
2,CM,CM20,POJA,0,13.341302,4.00239,0.0,0.0,45.490833,-124.450916,20,5/23/2003,WEGR
3,CR,CR15,POJA,0,3.979261,1.193778,0.0,0.0,46.158476,-124.313351,15,5/23/2004,WEGR
4,GH,GH10,POJA,0,9.347516,2.804255,0.0,0.0,47.009666,-124.424667,10,5/28/2006,WEGR


In [37]:
# LDA
# Score the withheld rows with the finalized LDA model.
unseen_pred_lda = predict_model(final_lda, data = data_unseen)

In [40]:
# Compare the true `SPEC` with the predicted `Label` (and its `Score`) for the first rows.
unseen_pred_lda.head()

Unnamed: 0,trans_name,st_name,SPEC,segspec_ct,segdist_sums,area_sums,density,log_density,dec_lat,dec_long,distance,date,Label,Score
0,CR,CR15,POJA,0,26.83953,8.051859,0.0,0.0,46.159334,-124.341831,15,5/21/2003,SPSK,0.0286
1,GH,GH10,POJA,0,8.715432,2.61463,0.0,0.0,47.005083,-124.422333,10,5/22/2003,UNDO,0.0224
2,CM,CM20,POJA,0,13.341302,4.00239,0.0,0.0,45.490833,-124.450916,20,5/23/2003,SAGU,0.0277
3,CR,CR15,POJA,0,3.979261,1.193778,0.0,0.0,46.158476,-124.313351,15,5/23/2004,UNDU,0.0228
4,GH,GH10,POJA,0,9.347516,2.804255,0.0,0.0,47.009666,-124.424667,10,5/28/2006,UNDO,0.0226
