In [0]:
from google.colab import files
import io
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

#Classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB

sns.set(style="white")

In [0]:
uploaded = files.upload()

Saving training_dataset.csv to training_dataset.csv


In [0]:
bible = pd.read_csv(io.StringIO(uploaded['training_dataset.csv'].decode('utf-8')))

#Data Analysis

In [0]:
bible.head(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,scribe
0,0,-0.091897,0.2976,0.079145,0.196496,0.261718,1.26996,0.446679,-0.751707,0.001721,0.998901,Philippus
1,1,-0.091897,0.226939,0.267634,0.024091,0.261718,-0.806282,0.597681,-0.601277,0.126447,-0.909619,Paithonius
2,2,0.167323,0.313302,0.168055,-0.383198,0.261718,0.190314,0.824183,0.55825,-0.247731,-0.148073,Marcus
3,3,-0.017834,-0.22843,0.37077,1.293671,0.17234,0.896237,0.182426,0.416867,1.373706,0.868284,Noaelius
4,4,0.043885,0.407516,-0.120014,0.281743,0.261718,-0.183409,0.106925,0.142896,0.531806,-0.101311,Marcus
5,5,9.943651,-0.220579,-0.048886,-0.675372,-3.045274,3.719926,2.37195,-2.598171,-2.898156,1.370295,Begonius
6,6,-0.178304,-0.189174,0.264078,-0.640492,0.17234,0.231839,0.144676,-1.605713,-1.900349,0.266479,Marcus
7,7,-0.178304,0.258344,-0.12357,0.228303,0.082961,-0.266459,0.824183,-0.109251,-0.310094,-0.552848,Philippus
8,8,0.142636,-0.142067,0.246296,0.789259,-1.347089,1.145386,0.710932,0.51142,-0.185368,0.728878,Marcus
9,9,0.031541,-2.426761,0.37077,0.77977,0.17234,0.397939,0.635431,1.033035,0.344717,0.115959,Marcus


In [0]:
bible.describe()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
count,12017.0,12017.0,12017.0,12017.0,12017.0,12017.0,12017.0,12017.0,12017.0,12017.0,12017.0
mean,6008.0,0.018129,0.018044,-0.000388,-0.024999,0.01101,0.005436,0.033862,-0.011483,-0.014828,-0.015306
std,3469.153427,1.014528,3.624547,1.067595,0.992491,0.963444,1.103847,1.238767,1.000042,1.071147,0.981989
min,0.0,-3.498799,-2.426761,-3.210528,-5.440122,-4.922215,-7.450257,-11.935457,-4.247781,-5.423855,-6.719324
25%,3004.0,-0.116585,-0.259834,0.061363,-0.542563,0.17234,-0.598658,-0.044076,-0.571435,-0.372457,-0.517827
50%,6008.0,0.056229,-0.055704,0.214288,0.06482,0.261718,-0.058835,0.220177,0.100338,0.064084,-0.036297
75%,9012.0,0.216699,0.203385,0.349432,0.600056,0.261718,0.564038,0.446679,0.636669,0.469443,0.505617
max,12016.0,11.819916,386.0,50.0,3.987152,1.066121,53.0,83.0,4.580832,44.0,7.654104


In [0]:
bible.dtypes

id          int64
F1        float64
F2        float64
F3        float64
F4        float64
F5        float64
F6        float64
F7        float64
F8        float64
F9        float64
F10       float64
scribe     object
dtype: object

In [0]:
bible.isna().sum()

id        0
F1        0
F2        0
F3        0
F4        0
F5        0
F6        0
F7        0
F8        0
F9        0
F10       0
scribe    0
dtype: int64

# X & y

In [0]:
y = bible['scribe']
X = bible[["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10"]]

# Data Normalization

In [0]:
normalizer = Normalizer()
X = pd.DataFrame(normalizer.fit_transform(X))

In [0]:
X.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
0,-0.091897,0.2976,0.079145,0.196496,0.261718,1.26996,0.446679,-0.751707,0.001721,0.998901
1,-0.091897,0.226939,0.267634,0.024091,0.261718,-0.806282,0.597681,-0.601277,0.126447,-0.909619
2,0.167323,0.313302,0.168055,-0.383198,0.261718,0.190314,0.824183,0.55825,-0.247731,-0.148073
3,-0.017834,-0.22843,0.37077,1.293671,0.17234,0.896237,0.182426,0.416867,1.373706,0.868284
4,0.043885,0.407516,-0.120014,0.281743,0.261718,-0.183409,0.106925,0.142896,0.531806,-0.101311


# Data Preparation for Training

In [0]:
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [0]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(9613, 10) (9613,)
(2404, 10) (2404,)


In [0]:
X.corr()[X.corr() >= 0.95]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,,,,,,,,,
1,,1.0,,,,,,,,
2,,,1.0,,,,,,,
3,,,,1.0,,,,,,
4,,,,,1.0,,,,,
5,,,,,,1.0,,,,
6,,,,,,,1.0,,,
7,,,,,,,,1.0,,
8,,,,,,,,,1.0,
9,,,,,,,,,,1.0


---------------------------------

# Models Testing

In [0]:
models = {
    "decision_tree": DecisionTreeClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoostClassifier": MLPClassifier(),
    "forest": RandomForestClassifier(),
    "GaussianNB" : GaussianNB()
    
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)

d = {modelName:model.predict(X_test) for modelName, model in models.items()}
df = pd.DataFrame(d)
df["gt"] = y_test.reset_index(drop=True)

switchColor = lambda value: f"<b style=\"color:orange\">{round(value,2)}</b>" if value < 0.8 else f"<b style=\"color:green\">{round(value,2)}</b>"
printBonito = lambda label,val: display(HTML(f"<span style=\"padding-left:20px\">The {label} is: {switchColor(val)}</span>"))

for modelName, model in models.items():    
    print(f"Evaluating model [{modelName}]:")
    printBonito("Accuracy", accuracy_score(df["gt"],df[modelName]))
    printBonito("Precision", precision_score(df["gt"],df[modelName],average='weighted'))
    printBonito("Recall", recall_score(df["gt"],df[modelName],average='weighted'))

Training model: decision_tree
Training model: GradientBoostingClassifier
Training model: AdaBoostClassifier




Training model: forest
Training model: GaussianNB
Evaluating model [decision_tree]:


Evaluating model [GradientBoostingClassifier]:


Evaluating model [AdaBoostClassifier]:


Evaluating model [forest]:


Evaluating model [GaussianNB]:


# Gradient Boosting Parametrization

In [0]:
### Trying Parameters:
n_estimators = [1800, 2500]
min_samples_split = [50, 100]
min_samples_leaf = [50, 100]
for n in n_estimators:
  for s in min_samples_split:
    for l in min_samples_leaf:
      tree = GradientBoostingClassifier(n_estimators=n, min_samples_split=s, min_samples_leaf=l)
      print("n_estimators:", n, "min_samples_split:", s, "min_samples_leaf:", l)
      tree.fit(X_train, y_train)
      prediction = tree.predict(X_test)
      print("Accuracy:", round(accuracy_score(y_test, prediction),4))
      print("Precision:", round(precision_score(y_test, prediction,average='weighted'),4))
      print(f"Recall: {round(recall_score(y_test, prediction,average='weighted'),4)}\n")

n_estimators: 1800 min_samples_split: 50 min_samples_leaf: 50
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 1800 min_samples_split: 50 min_samples_leaf: 100
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 1800 min_samples_split: 100 min_samples_leaf: 50
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 1800 min_samples_split: 100 min_samples_leaf: 100
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 2500 min_samples_split: 50 min_samples_leaf: 50
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 2500 min_samples_split: 50 min_samples_leaf: 100
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 2500 min_samples_split: 100 min_samples_leaf: 50
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983

n_estimators: 2500 min_samples_split: 100 min_samples_leaf: 100
Accuracy: 0.9983
Precision: 0.9983
Recall: 0.9983



In [0]:
chosen_gradient = GradientBoostingClassifier(n_estimators=2500, min_samples_split=100, min_samples_leaf=100)
chosen_gradient.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=100, min_samples_split=100,
                           min_weight_fraction_leaf=0.0, n_estimators=2500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# Random Forest Parametrization

In [0]:
### Trying Parameters:
n_estimators = [2000, 2500, 3000]
for n in n_estimators:
#  for c in criterion:
      tree = RandomForestClassifier(n_estimators=n, n_jobs=-1, verbose=2)
      print(n)
      tree.fit(X_train, y_train)
      prediction = tree.predict(X_test)
      print("Accuracy:", round(accuracy_score(y_test, prediction),2))
      print("Precision:", round(precision_score(y_test, prediction,average='weighted'),2))
      print(f"Recall: {round(recall_score(y_test, prediction,average='weighted'),2)}\n")

In [0]:
chosen_forest = RandomForestClassifier(n_estimators=1700, n_jobs=-1)
chosen_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1700,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

## DecisionTreeClassifier


In [0]:
### Trying Parameters:
criterion = ["gini", "entropy"]
splitter = ["best", "random"]
max_features = ["sqrt", "log2", "auto"]
for c in criterion:
  for s in splitter:
    for m in max_features:
      tree = DecisionTreeClassifier(criterion=c, splitter=s, max_features=m)
      print(c,s,m)
      tree.fit(X_train, y_train)
      prediction = tree.predict(X_test)
      print("Accuracy", accuracy_score(y_test, prediction))
      print("Precision", precision_score(y_test, prediction,average='weighted'))
      print(f"Recall: {recall_score(y_test, prediction,average='weighted')}\n")


gini best sqrt
Accuracy 0.9059900166389351
Precision 0.9072374454580207
Recall: 0.9059900166389351

gini best log2
Accuracy 0.9222129783693843
Precision 0.9222563655671967
Recall: 0.9222129783693843

gini best auto
Accuracy 0.9047420965058236
Precision 0.9053414751742787
Recall: 0.9047420965058236

gini random sqrt
Accuracy 0.7824459234608985
Precision 0.7837399083417508
Recall: 0.7824459234608985

gini random log2
Accuracy 0.9280366056572379
Precision 0.9286671706360168
Recall: 0.9280366056572379

gini random auto
Accuracy 0.8015806988352745
Precision 0.8032532912274916
Recall: 0.8015806988352745

entropy best sqrt
Accuracy 0.9450915141430949
Precision 0.9459196617717487
Recall: 0.9450915141430949

entropy best log2
Accuracy 0.9546589018302829
Precision 0.9549347337629362
Recall: 0.9546589018302829

entropy best auto
Accuracy 0.9330282861896838
Precision 0.933108697414389
Recall: 0.9330282861896838

entropy random sqrt
Accuracy 0.8544093178036606
Precision 0.8549918130022885
Recall: 0

In [0]:
tree = DecisionTreeClassifier(criterion="entropy", splitter="best", max_features="auto")
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [0]:
train_prediction = tree.predict(X_test)

In [0]:
print("Accuracy", accuracy_score(y_test, prediction))
print("Precision", precision_score(y_test, prediction,average='weighted'))
print("Recall", recall_score(y_test, prediction,average='weighted'))

Accuracy 0.9500831946755408
Precision 0.9505888816596746
Recall 0.9500831946755408


-----------------------------------------------------

###Upload Test File

In [0]:
test_up = files.upload()

Saving test_dataset.csv to test_dataset.csv


In [0]:
test_data = pd.read_csv(io.StringIO(test_up['test_dataset.csv'].decode('utf-8')))

In [0]:
X = test_data[["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10"]]

In [0]:
# Change the model for the right one
y_predict = chosen_gradient.predict(X)

In [0]:
y_predict.shape

(8012,)

In [0]:
submit = pd.DataFrame({
    "id" : test_data['id'],
    "scribe" : y_predict
})

In [0]:
submit.head()

Unnamed: 0,id,scribe
0,0,Franciscus
1,1,Ubuntius
2,2,Noaelius
3,3,Marcus
4,4,Marcus


In [0]:
submit.to_csv("test_submit_1.csv", index=False)
files.download("test_submit_1.csv")