## Import Libraries

In [1]:
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.utils import column_or_1d
from sklearn.svm import SVC
from catboost import CatBoostClassifier
%matplotlib inline

---
## Get the Data

In [2]:
traffic = pd.read_csv('traffic_final.csv')
test = pd.read_csv('test_final.csv')

traffic.shape
traffic.head()

Unnamed: 0,incidents,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,EM579,N105,N310,N101,IC5,N206,N309,R206
0,0,0.0,0,0.323529,0.457143,0.289065,0.181818,1.0,2,0,0,0,0,1,0,0,0,0
1,0,0.001484,0,0.323529,0.285714,0.289065,1.0,0.782609,7,0,0,0,0,1,0,0,0,0
2,1,0.009919,2,0.382353,0.8,0.0,0.181818,0.652174,6,0,0,0,0,1,0,0,0,0
3,4,0.330219,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,0,0,1,0,1,0,0,0,1
4,3,0.0,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,0,0,0,0,1,0,0,0,0


In [3]:
test.shape
test.head()

Unnamed: 0,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,EM579,N105,N310,N101,IC5,N206,N309,R206
0,0.081461,2,0.464286,0.59375,0.0,0.272727,0.826087,3,0,0,0,0,1,0,0,0,0
1,0.0,0,0.5,0.59375,0.477121,0.818182,0.173913,4,0,0,0,0,1,0,0,0,0
2,0.0,2,0.714286,0.5625,0.0,0.545455,0.826087,1,0,0,0,0,1,0,0,0,0
3,0.009417,2,0.571429,0.28125,0.60206,0.818182,0.652174,7,0,0,0,0,1,0,0,0,1
4,0.0,2,0.642857,0.71875,0.0,0.818182,0.434783,2,0,0,0,0,1,0,0,0,0


---
## Model Training

In [4]:
#divisão do target
X = traffic.drop(['incidents'], axis=1) # input features - everything except the incidents feature
y = traffic['incidents']                # target feature - incidents

X

Unnamed: 0,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,EM579,N105,N310,N101,IC5,N206,N309,R206
0,0.000000,0,0.323529,0.457143,0.289065,0.181818,1.000000,2,0,0,0,0,1,0,0,0,0
1,0.001484,0,0.323529,0.285714,0.289065,1.000000,0.782609,7,0,0,0,0,1,0,0,0,0
2,0.009919,2,0.382353,0.800000,0.000000,0.181818,0.652174,6,0,0,0,0,1,0,0,0,0
3,0.330219,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,0,0,1,0,1,0,0,0,1
4,0.000000,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.000000,0,0.352941,0.685714,0.000000,0.272727,0.000000,3,0,0,0,0,1,0,0,0,0
4996,0.000000,2,0.529412,0.571429,0.000000,0.545455,0.608696,3,0,0,0,0,1,0,0,0,0
4997,0.000000,0,0.352941,0.742857,0.458157,0.181818,0.130435,5,0,0,0,0,1,0,0,0,0
4998,0.000000,0,0.323529,0.485714,0.458157,0.909091,0.260870,3,0,0,0,0,1,0,0,0,0


In [5]:
y


0       0
1       0
2       1
3       4
4       3
       ..
4995    3
4996    0
4997    0
4998    0
4999    0
Name: incidents, Length: 5000, dtype: int64

In [6]:
#X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=2022)
#X_test.info()

---
### Logistic Regression

In [7]:
print("**LogisticRegressionClassifier**")
clf_Regresion = LogisticRegression(random_state=2022, solver='liblinear')
clf_Regresion.fit(X, y)

print("**Test Data...**")
predictions_Regresion = clf_Regresion.predict(test)
print(predictions_Regresion)

print("Traning Data...")
scores = cross_val_score(clf_Regresion, X, y, cv=10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**LogisticRegressionClassifier**
**Test Data...**
[3 0 0 ... 0 3 0]
Traning Data...
Cross Validation Accuracy: 0.5516 (+/- 0.0228)


---
### DecisionTree

In [8]:
print("**DecisionTreeClassifier**")
clf_Tree = DecisionTreeClassifier(random_state=2022)
clf_Tree.fit(X, y)

print("Test Data...")
predictions_Tree = clf_Tree.predict(test)
print(predictions_Tree)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Tree,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**DecisionTreeClassifier**
Test Data...
[3 0 0 ... 3 3 3]
Training Data...
Cross Validation Accuracy: 0.8876 (+/- 0.0294)


---
### RandomForest

In [9]:
print("**RandomForestClassifier**")
clf_Forest = RandomForestClassifier(random_state=2022)
clf_Forest.fit(X, y)

print("Test Data...")
predictions_Forest = clf_Forest.predict(test)
print(predictions_Forest)

print("Training Data...")
scores = cross_val_score(clf_Forest,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.8844 (+/- 0.0241)


In [10]:
print("**RandomForestClassifier**")
clf_Forest2 = RandomForestClassifier(n_estimators=500, random_state=2022, criterion='entropy')
clf_Forest2.fit(X, y)

print("Test Data...")
predictions_Forest2 = clf_Forest2.predict(test)
print(predictions_Forest2)

print("Training Data...")
scores = cross_val_score(clf_Forest2,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.8864 (+/- 0.0306)


---
### AdaBoostClassifier

In [11]:
'''
print("**AdaBoostClassifier**")
base = RandomForestClassifier(n_estimators=200, random_state=2022, criterion='entropy')
clf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf_Booster.fit(X, y)

print("Test Data...")
predictions_Booster = clf_Booster.predict(test)
print(predictions_Booster)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Booster,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
'''

'\nprint("**AdaBoostClassifier**")\nbase = RandomForestClassifier(n_estimators=200, random_state=2022, criterion=\'entropy\')\nclf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)\nclf_Booster.fit(X, y)\n\nprint("Test Data...")\npredictions_Booster = clf_Booster.predict(test)\nprint(predictions_Booster)\n\n#k cross val\nprint("Training Data...")\nscores = cross_val_score(clf_Booster,X,y,cv = 10)\nprint("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))\n'

---
### BaggingClassifier

In [12]:
print("**BaggingClassifier**")
clf_BG = BaggingClassifier(KNeighborsClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf_BG.fit(X, y)

print("Test Data...")
predictions_BG = clf_BG.predict(test)
print(predictions_BG)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_BG,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**BaggingClassifier**
Test Data...
[3 0 0 ... 0 0 0]
Training Data...
Cross Validation Accuracy: 0.5866 (+/- 0.0362)


---
### Support Vector Machines

In [13]:
#Tuning of best parameters
#param_grid = {'C' : [0.1, 1, 10, 100, 1000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001], 'kernel' : ['rbf']}

#grid = GridSearchCV(SVC(random_state=2022), param_grid, refit=True, verbose=3)

#grid.fit(X_train, Y_train)

#grid.best_params_

#grid_predictions = grid.predict(X_test)

print("**SVC**")
clf_SVC = SVC(random_state=2022)
clf_SVC.fit(X, y)

print("Test Data...")
predictions_SVC = clf_SVC.predict(test)
print(predictions_SVC)

print("Training Data...")
scores = cross_val_score(clf_SVC,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**SVC**
Test Data...
[0 0 0 ... 0 0 0]
Training Data...
Cross Validation Accuracy: 0.5226 (+/- 0.0168)


---
### K-Nearest Neighbors

In [14]:
print("**KNeighborsClassifier**")
clf_KNN = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
clf_KNN.fit(X, y)

print("Test Data...")
predictions_KNN = clf_KNN.predict(test)
print(predictions_KNN)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_KNN,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**KNeighborsClassifier**
Test Data...
[2 0 0 ... 0 0 3]
Training Data...
Cross Validation Accuracy: 0.6586 (+/- 0.0377)


---
### CatBoost

In [15]:
print("**CatBoostClassifier**")
clf_Cat = CatBoostClassifier(random_state=2022)
clf_Cat.fit(X, y)

print("Test Data...")
predictions_Cat = clf_Cat.predict(test)
print(predictions_Cat)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Cat,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**CatBoostClassifier**
Learning rate set to 0.085896
0:	learn: 1.4440221	total: 50.5ms	remaining: 50.5s
1:	learn: 1.3125581	total: 54.5ms	remaining: 27.2s
2:	learn: 1.2095005	total: 58.4ms	remaining: 19.4s
3:	learn: 1.1315948	total: 61.9ms	remaining: 15.4s
4:	learn: 1.0612789	total: 65.5ms	remaining: 13s
5:	learn: 1.0052562	total: 69.4ms	remaining: 11.5s
6:	learn: 0.9531968	total: 73.2ms	remaining: 10.4s
7:	learn: 0.9111206	total: 76.5ms	remaining: 9.49s
8:	learn: 0.8796809	total: 79.7ms	remaining: 8.78s
9:	learn: 0.8499186	total: 83.1ms	remaining: 8.22s
10:	learn: 0.8229186	total: 86ms	remaining: 7.73s
11:	learn: 0.7970748	total: 89ms	remaining: 7.33s
12:	learn: 0.7759791	total: 91.9ms	remaining: 6.98s
13:	learn: 0.7570196	total: 94.7ms	remaining: 6.67s
14:	learn: 0.7384394	total: 97.6ms	remaining: 6.41s
15:	learn: 0.7186295	total: 100ms	remaining: 6.18s
16:	learn: 0.6990424	total: 104ms	remaining: 5.99s
17:	learn: 0.6878530	total: 107ms	remaining: 5.82s
18:	learn: 0.6749122	total: 11

In [None]:
print("**Voting Classifier**")
clf1 = DecisionTreeClassifier(random_state=2022)
clf2 = RandomForestClassifier(n_estimators=100, random_state=2022, criterion='entropy')
clf3 = BaggingClassifier(KNeighborsClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf4 = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf5 = LogisticRegression(random_state=1)
clf6 = GaussianNB()

eclf = VotingClassifier(
     estimators=[('DecisionTree', clf_Tree), ('RandomForest', clf_Forest2), ('bag', clf_BG), 
                 ('boost', clf_Booster), ('lr', clf5), ('gnb', clf6)],
     voting='hard')

#evaluate the test dataset
#test_p = eclf.predict(X_test)
#test_acc = accuracy_score(Y_test,test_p)

for clf, label in zip( [clf_Tree, clf_Forest2, clf_BG, clf_Booster, clf5, clf6, eclf], ['Decision Tree', 
     'Random Forest', 'Bagging', 'Boosting', 'Logistic Regression', 'naive Bayes', 'Voting Ensemble']):
     scores = cross_val_score(clf, x, y, scoring='accuracy', cv=5)
     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
from numpy import mean, std
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = RandomForestClassifier(random_state=2022)
# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]
space['criterion'] = ['gini', 'entropy']
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)

result = search.fit(X_train,Y_train)
best_model = result.best_estimator_
predictions= best_model.predict(X_test)
# configure the cross-validation procedure
#predictions_final  = pd.DataFrame(predictions, columns = ["Incidents"])
#print(predictions_final)
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, x, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

In [6]:
from numpy import mean, std
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = CatBoostClassifier(random_state=2022)
# Defina os valores dos hiperparâmetros a serem testados
param_grid = {
    'learning_rate': [0.1, 0.5, 1],
    'depth': [3, 4, 5]
}
# define search
search = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', n_jobs=1, cv=5, refit=True)

result = search.fit(X, y)
best_model = result.best_estimator_
predictions= best_model.predict(test)
# configure the cross-validation procedure
#predictions_final  = pd.DataFrame(predictions, columns = ["Incidents"])
#print(predictions_final)
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

0:	learn: 1.4436481	total: 49.9ms	remaining: 49.9s
1:	learn: 1.3201967	total: 51.9ms	remaining: 25.9s
2:	learn: 1.2341242	total: 53.6ms	remaining: 17.8s
3:	learn: 1.1592508	total: 55.5ms	remaining: 13.8s
4:	learn: 1.1000435	total: 57.3ms	remaining: 11.4s
5:	learn: 1.0521145	total: 59ms	remaining: 9.77s
6:	learn: 1.0130643	total: 60.8ms	remaining: 8.62s
7:	learn: 0.9778298	total: 62.6ms	remaining: 7.76s
8:	learn: 0.9485041	total: 64.2ms	remaining: 7.07s
9:	learn: 0.9286947	total: 65.8ms	remaining: 6.51s
10:	learn: 0.9029714	total: 67.5ms	remaining: 6.07s
11:	learn: 0.8865701	total: 69.3ms	remaining: 5.7s
12:	learn: 0.8571813	total: 71.1ms	remaining: 5.4s
13:	learn: 0.8326859	total: 72.9ms	remaining: 5.13s
14:	learn: 0.8164786	total: 74.5ms	remaining: 4.89s
15:	learn: 0.7988364	total: 76.1ms	remaining: 4.68s
16:	learn: 0.7816462	total: 77.7ms	remaining: 4.49s
17:	learn: 0.7707866	total: 79.3ms	remaining: 4.33s
18:	learn: 0.7547676	total: 81.1ms	remaining: 4.19s
19:	learn: 0.7435824	total

---
### Guardar os resultados num ficheiro csv 

In [55]:
# acrescentar headers
submission = pd.DataFrame(predictions_Cat, columns = ["Incidents"]).head(1206)
submission.index.name = 'RowId'
submission.index += 1

#transformação dos valores para formato escrito
submission['Incidents']= submission['Incidents'].replace({0 : 'None', 1 : 'Low', 2 : 'Medium', 3 : 'High', 4 : 'Very_High'})

#passagem para ficheiro csv
submission.to_csv('sub_CatBoost_drop_mod.csv', index=True, header=True)

submission

Unnamed: 0_level_0,Incidents
RowId,Unnamed: 1_level_1
1,Very_High
2,
3,
4,Low
5,
...,...
1202,
1203,Low
1204,High
1205,Low
