## Import Libraries

In [238]:
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import column_or_1d
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from numpy import mean, std
%matplotlib inline

---
## Get the Data

In [239]:
traffic = pd.read_csv('traffic_final.csv')
test = pd.read_csv('test_final.csv')

traffic.shape
traffic.head()

Unnamed: 0,magnitude_of_delay,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,incidents,Month,Day,Day_Part,affected_roads_encoded
0,0,0.0,0,0.323529,0.457143,0.693147,0,3,2,3,0.6772
1,0,0.001484,0,0.323529,0.285714,0.693147,0,12,7,2,0.0042
2,0,0.009919,2,0.382353,0.8,0.0,1,3,6,2,0.6772
3,2,0.330219,2,0.411765,0.885714,0.693147,4,9,4,1,0.0002
4,0,0.0,2,0.764706,0.657143,0.693147,3,6,1,1,0.6772


In [240]:
test.shape
test.head()

Unnamed: 0,magnitude_of_delay,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Day,Day_Part,affected_roads_encoded
0,0,0.081461,2,0.464286,0.59375,0.0,4,3,3,0.045605
1,0,0.0,0,0.5,0.59375,1.098612,10,4,0,0.671642
2,0,0.0,2,0.714286,0.5625,0.0,7,1,3,0.671642
3,0,0.009417,2,0.571429,0.28125,1.386294,10,7,2,0.003317
4,0,0.0,2,0.642857,0.71875,0.0,10,2,1,0.671642


---
## Model Training

In [241]:
#divisão do target
X = traffic.drop(['incidents'], axis=1) # input features - everything except the incidents feature
y = traffic['incidents']                # target feature - incidents

X

Unnamed: 0,magnitude_of_delay,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Day,Day_Part,affected_roads_encoded
0,0,0.000000,0,0.323529,0.457143,0.693147,3,2,3,0.6772
1,0,0.001484,0,0.323529,0.285714,0.693147,12,7,2,0.0042
2,0,0.009919,2,0.382353,0.800000,0.000000,3,6,2,0.6772
3,2,0.330219,2,0.411765,0.885714,0.693147,9,4,1,0.0002
4,0,0.000000,2,0.764706,0.657143,0.693147,6,1,1,0.6772
...,...,...,...,...,...,...,...,...,...,...
4995,0,0.000000,0,0.352941,0.685714,0.000000,4,3,0,0.6772
4996,0,0.000000,2,0.529412,0.571429,0.000000,7,3,2,0.6772
4997,0,0.000000,0,0.352941,0.742857,1.098612,3,5,0,0.6772
4998,0,0.000000,0,0.323529,0.485714,1.098612,11,3,0,0.6772


In [242]:
y


0       0
1       0
2       1
3       4
4       3
       ..
4995    3
4996    0
4997    0
4998    0
4999    0
Name: incidents, Length: 5000, dtype: int64

In [236]:
from sklearn.feature_selection import SelectKBest
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=2022)
#X_test.info()
# Selecione os k melhores atributos
selector = SelectKBest(k=7)
X = selector.fit_transform(X, y)

X = pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.000000,0.0,0.457143,3.0,3.0,0.6772
1,0.0,0.001484,0.0,0.285714,12.0,2.0,0.0042
2,0.0,0.009919,2.0,0.800000,3.0,2.0,0.6772
3,2.0,0.330219,2.0,0.885714,9.0,1.0,0.0002
4,0.0,0.000000,2.0,0.657143,6.0,1.0,0.6772
...,...,...,...,...,...,...,...
4995,0.0,0.000000,0.0,0.685714,4.0,0.0,0.6772
4996,0.0,0.000000,2.0,0.571429,7.0,2.0,0.6772
4997,0.0,0.000000,0.0,0.742857,3.0,0.0,0.6772
4998,0.0,0.000000,0.0,0.485714,11.0,0.0,0.6772


---
### Logistic Regression

In [187]:
print("**LogisticRegressionClassifier**")
clf_Regresion = LogisticRegression(random_state=2022, solver='liblinear')
clf_Regresion.fit(X, y)

print("**Test Data...**")
predictions_Regresion = clf_Regresion.predict(test)
print(predictions_Regresion)

print("Traning Data...")
scores = cross_val_score(clf_Regresion, X, y, cv=10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**LogisticRegressionClassifier**
**Test Data...**
[4 0 0 ... 4 3 0]
Traning Data...
Cross Validation Accuracy: 0.5418 (+/- 0.0166)


---
### DecisionTree

In [197]:
print("**DecisionTreeClassifier**")
clf_Tree = DecisionTreeClassifier(random_state=2022)
clf_Tree.fit(X, y)

print("Test Data...")
predictions_Tree = clf_Tree.predict(test)
print(predictions_Tree)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Tree,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**DecisionTreeClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.9034 (+/- 0.0128)


---
### RandomForest

In [198]:
print("**RandomForestClassifier**")
clf_Forest = RandomForestClassifier(random_state=2022)
clf_Forest.fit(X, y)

# Obtenha a importância de cada atributo
importances = clf_Forest.feature_importances_

# Imprima a importância de cada atributo
for feature, importance in zip(X, importances):
    print(feature, ":", importance)

print("Test Data...")
predictions_Forest = clf_Forest.predict(test)
print(predictions_Forest)

print("Training Data...")
scores = cross_val_score(clf_Forest,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**RandomForestClassifier**
magnitude_of_delay : 0.03472800243580868
delay_in_seconds : 0.2695355557000526
luminosity : 0.01826022830818463
avg_temperature : 0.08613341410858272
avg_atm_pressure : 0.10453389106043132
avg_wind_speed : 0.044248490416503486
Month : 0.26624303473589783
Day : 0.0664121467484278
Day_Part : 0.029172792737502673
affected_roads_encoded : 0.08073244374860819
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.9038 (+/- 0.0123)


In [117]:
print("**RandomForestClassifier**")
clf_Forest2 = RandomForestClassifier(n_estimators=500, random_state=2022, criterion='entropy')
clf_Forest2.fit(X, y)

print("Test Data...")
predictions_Forest2 = clf_Forest2.predict(test)
print(predictions_Forest2)

print("Training Data...")
scores = cross_val_score(clf_Forest2,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**RandomForestClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.9056 (+/- 0.0284)


---
### AdaBoostClassifier

In [134]:

print("**AdaBoostClassifier**")
base = RandomForestClassifier(n_estimators=100, random_state=2022, criterion='entropy')
clf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf_Booster.fit(X, y)

print("Test Data...")
predictions_Booster = clf_Booster.predict(test)
print(predictions_Booster)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Booster,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() ))

**AdaBoostClassifier**
Test Data...
[4 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.8498 (+/- 0.0429)


---
### BaggingClassifier

In [129]:
print("**BaggingClassifier**")
clf_BG = BaggingClassifier(RandomForestClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf_BG.fit(X, y)

print("Test Data...")
predictions_BG = clf_BG.predict(test)
print(predictions_BG)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_BG,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() ))

**BaggingClassifier**
Test Data...
[4 0 0 ... 3 1 0]
Training Data...
Cross Validation Accuracy: 0.8050 (+/- 0.0345)


---
### Support Vector Machines

In [130]:
#Tuning of best parameters
#param_grid = {'C' : [0.1, 1, 10, 100, 1000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001], 'kernel' : ['rbf']}

#grid = GridSearchCV(SVC(random_state=2022), param_grid, refit=True, verbose=3)

#grid.fit(X_train, Y_train)

#grid.best_params_

#grid_predictions = grid.predict(X_test)

print("**SVC**")
clf_SVC = SVC(random_state=2022)
clf_SVC.fit(X, y)

print("Test Data...")
predictions_SVC = clf_SVC.predict(test)
print(predictions_SVC)

print("Training Data...")
scores = cross_val_score(clf_SVC,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**SVC**
Test Data...
[3 0 0 ... 3 3 0]
Training Data...
Cross Validation Accuracy: 0.6748 (+/- 0.0344)


---
### K-Nearest Neighbors

In [131]:
print("**KNeighborsClassifier**")
clf_KNN = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
clf_KNN.fit(X, y)

print("Test Data...")
predictions_KNN = clf_KNN.predict(test)
print(predictions_KNN)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_KNN,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() ))

**KNeighborsClassifier**
Test Data...
[4 0 0 ... 1 0 0]
Training Data...
Cross Validation Accuracy: 0.7460 (+/- 0.0347)


---
### CatBoost

In [82]:
print("**CatBoostClassifier**")
clf_Cat = CatBoostClassifier(random_state=2022)
clf_Cat.fit(X, y)

print("Test Data...")
predictions_Cat = clf_Cat.predict(test)
print(predictions_Cat)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Cat,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**CatBoostClassifier**
Learning rate set to 0.085896
0:	learn: 1.4352876	total: 2.99ms	remaining: 2.99s
1:	learn: 1.3061583	total: 6.11ms	remaining: 3.05s
2:	learn: 1.2015904	total: 9.06ms	remaining: 3.01s
3:	learn: 1.1203572	total: 12.3ms	remaining: 3.06s
4:	learn: 1.0585334	total: 15.9ms	remaining: 3.15s
5:	learn: 1.0035036	total: 18.2ms	remaining: 3.01s
6:	learn: 0.9536877	total: 20.4ms	remaining: 2.9s
7:	learn: 0.9154469	total: 23.2ms	remaining: 2.88s
8:	learn: 0.8802192	total: 26ms	remaining: 2.86s
9:	learn: 0.8424585	total: 28.4ms	remaining: 2.81s
10:	learn: 0.8139577	total: 30.6ms	remaining: 2.75s
11:	learn: 0.7866513	total: 32.9ms	remaining: 2.71s
12:	learn: 0.7595238	total: 35.1ms	remaining: 2.66s
13:	learn: 0.7383187	total: 37.5ms	remaining: 2.64s
14:	learn: 0.7180480	total: 39.7ms	remaining: 2.61s
15:	learn: 0.7023416	total: 41.9ms	remaining: 2.58s
16:	learn: 0.6896498	total: 44ms	remaining: 2.54s
17:	learn: 0.6770717	total: 46.1ms	remaining: 2.52s
18:	learn: 0.6621846	total

---
### LGBMClassifier

In [199]:
print("**LGBMClassifier**")
clf_LGBM = LGBMClassifier(random_state=2022)
clf_LGBM.fit(X, y)

print("Test Data...")
predictions_LGBM = clf_LGBM.predict(test)
print(predictions_LGBM)

print("Training Data...")
scores = cross_val_score(clf_LGBM,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() ))

**LGBMClassifier**
Test Data...
[4 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.9196 (+/- 0.0113)


---
### XGBoost

In [200]:
print("**XGBClassifier**")
clf_XGB = XGBClassifier(random_state=2022)
clf_XGB.fit(X, y)

# Obtenha a importância de cada atributo
importances = clf_XGB.feature_importances_

# Imprima a importância de cada atributo
for feature, importance in zip(X, importances):
    print(feature, ":", importance)

print("Test Data...")
predictions_XGB = clf_XGB.predict(test)
print(predictions_XGB)

print("Training Data...")
scores = cross_val_score(clf_XGB,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

**XGBClassifier**
magnitude_of_delay : 0.10871685
delay_in_seconds : 0.33051327
luminosity : 0.027234672
avg_temperature : 0.035695817
avg_atm_pressure : 0.05416763
avg_wind_speed : 0.029879848
Month : 0.2826229
Day : 0.046305556
Day_Part : 0.026704194
affected_roads_encoded : 0.05815925
Test Data...
[4 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.9226 (+/- 0.0107)


In [180]:
#Tunning do Random Forest
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = RandomForestClassifier(random_state=2022, min_samples_leaf=10, min_samples_split=20)
# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]
space['criterion'] = ['gini', 'entropy']
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)

result = search.fit(X, y)
best_model = result.best_estimator_
predictions= best_model.predict(test)


cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.872 (0.020)


In [243]:

# Tunning do XGBoost
# Inicialize o classificador XGBoost
model = XGBClassifier()

# Defina os valores dos hiperparâmetros a serem testados
param_grid = {
    'n_estimators': [10, 100, 500],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Crie o objeto de pesquisa em grade
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)

# Treine o modelo usando os dados de treinamento
result = grid_search.fit(X, y)

predictions= result.best_estimator_.predict(test)

# execute the nested cross-validation
scores = cross_val_score(grid_search, X, y, scoring='accuracy', cv=10, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.924 (0.009)


In [111]:
# Tunning do LGBM
# Inicialize o classificador XGBoost
model = LGBMClassifier(random_state=2022)

# Defina os valores dos hiperparâmetros a serem testados
param_grid = {
    'n_estimators': [10, 100, 500],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Crie o objeto de pesquisa em grade
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Treine o modelo usando os dados de treinamento
result = grid_search.fit(X, y)

predictions= result.best_estimator_.predict(test)

# execute the nested cross-validation
scores = cross_val_score(grid_search, X, y, scoring='accuracy', cv=10, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.919 (0.012)


---
### Guardar os resultados num ficheiro csv 

In [246]:
# acrescentar headers
submission = pd.DataFrame(predictions, columns = ["Incidents"]).head(1206)
submission.index.name = 'RowId'
submission.index += 1

#transformação dos valores para formato escrito
submission['Incidents']= submission['Incidents'].replace({0 : 'None', 1 : 'Low', 2 : 'Medium', 3 : 'High', 4 : 'Very_High'})

#passagem para ficheiro csv
submission.to_csv('sub_XGB_Tunning_7best_features.csv', index=True, header=True)

submission

Unnamed: 0_level_0,Incidents
RowId,Unnamed: 1_level_1
1,Very_High
2,
3,
4,Low
5,
...,...
1202,
1203,Low
1204,High
1205,Low
