## Import Libraries

In [1]:
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.utils import column_or_1d
from sklearn.svm import SVC
%matplotlib inline

---
## Get the Data

In [2]:
traffic = pd.read_csv('traffic_final.csv')
test = pd.read_csv('test_final.csv')

traffic.shape
traffic.head()

Unnamed: 0,incidents,magnitude_of_delay,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,R206,N105,N206,N310,IC5,EM579,N101,N309
0,0,0,0.0,0,0.323529,0.457143,0.1,0.181818,1.0,2,0,0,0,0,0,0,0,1,0
1,0,0,0.001484,0,0.323529,0.285714,0.1,1.0,0.782609,7,0,0,0,0,0,0,0,1,0
2,1,0,0.009919,2,0.382353,0.8,0.0,0.181818,0.652174,6,0,0,0,0,0,0,0,1,0
3,4,1,0.330219,2,0.411765,0.885714,0.1,0.727273,0.391304,4,0,1,1,0,0,0,0,1,0
4,3,0,0.0,2,0.764706,0.657143,0.1,0.454545,0.478261,1,0,0,0,0,0,0,0,1,0


In [3]:
test.shape
test.head()

Unnamed: 0,magnitude_of_delay,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,R206,N105,N206,N310,IC5,EM579,N101,N309
0,0,0.081461,2,0.464286,0.59375,0.0,0.272727,0.826087,3,0,0,0,0,0,0,0,1,0
1,0,0.0,0,0.5,0.59375,0.222222,0.818182,0.173913,4,0,0,0,0,0,0,0,1,0
2,0,0.0,2,0.714286,0.5625,0.0,0.545455,0.826087,1,0,0,0,0,0,0,0,1,0
3,0,0.009417,2,0.571429,0.28125,0.333333,0.818182,0.652174,7,0,1,0,0,0,0,0,1,0
4,0,0.0,2,0.642857,0.71875,0.0,0.818182,0.434783,2,0,0,0,0,0,0,0,1,0


---
## Model Training

In [4]:
#divisão do target
x = traffic.drop(['incidents'], axis=1) # input features - everything except the incidents feature
y = traffic['incidents']             # target feature - incidents

x

Unnamed: 0,magnitude_of_delay,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,R206,N105,N206,N310,IC5,EM579,N101,N309
0,0,0.000000,0,0.323529,0.457143,0.1,0.181818,1.000000,2,0,0,0,0,0,0,0,1,0
1,0,0.001484,0,0.323529,0.285714,0.1,1.000000,0.782609,7,0,0,0,0,0,0,0,1,0
2,0,0.009919,2,0.382353,0.800000,0.0,0.181818,0.652174,6,0,0,0,0,0,0,0,1,0
3,1,0.330219,2,0.411765,0.885714,0.1,0.727273,0.391304,4,0,1,1,0,0,0,0,1,0
4,0,0.000000,2,0.764706,0.657143,0.1,0.454545,0.478261,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.000000,0,0.352941,0.685714,0.0,0.272727,0.000000,3,0,0,0,0,0,0,0,1,0
4996,0,0.000000,2,0.529412,0.571429,0.0,0.545455,0.608696,3,0,0,0,0,0,0,0,1,0
4997,0,0.000000,0,0.352941,0.742857,0.2,0.181818,0.130435,5,0,0,0,0,0,0,0,1,0
4998,0,0.000000,0,0.323529,0.485714,0.2,0.909091,0.260870,3,0,0,0,0,0,0,0,1,0


In [5]:
y


0       0
1       0
2       1
3       4
4       3
       ..
4995    3
4996    0
4997    0
4998    0
4999    0
Name: incidents, Length: 5000, dtype: int64

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2411, random_state=2022)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1206 entries, 3419 to 922
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   magnitude_of_delay  1206 non-null   int64  
 1   delay_in_seconds    1206 non-null   float64
 2   luminosity          1206 non-null   int64  
 3   avg_temperature     1206 non-null   float64
 4   avg_atm_pressure    1206 non-null   float64
 5   avg_wind_speed      1206 non-null   float64
 6   Month               1206 non-null   float64
 7   Hour                1206 non-null   float64
 8   Day                 1206 non-null   int64  
 9   N207                1206 non-null   int64  
 10  R206                1206 non-null   int64  
 11  N105                1206 non-null   int64  
 12  N206                1206 non-null   int64  
 13  N310                1206 non-null   int64  
 14  IC5                 1206 non-null   int64  
 15  EM579               1206 non-null   int64  
 16  N101

---
### Logistic Regression

In [8]:
print("**LogisticRegressionClassifier**")
clf_Regresion = LogisticRegression(random_state=2022, solver='liblinear')
clf_Regresion.fit(X_train, Y_train)

print("**Test Data...**")
predictions_Regresion = clf_Regresion.predict(X_test)
print(predictions_Regresion)

print("Traning Data...")
scores = cross_val_score(clf_Regresion, x, y, cv=10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**LogisticRegressionClassifier**
**Test Data...**
[0 3 0 ... 3 4 4]
Traning Data...
Cross Validation Accuracy: 0.5678 (+/- 0.0287)


---
### DecisionTree

In [9]:
print("**DecisionTreeClassifier**")
clf_Tree = DecisionTreeClassifier(random_state=2022)
clf_Tree.fit(X_train,Y_train)

print("Test Data...")
predictions_Tree = clf_Tree.predict(X_test)
print(predictions_Tree)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Tree,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**DecisionTreeClassifier**
Test Data...
[1 3 0 ... 2 4 4]
Training Data...
Cross Validation Accuracy: 0.8932 (+/- 0.0327)


---
### RandomForest

In [10]:
print("**RandomForestClassifier**")
clf_Forest = RandomForestClassifier(random_state=2022)
clf_Forest.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_Forest = clf_Forest.predict(X_test)
print(predictions_Forest)

print("Training Data...")
scores = cross_val_score(clf_Forest,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[1 4 0 ... 0 4 4]
Training Data...
Cross Validation Accuracy: 0.8860 (+/- 0.0314)


In [11]:
print("**RandomForestClassifier**")
clf_Forest2 = RandomForestClassifier(n_estimators=500, random_state=2022, criterion='entropy')
clf_Forest2.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_Forest2 = clf_Forest2.predict(X_test)
print(predictions_Forest2)

print("Training Data...")
scores = cross_val_score(clf_Forest2,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[1 3 0 ... 0 4 4]
Training Data...
Cross Validation Accuracy: 0.8894 (+/- 0.0328)


---
### AdaBoostClassifier

In [46]:
print("**AdaBoostClassifier**")
base = RandomForestClassifier(n_estimators=500, random_state=2022, criterion='entropy')
clf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf_Booster.fit(X_train,Y_train)

print("Test Data...")
predictions_Booster = clf_Booster.predict(X_test)
print(predictions_Booster)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Booster,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**AdaBoostClassifier**
Test Data...
[1 1 0 ... 0 4 4]
Training Data...


---
### BaggingClassifier

In [12]:
print("**BaggingClassifier**")
clf_BG = BaggingClassifier(KNeighborsClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf_BG.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_BG = clf_BG.predict(X_test)
print(predictions_BG)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_BG,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**BaggingClassifier**
Test Data...
[0 3 0 ... 0 4 4]
Training Data...
Cross Validation Accuracy: 0.7044 (+/- 0.0336)


---
### Support Vector Machines

In [13]:
#Tuning of best parameters
#param_grid = {'C' : [0.1, 1, 10, 100, 1000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001], 'kernel' : ['rbf']}

#grid = GridSearchCV(SVC(random_state=2022), param_grid, refit=True, verbose=3)

#grid.fit(X_train, Y_train)

#grid.best_params_

#grid_predictions = grid.predict(X_test)

print("**SVC**")
clf_SVC = SVC(C=1, gamma=0.0001, kernel='rbf', random_state=2022)
clf_SVC.fit(X_train,Y_train)

print("Test Data...")
predictions_SVC = clf_SVC.predict(X_test)
print(predictions_SVC)

print("Training Data...")
scores = cross_val_score(clf_SVC,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**SVC**
Test Data...
[0 0 0 ... 0 0 0]
Training Data...
Cross Validation Accuracy: 0.4056 (+/- 0.0016)


---
### K-Nearest Neighbors

In [14]:
print("**KNeighborsClassifier**")
clf_KNN = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
clf_KNN.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_KNN = clf_KNN.predict(X_test)
print(predictions_KNN)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_KNN,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**KNeighborsClassifier**
Test Data...
[1 0 1 ... 0 4 4]
Training Data...
Cross Validation Accuracy: 0.6956 (+/- 0.0265)


In [None]:
print("**Voting Classifier**")
clf1 = DecisionTreeClassifier(random_state=2022)
clf2 = RandomForestClassifier(n_estimators=100, random_state=2022, criterion='entropy')
clf3 = BaggingClassifier(KNeighborsClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf4 = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf5 = LogisticRegression(random_state=1)
clf6 = GaussianNB()

eclf = VotingClassifier(
     estimators=[('DecisionTree', clf_Tree), ('RandomForest', clf_Forest2), ('bag', clf_BG), 
                 ('boost', clf_Booster), ('lr', clf5), ('gnb', clf6)],
     voting='hard')

#evaluate the test dataset
#test_p = eclf.predict(X_test)
#test_acc = accuracy_score(Y_test,test_p)

for clf, label in zip( [clf_Tree, clf_Forest2, clf_BG, clf_Booster, clf5, clf6, eclf], ['Decision Tree', 
     'Random Forest', 'Bagging', 'Boosting', 'Logistic Regression', 'naive Bayes', 'Voting Ensemble']):
     scores = cross_val_score(clf, x, y, scoring='accuracy', cv=5)
     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
from numpy import mean, std
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = RandomForestClassifier(random_state=2022)
# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]
space['criterion'] = ['gini', 'entropy']
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)

result = search.fit(X_train,Y_train)
best_model = result.best_estimator_
predictions= best_model.predict(X_test)
# configure the cross-validation procedure
#predictions_final  = pd.DataFrame(predictions, columns = ["Speed_Diff"])
#print(predictions_final)
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, x, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

---
### Guardar os resultados num ficheiro csv 

In [15]:
# acrescentar headers
submission = pd.DataFrame(predictions_Forest2, columns = ["Incidents"]).head(1206)
#submission.insert(0, 'RowId', range(1,(len(submission)+1)))
submission.index.name = 'RowId'
submission.index += 1

#transformação dos valores para formato escrito
submission['Incidents']= submission['Incidents'].replace({0 : 'None', 1 : 'Low', 2 : 'Medium', 3 : 'High', 4 : 'Very_High'})

#passagem para ficheiro csv
submission.to_csv('submission.csv', index=True, header=True)

submission

Unnamed: 0_level_0,Incidents
RowId,Unnamed: 1_level_1
1,Low
2,High
3,
4,
5,
...,...
1202,
1203,
1204,
1205,Very_High
