## Import Libraries

In [10]:
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
%matplotlib inline

---
## Get the Data

In [11]:
traffic = pd.read_csv('traffic_final.csv')
test = pd.read_csv('test_final.csv')

traffic.shape
traffic.head()

Unnamed: 0,incidents,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,N206,N310,R206,N105,IC5,N309,N101,EM579
0,0,0,0.323529,0.457143,0.289065,0.181818,1.0,2,0,0,0,0,0,0,0,1,0
1,0,0,0.323529,0.285714,0.289065,1.0,0.782609,7,0,0,0,0,0,0,0,1,0
2,1,2,0.382353,0.8,0.0,0.181818,0.652174,6,0,0,0,0,0,0,0,1,0
3,4,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,0,0,0,1,1,0,0,1,0
4,3,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,0,0,0,0,0,0,0,1,0


In [12]:
test.shape
test.head()

Unnamed: 0,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,N206,N310,IC5,R206,N105,N309,N101,EM579
0,2,0.464286,0.59375,0.0,0.272727,0.826087,3,0,0,0,0,0,0,0,1,0
1,0,0.5,0.59375,0.477121,0.818182,0.173913,4,0,0,0,0,0,0,0,1,0
2,2,0.714286,0.5625,0.0,0.545455,0.826087,1,0,0,0,0,0,0,0,1,0
3,2,0.571429,0.28125,0.60206,0.818182,0.652174,7,0,0,0,0,1,0,0,1,0
4,2,0.642857,0.71875,0.0,0.818182,0.434783,2,0,0,0,0,0,0,0,1,0


---
## Model Training

In [13]:
#divisão do target
x = traffic.drop(['incidents'], axis=1) # input features - everything except the incidents feature
y = traffic['incidents'].to_frame()                # target feature - incidents

x

Unnamed: 0,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N207,N206,N310,R206,N105,IC5,N309,N101,EM579
0,0,0.323529,0.457143,0.289065,0.181818,1.000000,2,0,0,0,0,0,0,0,1,0
1,0,0.323529,0.285714,0.289065,1.000000,0.782609,7,0,0,0,0,0,0,0,1,0
2,2,0.382353,0.800000,0.000000,0.181818,0.652174,6,0,0,0,0,0,0,0,1,0
3,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,0,0,0,1,1,0,0,1,0
4,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.352941,0.685714,0.000000,0.272727,0.000000,3,0,0,0,0,0,0,0,1,0
4996,2,0.529412,0.571429,0.000000,0.545455,0.608696,3,0,0,0,0,0,0,0,1,0
4997,0,0.352941,0.742857,0.458157,0.181818,0.130435,5,0,0,0,0,0,0,0,1,0
4998,0,0.323529,0.485714,0.458157,0.909091,0.260870,3,0,0,0,0,0,0,0,1,0


In [14]:
y

Unnamed: 0,incidents
0,0
1,0
2,1
3,4
4,3
...,...
4995,3
4996,0
4997,0
4998,0


In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2411, random_state=2022)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1206 entries, 3419 to 922
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   luminosity        1206 non-null   int64  
 1   avg_temperature   1206 non-null   float64
 2   avg_atm_pressure  1206 non-null   float64
 3   avg_wind_speed    1206 non-null   float64
 4   Month             1206 non-null   float64
 5   Hour              1206 non-null   float64
 6   Day               1206 non-null   int64  
 7   N207              1206 non-null   int64  
 8   N206              1206 non-null   int64  
 9   N310              1206 non-null   int64  
 10  R206              1206 non-null   int64  
 11  N105              1206 non-null   int64  
 12  IC5               1206 non-null   int64  
 13  N309              1206 non-null   int64  
 14  N101              1206 non-null   int64  
 15  EM579             1206 non-null   int64  
dtypes: float64(5), int64(11)
memory usage: 1

---
### DecisionTree

In [18]:
print("**DecisionTreeClassifier**")
clf_Tree = DecisionTreeClassifier(random_state=2022)
clf_Tree.fit(X_train,Y_train)

print("Test Data...")
predictions_Tree = clf_Tree.predict(X_test)
print(predictions_Tree)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Tree,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**DecisionTreeClassifier**
Test Data...
[0 3 0 ... 2 4 3]
Training Data...
Cross Validation Accuracy: 0.6722 (+/- 0.0207)


---
### RandomForest

In [17]:
print("**RandomForestClassifier**")
clf_Forest = RandomForestClassifier(random_state=2022)
clf_Forest.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_Forest = clf_Forest.predict(X_test)
print(predictions_Forest)

print("Training Data...")
scores = cross_val_score(clf_Forest,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[0 3 0 ... 2 4 3]
Training Data...


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Cross Validation Accuracy: 0.7258 (+/- 0.0244)


In [16]:
print("**RandomForestClassifier**")
clf_Forest2 = RandomForestClassifier(n_estimators=500, random_state=2022, criterion='entropy')
clf_Forest2.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_Forest2 = clf_Forest2.predict(test)
print(predictions_Forest2)

print("Training Data...")
scores = cross_val_score(clf_Forest2,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[4 0 0 ... 0 3 3]
Training Data...


Feature names must be in the same order as they were in fit.

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Cross Validation Accuracy: 0.7318 (+/- 0.0217)


---
### AdaBoostClassifier

In [7]:
print("**AdaBoostClassifier**")
base = RandomForestClassifier(n_estimators=100, random_state=2022, criterion='entropy')
clf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf_Booster.fit(X_train,Y_train)

print("Test Data...")
predictions_Booster = clf_Booster.predict(X_test)
print(predictions_Booster)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Booster,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**AdaBoostClassifier**


  y = column_or_1d(y, warn=True)


Test Data...
[1 3 0 ... 0 4 4]
Training Data...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross Validation Accuracy: 0.7172 (+/- 0.0348)


---
### BaggingClassifier

In [8]:
print("**BaggingClassifier**")
clf_BG = BaggingClassifier(KNeighborsClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf_BG.fit(X_train,Y_train.values.ravel())

print("Test Data...")
predictions_BG = clf_BG.predict(X_test)
print(predictions_BG)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_BG,x,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**BaggingClassifier**
Test Data...
[0 3 0 ... 0 4 4]
Training Data...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross Validation Accuracy: 0.6454 (+/- 0.0234)


  y = column_or_1d(y, warn=True)


In [20]:
print("**Voting Classifier**")
clf1 = DecisionTreeClassifier(random_state=2022)
clf2 = RandomForestClassifier(n_estimators=100, random_state=2022, criterion='entropy')
clf3 = BaggingClassifier(KNeighborsClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf4 = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf5 = LogisticRegression(random_state=1)
clf6 = GaussianNB()

eclf = VotingClassifier(
     estimators=[('DecisionTree', clf_Tree), ('RandomForest', clf_Forest2), ('bag', clf_BG), 
                 ('boost', clf_Booster), ('lr', clf5), ('gnb', clf6)],
     voting='hard')

#evaluate the test dataset
#test_p = eclf.predict(X_test)
#test_acc = accuracy_score(Y_test,test_p)

for clf, label in zip( [clf_Tree, clf_Forest2, clf_BG, clf_Booster, clf5, clf6, eclf], ['Decision Tree', 
     'Random Forest', 'Bagging', 'Boosting', 'Logistic Regression', 'naive Bayes', 'Voting Ensemble']):
     scores = cross_val_score(clf, x, y, scoring='accuracy', cv=5)
     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

**Voting Classifier**
Accuracy: 0.67 (+/- 0.01) [Decision Tree]


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Accuracy: 0.72 (+/- 0.01) [Random Forest]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.64 (+/- 0.01) [Bagging]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.71 (+/- 0.01) [Boosting]


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

Accuracy: 0.51 (+/- 0.01) [Logistic Regression]
Accuracy: 0.47 (+/- 0.02) [naive Bayes]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown 

Accuracy: 0.68 (+/- 0.01) [Voting Ensemble]


In [21]:
from numpy import mean, std
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = RandomForestClassifier(random_state=2022)
# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]
space['criterion'] = ['gini', 'entropy']
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)

result = search.fit(X_train,Y_train)
best_model = result.best_estimator_
predictions= best_model.predict(X_test)
# configure the cross-validation procedure
#predictions_final  = pd.DataFrame(predictions, columns = ["Speed_Diff"])
#print(predictions_final)
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, x, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Accuracy: 0.722 (0.015)


---
### Guardar os resultados num ficheiro csv 

In [22]:
# acrescentar headers
submission = pd.DataFrame(predictions, columns = ["Incidents"])
submission.insert(0, "RowId", range(1,1207), True)

#transformação dos valores para formato escrito
submission['Incidents']= submission['Incidents'].replace({0 : 'None', 1 : 'Low', 2 : 'Medium', 3 : 'High', 4 : 'Very_High'})

#passagem para ficheiro csv
submission.to_csv('submission.csv', index=False)

submission

Unnamed: 0,RowId,Incidents
0,1,
1,2,High
2,3,
3,4,
4,5,
...,...,...
1201,1202,
1202,1203,
1203,1204,Medium
1204,1205,Very_High
