## Import Libraries

In [7]:
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import column_or_1d
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from numpy import mean, std
%matplotlib inline

---
## Get the Data

In [8]:
traffic = pd.read_csv('traffic_final.csv')
test = pd.read_csv('test_final.csv')

traffic.shape
traffic.head()

Unnamed: 0,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,incidents,Month,Hour,Day,Day_Part,affected_roads_encoded
0,0.0,0,0.323529,0.457143,0.289065,0,0.181818,1.0,2,3,1
1,0.001484,0,0.323529,0.285714,0.289065,0,1.0,0.782609,7,2,1
2,0.009919,2,0.382353,0.8,0.0,1,0.181818,0.652174,6,2,1
3,0.330219,2,0.411765,0.885714,0.289065,4,0.727273,0.391304,4,1,2
4,0.0,2,0.764706,0.657143,0.289065,3,0.454545,0.478261,1,1,1


In [9]:
test.shape
test.head()

Unnamed: 0,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,Day_Part,affected_roads_encoded
0,0.081461,2,0.464286,0.59375,0.0,0.272727,0.826087,3,3,1
1,0.0,0,0.5,0.59375,0.477121,0.818182,0.173913,4,0,1
2,0.0,2,0.714286,0.5625,0.0,0.545455,0.826087,1,3,1
3,0.009417,2,0.571429,0.28125,0.60206,0.818182,0.652174,7,2,2
4,0.0,2,0.642857,0.71875,0.0,0.818182,0.434783,2,1,1


---
## Model Training

In [10]:
#divisão do target
X = traffic.drop(['incidents'], axis=1) # input features - everything except the incidents feature
y = traffic['incidents']                # target feature - incidents

X

Unnamed: 0,delay_in_seconds,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,Day_Part,affected_roads_encoded
0,0.000000,0,0.323529,0.457143,0.289065,0.181818,1.000000,2,3,1
1,0.001484,0,0.323529,0.285714,0.289065,1.000000,0.782609,7,2,1
2,0.009919,2,0.382353,0.800000,0.000000,0.181818,0.652174,6,2,1
3,0.330219,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,1,2
4,0.000000,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,1,1
...,...,...,...,...,...,...,...,...,...,...
4995,0.000000,0,0.352941,0.685714,0.000000,0.272727,0.000000,3,0,1
4996,0.000000,2,0.529412,0.571429,0.000000,0.545455,0.608696,3,2,1
4997,0.000000,0,0.352941,0.742857,0.458157,0.181818,0.130435,5,0,1
4998,0.000000,0,0.323529,0.485714,0.458157,0.909091,0.260870,3,0,1


In [11]:
y


0       0
1       0
2       1
3       4
4       3
       ..
4995    3
4996    0
4997    0
4998    0
4999    0
Name: incidents, Length: 5000, dtype: int64

In [54]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=2022)
#X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 3419 to 4546
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   delay_in_seconds        1500 non-null   float64
 1   luminosity              1500 non-null   int64  
 2   avg_temperature         1500 non-null   float64
 3   avg_atm_pressure        1500 non-null   float64
 4   avg_wind_speed          1500 non-null   float64
 5   Month                   1500 non-null   float64
 6   Hour                    1500 non-null   float64
 7   Day                     1500 non-null   int64  
 8   Day_Part                1500 non-null   int64  
 9   affected_roads_encoded  1500 non-null   int64  
dtypes: float64(6), int64(4)
memory usage: 128.9 KB


---
### Logistic Regression

In [58]:
print("**LogisticRegressionClassifier**")
clf_Regresion = LogisticRegression(random_state=2022, solver='liblinear')
clf_Regresion.fit(X, y)

print("**Test Data...**")
predictions_Regresion = clf_Regresion.predict(test)
print(predictions_Regresion)

print("Traning Data...")
scores = cross_val_score(clf_Regresion, X, y, cv=10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**LogisticRegressionClassifier**
**Test Data...**
[3 0 0 ... 0 3 0]
Traning Data...
Cross Validation Accuracy: 0.5464 (+/- 0.0244)


---
### DecisionTree

In [33]:
print("**DecisionTreeClassifier**")
clf_Tree = DecisionTreeClassifier(random_state=2022)
clf_Tree.fit(X, y)

print("Test Data...")
predictions_Tree = clf_Tree.predict(test)
print(predictions_Tree)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Tree,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**DecisionTreeClassifier**
Test Data...
[4 0 0 ... 3 3 3]
Training Data...
Cross Validation Accuracy: 0.8848 (+/- 0.0347)


---
### RandomForest

In [34]:
print("**RandomForestClassifier**")
clf_Forest = RandomForestClassifier(random_state=2022)
clf_Forest.fit(X, y)

print("Test Data...")
predictions_Forest = clf_Forest.predict(test)
print(predictions_Forest)

print("Training Data...")
scores = cross_val_score(clf_Forest,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.8870 (+/- 0.0297)


In [21]:
print("**RandomForestClassifier**")
clf_Forest2 = RandomForestClassifier(n_estimators=500, random_state=2022, criterion='entropy')
clf_Forest2.fit(X, y)

print("Test Data...")
predictions_Forest2 = clf_Forest2.predict(test)
print(predictions_Forest2)

print("Training Data...")
scores = cross_val_score(clf_Forest2,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**RandomForestClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.8914 (+/- 0.0324)


---
### AdaBoostClassifier

In [11]:
'''
print("**AdaBoostClassifier**")
base = RandomForestClassifier(n_estimators=200, random_state=2022, criterion='entropy')
clf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)
clf_Booster.fit(X, y)

print("Test Data...")
predictions_Booster = clf_Booster.predict(test)
print(predictions_Booster)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Booster,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
'''

'\nprint("**AdaBoostClassifier**")\nbase = RandomForestClassifier(n_estimators=200, random_state=2022, criterion=\'entropy\')\nclf_Booster = AdaBoostClassifier(n_estimators=100, random_state=2022, base_estimator=base)\nclf_Booster.fit(X, y)\n\nprint("Test Data...")\npredictions_Booster = clf_Booster.predict(test)\nprint(predictions_Booster)\n\n#k cross val\nprint("Training Data...")\nscores = cross_val_score(clf_Booster,X,y,cv = 10)\nprint("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))\n'

---
### BaggingClassifier

In [36]:
print("**BaggingClassifier**")
clf_BG = BaggingClassifier(RandomForestClassifier(), random_state=2022, max_samples=0.5, max_features=0.5)
clf_BG.fit(X, y)

print("Test Data...")
predictions_BG = clf_BG.predict(test)
print(predictions_BG)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_BG,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**BaggingClassifier**
Test Data...
[3 0 0 ... 3 1 0]
Training Data...
Cross Validation Accuracy: 0.7484 (+/- 0.0194)


---
### Support Vector Machines

In [37]:
#Tuning of best parameters
#param_grid = {'C' : [0.1, 1, 10, 100, 1000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001], 'kernel' : ['rbf']}

#grid = GridSearchCV(SVC(random_state=2022), param_grid, refit=True, verbose=3)

#grid.fit(X_train, Y_train)

#grid.best_params_

#grid_predictions = grid.predict(X_test)

print("**SVC**")
clf_SVC = SVC(random_state=2022)
clf_SVC.fit(X, y)

print("Test Data...")
predictions_SVC = clf_SVC.predict(test)
print(predictions_SVC)

print("Training Data...")
scores = cross_val_score(clf_SVC,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**SVC**
Test Data...
[0 0 0 ... 0 0 0]
Training Data...
Cross Validation Accuracy: 0.4976 (+/- 0.0127)


---
### K-Nearest Neighbors

In [38]:
print("**KNeighborsClassifier**")
clf_KNN = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
clf_KNN.fit(X, y)

print("Test Data...")
predictions_KNN = clf_KNN.predict(test)
print(predictions_KNN)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_KNN,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**KNeighborsClassifier**
Test Data...
[4 0 0 ... 0 0 0]
Training Data...
Cross Validation Accuracy: 0.6474 (+/- 0.0329)


---
### CatBoost

In [20]:
print("**CatBoostClassifier**")
clf_Cat = CatBoostClassifier(random_state=2022)
clf_Cat.fit(X, y)

print("Test Data...")
predictions_Cat = clf_Cat.predict(test)
print(predictions_Cat)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Cat,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**CatBoostClassifier**
Learning rate set to 0.085896
0:	learn: 1.4196097	total: 2.9ms	remaining: 2.9s
1:	learn: 1.2964908	total: 6.46ms	remaining: 3.22s
2:	learn: 1.2028484	total: 9.52ms	remaining: 3.16s
3:	learn: 1.1297335	total: 12.3ms	remaining: 3.07s
4:	learn: 1.0689628	total: 15ms	remaining: 2.99s
5:	learn: 1.0096398	total: 18.7ms	remaining: 3.1s
6:	learn: 0.9590526	total: 21ms	remaining: 2.98s
7:	learn: 0.9165271	total: 23.4ms	remaining: 2.9s
8:	learn: 0.8804938	total: 26.1ms	remaining: 2.88s
9:	learn: 0.8502201	total: 28.7ms	remaining: 2.84s
10:	learn: 0.8252308	total: 31.3ms	remaining: 2.81s
11:	learn: 0.7982479	total: 33.9ms	remaining: 2.79s
12:	learn: 0.7713428	total: 36.3ms	remaining: 2.75s
13:	learn: 0.7500242	total: 39.1ms	remaining: 2.75s
14:	learn: 0.7314561	total: 41.8ms	remaining: 2.75s
15:	learn: 0.7141055	total: 44.4ms	remaining: 2.73s
16:	learn: 0.7000979	total: 46.8ms	remaining: 2.71s
17:	learn: 0.6885070	total: 48.9ms	remaining: 2.67s
18:	learn: 0.6783603	total: 5

---
### LGBMClassifier

In [40]:
print("**LGBMClassifier**")
clf_LGBM = LGBMClassifier(random_state=2022)
clf_LGBM.fit(X, y)

print("Test Data...")
predictions_LGBM = clf_LGBM.predict(test)
print(predictions_LGBM)

print("Training Data...")
scores = cross_val_score(clf_LGBM,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**LGBMClassifier**
Test Data...
[3 0 0 ... 3 3 3]
Training Data...
Cross Validation Accuracy: 0.9110 (+/- 0.0257)


---
### XGBoost

In [41]:
print("**XGBClassifier**")
clf_XGB = XGBClassifier(random_state=2022)
clf_XGB.fit(X, y)

print("Test Data...")
predictions_XGB = clf_XGB.predict(test)
print(predictions_XGB)

print("Training Data...")
scores = cross_val_score(clf_XGB,X,y,cv = 10)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**XGBClassifier**
Test Data...
[3 0 0 ... 3 1 3]
Training Data...
Cross Validation Accuracy: 0.9134 (+/- 0.0288)


In [17]:
#Tunning do Random Forest
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
# define the model
model = RandomForestClassifier(random_state=2022)
# define search space
space = dict()
space['n_estimators'] = [10, 100, 500]
space['max_features'] = [2, 4, 6]
space['criterion'] = ['gini', 'entropy']
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)

result = search.fit(X, y)
best_model = result.best_estimator_
predictions= best_model.predict(test)


cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# execute the nested cross-validation
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.913 (0.018)


In [None]:
# Tunning do CatBoost
# Inicialize o classificador CatBoost
model = CatBoostClassifier()

# Defina os valores dos hiperparâmetros a serem testados
param_grid = {'depth'         : [4,5,6,7,8,9, 10],
                 'learning_rate' : [0.01,0.02,0.03,0.04],
                  'iterations'    : [10, 20,30,40,50,60,70,80,90, 100]
                 }

# Crie o objeto de pesquisa em grade
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, n_jobs=-1)

# Treine o modelo usando os dados de treinamento
result = grid_search.fit(X, y)


predictions= result.best_estimator_.predict(test)

# execute the nested cross-validation
scores = cross_val_score(grid_search, X, y, scoring='accuracy', cv=2, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

In [12]:
# Tunning do XGBoost
# Inicialize o classificador XGBoost
model = XGBClassifier()

# Defina os valores dos hiperparâmetros a serem testados
param_grid = {
    'n_estimators': [10, 100, 500],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Crie o objeto de pesquisa em grade
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Treine o modelo usando os dados de treinamento
result = grid_search.fit(X, y)

predictions= result.best_estimator_.predict(test)

# execute the nested cross-validation
scores = cross_val_score(grid_search, X, y, scoring='accuracy', cv=10, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.921 (0.015)


In [15]:
# Tunning do LGBM
# Inicialize o classificador XGBoost
model = LGBMClassifier(random_state=2022)

# Defina os valores dos hiperparâmetros a serem testados
param_grid = {
    'n_estimators': [10, 100, 500],
    'max_depth': [1, 5, 10],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Crie o objeto de pesquisa em grade
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Treine o modelo usando os dados de treinamento
result = grid_search.fit(X, y)

predictions= result.best_estimator_.predict(test)

# execute the nested cross-validation
scores = cross_val_score(grid_search, X, y, scoring='accuracy', cv=10, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.913 (0.016)


---
### Guardar os resultados num ficheiro csv 

In [18]:
# acrescentar headers
submission = pd.DataFrame(predictions, columns = ["Incidents"]).head(1206)
submission.index.name = 'RowId'
submission.index += 1

#transformação dos valores para formato escrito
submission['Incidents']= submission['Incidents'].replace({0 : 'None', 1 : 'Low', 2 : 'Medium', 3 : 'High', 4 : 'Very_High'})

#passagem para ficheiro csv
submission.to_csv('sub_RF_Tunning_PDv2.csv', index=True, header=True)

submission

Unnamed: 0_level_0,Incidents
RowId,Unnamed: 1_level_1
1,High
2,
3,
4,Low
5,
...,...
1202,
1203,
1204,High
1205,Low
