## Import Libraries

In [33]:
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
%matplotlib inline

---
## Get the Data

In [38]:
traffic = pd.read_csv('traffic_final.csv')
test = pd.read_csv('test_final.csv')

traffic.shape
traffic.head()

Unnamed: 0,incidents,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N310,N206,N206.1,N101,N309,IC5,N207,IC5.1,EM579,N105,R206
0,0,0,0.323529,0.457143,0.289065,0.181818,1.0,2,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0.323529,0.285714,0.289065,1.0,0.782609,7,0,0,0,1,0,0,0,0,0,0,0
2,1,2,0.382353,0.8,0.0,0.181818,0.652174,6,0,0,0,1,0,0,0,0,0,0,0
3,4,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,0,0,0,1,0,0,0,0,0,1,1
4,3,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,0,0,0,1,0,0,0,0,0,0,0


In [37]:
test.shape
test.head()

Unnamed: 0,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N310,N206,N101,N206.1,N101.1,N309,IC5,N207,IC5.1,EM579,N105,N310.1,R206
0,2,0.464286,0.59375,0.0,0.272727,0.826087,3,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0.5,0.59375,0.477121,0.818182,0.173913,4,0,0,0,0,1,0,0,0,0,0,0,0,0
2,2,0.714286,0.5625,0.0,0.545455,0.826087,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,2,0.571429,0.28125,0.60206,0.818182,0.652174,7,0,0,0,0,1,0,0,0,0,0,0,0,1
4,2,0.642857,0.71875,0.0,0.818182,0.434783,2,0,0,0,0,1,0,0,0,0,0,0,0,0


---
## Model Training

In [16]:
#divisão do target
x = traffic.drop(['incidents'], axis=1) # input features - everything except the incidents feature
y = traffic['incidents']                # target feature - incidents

x

Unnamed: 0,luminosity,avg_temperature,avg_atm_pressure,avg_wind_speed,Month,Hour,Day,N310,N206,N206.1,N101,N309,IC5,N207,IC5.1,EM579,N105,R206
0,0,0.323529,0.457143,0.289065,0.181818,1.000000,2,0,0,0,1,0,0,0,0,0,0,0
1,0,0.323529,0.285714,0.289065,1.000000,0.782609,7,0,0,0,1,0,0,0,0,0,0,0
2,2,0.382353,0.800000,0.000000,0.181818,0.652174,6,0,0,0,1,0,0,0,0,0,0,0
3,2,0.411765,0.885714,0.289065,0.727273,0.391304,4,0,0,0,1,0,0,0,0,0,1,1
4,2,0.764706,0.657143,0.289065,0.454545,0.478261,1,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.352941,0.685714,0.000000,0.272727,0.000000,3,0,0,0,1,0,0,0,0,0,0,0
4996,2,0.529412,0.571429,0.000000,0.545455,0.608696,3,0,0,0,1,0,0,0,0,0,0,0
4997,0,0.352941,0.742857,0.458157,0.181818,0.130435,5,0,0,0,1,0,0,0,0,0,0,0
4998,0,0.323529,0.485714,0.458157,0.909091,0.260870,3,0,0,0,1,0,0,0,0,0,0,0


In [17]:
y

0       0
1       0
2       1
3       4
4       3
       ..
4995    3
4996    0
4997    0
4998    0
4999    0
Name: incidents, Length: 5000, dtype: int64

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2411, random_state=2022)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1206 entries, 3419 to 922
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   luminosity        1206 non-null   int64  
 1   avg_temperature   1206 non-null   float64
 2   avg_atm_pressure  1206 non-null   float64
 3   avg_wind_speed    1206 non-null   float64
 4   Month             1206 non-null   float64
 5   Hour              1206 non-null   float64
 6   Day               1206 non-null   int64  
 7   N310              1206 non-null   int64  
 8    N206             1206 non-null   int64  
 9   N206              1206 non-null   int64  
 10  N101              1206 non-null   int64  
 11  N309              1206 non-null   int64  
 12  IC5               1206 non-null   int64  
 13  N207              1206 non-null   int64  
 14  IC5               1206 non-null   int64  
 15  EM579             1206 non-null   int64  
 16  N105              1206 non-null   int64 

---
### DecisionTree

In [19]:
print("**DecisionTreeClassifier**")
clf_Tree = DecisionTreeClassifier(random_state=2022)
clf_Tree.fit(x,y)

print("Test Data...")
predictions_Tree = clf_Tree.predict(test)
print(predictions_Tree)

#k cross val
print("Training Data...")
scores = cross_val_score(clf_Tree,x,y)
print("Cross Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

**DecisionTreeClassifier**
Test Data...


Feature names unseen at fit time:
-  N310
- N101 



ValueError: X has 20 features, but DecisionTreeClassifier is expecting 18 features as input.