In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import statistics
from scipy import stats
import math

In [43]:
df = pd.read_csv('../../dados/credit_data.csv')

In [4]:
df.dropna(inplace=True)
df.shape

(1997, 5)

In [5]:
df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [6]:
X = df.iloc[:, 1:4].values
y = df.iloc[:, -1].values

In [29]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [30]:
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

naive_bayes = GaussianNB()
scores = cross_val_score(naive_bayes, X, y, cv=kfold)

In [31]:
scores.mean()

0.9249170854271356

In [32]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

In [33]:
naive = GaussianNB()
naive.fit(X_treino, y_treino)
previsoes = naive.predict(X_teste)
accuracy_score(previsoes, y_teste)

0.925

### Seleção de atributos utilizando variância

In [38]:
selecao = VarianceThreshold(threshold=0.027)

In [39]:
X_novo = selecao.fit_transform(X)
X_novo

array([[0.9231759 , 0.58883739],
       [0.28812165, 0.47682695],
       [0.74633429, 0.58262011],
       ...,
       [0.48612202, 0.40112895],
       [0.47500998, 0.1177903 ],
       [0.98881367, 0.53597028]])

In [37]:
np.var(X[0]), np.var(X[1]), np.var(X[2])

(0.027646353650092187, 0.05741515340722347, 0.028389480276199003)

In [40]:
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

naive_bayes = GaussianNB()
scores = cross_val_score(naive_bayes, X_novo, y, cv=kfold)

In [41]:
scores.mean()

0.8472763819095477

### Valores faltantes com média e moda

#### Média

In [44]:
df.isnull().sum()

i#clientid    0
income        0
age           3
loan          0
c#default     0
dtype: int64

In [45]:
nulos = df[df.isnull().any(axis=1)]
nulos

Unnamed: 0,i#clientid,income,age,loan,c#default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [46]:
df['age'].mean()

40.80755937840458

In [47]:
df['age'] = df['age'].replace(to_replace = np.nan, value = df['age'].mean())

In [48]:
df.isnull().sum()

i#clientid    0
income        0
age           0
loan          0
c#default     0
dtype: int64

#### Moda

In [50]:
df = pd.read_csv('../../dados/autos.csv', encoding = 'ISO-8859-1')
df.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [51]:
df.isnull().sum()

dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            37869
yearOfRegistration         0
gearbox                20209
powerPS                    0
model                  20484
kilometer                  0
monthOfRegistration        0
fuelType               33386
brand                      0
notRepairedDamage      72060
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64

In [52]:
df.shape

(371528, 20)

In [53]:
df['fuelType'].unique()

array(['benzin', 'diesel', nan, 'lpg', 'andere', 'hybrid', 'cng',
       'elektro'], dtype=object)

In [56]:
stats.mode(df['fuelType'])

ModeResult(mode=array(['benzin'], dtype=object), count=array([223857]))

In [57]:
statistics.mode(df['fuelType'])

'benzin'

In [58]:
df['fuelType'] = df['fuelType'].replace(to_replace=np.nan, value = statistics.mode(df['fuelType']))

In [59]:
df['fuelType'].unique()

array(['benzin', 'diesel', 'lpg', 'andere', 'hybrid', 'cng', 'elektro'],
      dtype=object)