In [1]:
from sklearn import model_selection
import pandas as pd
from sklearn import metrics
from sklearn.naive_bayes import  BernoulliNB, MultinomialNB, GaussianNB
from sklearn.preprocessing import OneHotEncoder
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Load the raw observations; the path is relative to the notebook's working directory.
weather = pd.read_csv('weatherAUS.csv')
# Encode the 'No'/'Yes' string labels as 0/1 wherever they occur in the frame.
weather = weather.replace(to_replace=['No', 'Yes'], value=[0, 1])

# One-hot encode the categorical features.
# The encoder is left at its default (sparse) output and densified explicitly:
# the `sparse=False` keyword was renamed to `sparse_output` in newer scikit-learn
# releases, so this spelling runs on both old and new versions.
categorical_features = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
onehot_encoder = OneHotEncoder()
categorical_columns = pd.DataFrame(
    onehot_encoder.fit_transform(weather[categorical_features]).toarray(),
    index=weather.index,  # explicit alignment for the join below
).add_prefix('onehot_')  # string labels: newer scikit-learn rejects mixed int/str feature names

# Drop the raw date and the categorical columns now represented by the one-hot frame.
weather = weather.drop(['Date'] + categorical_features, axis=1)

# Min-max normalisation of the remaining columns (the 0/1 target keeps its values).
# NOTE(review): min/max are computed on the full data set, i.e. before the
# train/test split below, so test rows influence the scaling. Fitting the scaling
# on the training part only would avoid this leakage; left unchanged here so the
# recorded metrics stay reproducible.
weather = (weather - weather.min()) / (weather.max() - weather.min())
weather = weather.join(categorical_columns)

# Keep only complete rows (reassignment instead of in-place mutation).
weather = weather.dropna(axis=0)

X = weather.drop(['RainTomorrow'], axis=1)
# 1-D target: a single-column DataFrame makes every estimator's fit() emit a
# column-vector conversion warning.
y = weather['RainTomorrow']

# NOTE(review): shuffle=False keeps the file's row order, so the test part is the
# last 25% of rows — confirm this is the intended evaluation scheme.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, shuffle=False)

In [2]:
%%time
# Fit a Bernoulli naive Bayes classifier on the training part and report
# accuracy / precision / recall / F1 on the test part.
classifier = BernoulliNB()
# fit() expects a 1-D label array; flattening avoids the column-vector
# conversion warning when y_train arrives as a single-column frame.
classifier.fit(X_train, np.ravel(y_train))
prediction = classifier.predict(X_test)

y_true = np.ravel(y_test)  # flattened once, reused by every metric below
print('BernoulliNB:')
print('accuracy: ', np.mean(prediction == y_true))
print('precision: ', metrics.precision_score(y_true, prediction))
print('recall: ', metrics.recall_score(y_true, prediction))
print('f1: ', metrics.f1_score(y_true, prediction))

BernoulliNB:
accuracy:  0.7789024306272808
precision:  0.43925944712148113
recall:  0.6339677891654466
f1:  0.5189513108614232
Wall time: 146 ms


  return f(*args, **kwargs)


In [3]:
%%time
# Fit a multinomial naive Bayes classifier on the training part and report
# accuracy / precision / recall / F1 on the test part.
classifier = MultinomialNB()
# fit() expects a 1-D label array; flattening avoids the column-vector
# conversion warning when y_train arrives as a single-column frame.
classifier.fit(X_train, np.ravel(y_train))
prediction = classifier.predict(X_test)

y_true = np.ravel(y_test)  # flattened once, reused by every metric below
print('MultinomialNB:')
print('accuracy: ', np.mean(prediction == y_true))
print('precision: ', metrics.precision_score(y_true, prediction))
print('recall: ', metrics.recall_score(y_true, prediction))
print('f1: ', metrics.f1_score(y_true, prediction))

  return f(*args, **kwargs)


MultinomialNB:
accuracy:  0.7898505818357089
precision:  0.4632183908045977
recall:  0.7375549048316252
f1:  0.5690482914430952
Wall time: 60.8 ms


In [4]:
%%time
# Fit a Gaussian naive Bayes classifier on the training part and report
# accuracy / precision / recall / F1 on the test part.
classifier = GaussianNB()
# fit() expects a 1-D label array; flattening avoids the column-vector
# conversion warning when y_train arrives as a single-column frame.
classifier.fit(X_train, np.ravel(y_train))
prediction = classifier.predict(X_test)

y_true = np.ravel(y_test)  # flattened once, reused by every metric below
print('GaussianNB:')
print('accuracy: ', np.mean(prediction == y_true))
print('precision: ', metrics.precision_score(y_true, prediction))
print('recall: ', metrics.recall_score(y_true, prediction))
print('f1: ', metrics.f1_score(y_true, prediction))

  return f(*args, **kwargs)


GaussianNB:
accuracy:  0.8202850650692006
precision:  0.5159853249475891
recall:  0.7207174231332357
f1:  0.6014050091631032
Wall time: 176 ms


In [5]:
%%time
# Fit an L2-regularised logistic regression on the training part and report
# accuracy / precision / recall / F1 on the test part.
# max_iter is raised to 5000 to give the solver more iterations.
classifier = LogisticRegression(penalty='l2', max_iter=5000)
# fit() expects a 1-D label array; flattening avoids the column-vector
# conversion warning when y_train arrives as a single-column frame.
classifier.fit(X_train, np.ravel(y_train))
prediction = classifier.predict(X_test)

y_true = np.ravel(y_test)  # flattened once, reused by every metric below
print('LogisticRegression:')
print('accuracy: ', np.mean(prediction == y_true))
print('precision: ', metrics.precision_score(y_true, prediction))
print('recall: ', metrics.recall_score(y_true, prediction))
print('f1: ', metrics.f1_score(y_true, prediction))

  return f(*args, **kwargs)


LogisticRegression:
accuracy:  0.8801211870825587
precision:  0.7574025974025974
recall:  0.5336749633967789
f1:  0.6261541765084818
Wall time: 1.87 s


In [6]:
%%time
# Fit a 1-nearest-neighbour classifier on the training part and report
# accuracy / precision / recall / F1 on the test part.
neigh = KNeighborsClassifier(n_neighbors=1)
# fit() expects a 1-D label array; flattening avoids the column-vector
# conversion warning when y_train arrives as a single-column frame.
neigh.fit(X_train, np.ravel(y_train))
prediction = neigh.predict(X_test)

y_true = np.ravel(y_test)  # flattened once, reused by every metric below
print('KNeighborsClassifier:')
print('accuracy: ', np.mean(prediction == y_true))
print('precision: ', metrics.precision_score(y_true, prediction))
print('recall: ', metrics.recall_score(y_true, prediction))
print('f1: ', metrics.f1_score(y_true, prediction))

  return self._fit(X, y)


KNeighborsClassifier:
accuracy:  0.7997658885905116
precision:  0.4623287671232877
recall:  0.3953147877013177
f1:  0.4262036306235201
Wall time: 9.02 s


Выводы: Так как выборка несбалансирована, модели показывают достаточно высокую долю правильных ответов (accuracy), однако остальные метрики (precision, recall, F1) показывают, что модели на самом деле несовершенны. Наивные байесовские классификаторы отличаются быстрой работой, а их результаты зависят от типа признаков. Логистическая регрессия показывает наилучший результат из представленных моделей при небольшом времени работы. Метод ближайших соседей — самый медленный из рассмотренных и при этом показывает не самый лучший результат.