In [5]:
import numpy as np 
import pandas as pd 

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
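# Three of scikit-learn's naive Bayes variants are imported here (Gaussian, Bernoulli, Multinomial); others such as ComplementNB and CategoricalNB also exist.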
from sklearn.datasets import fetch_covtype, fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn import metrics

In [3]:
# Toy naive Bayes example: four classes, three feature likelihoods per class.
prior = [0.45, 0.3, 0.15, 0.1]
likelihood = [
    [0.3, 0.3, 0.4],
    [0.7, 0.2, 0.1],
    [0.15, 0.5, 0.35],
    [0.6, 0.2, 0.2],
]

idx = 0  # 1-based position of the class being printed
# zip pairs the i-th prior with the i-th likelihood row (both have 4 entries);
# enumerate supplies the running position.
for idx, (class_prior, feature_likelihoods) in enumerate(zip(prior, likelihood), start=1):
    # Unnormalized posterior: product of the feature likelihoods times the prior.
    result = 1.0
    for feature_likelihood in feature_likelihoods:
        result = result * feature_likelihood
    result = result * class_prior
    print(f"{idx}th probability : {result}")

1th probability : 0.0162
2th probability : 0.0042
3th probability : 0.0039375
4th probability : 0.0024000000000000002


In [6]:
covtype = fetch_covtype() # Fetch the Forest Covertypes dataset through scikit-learn (retrieved online).
# Each sample is a 30x30 m patch of forest in the United States.
# Multiclass classification problem with seven classes.
# The label is the patch's cover type (dominant tree species), not the soil.
# 581,012 samples with 54 features each.
print(covtype.DESCR)

.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30×30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

    Classes                        7
    Samples total             581012
    Dimensionality                54
    Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``. If optional argument 'as_frame' is
set to 'True', it will return ``data`` and ``target

In [16]:
# Integer class label of every sample (one of the seven cover types).
# A bare `pd.DataFrame(covtype.data)` expression used to precede this line; a cell
# only displays its last expression, so that 581012x54 frame was built and thrown
# away. It was dropped; the features are inspected as a DataFrame in a later cell.
covtype.target

array([5, 5, 2, ..., 3, 3, 3])

In [20]:
# Split features and labels into training (80%) and test (20%) sets.
covtype_X = covtype.data
covtype_Y = covtype.target
# train_test_split shuffles the rows before splitting; pinning random_state makes
# the split (and every statistic or model derived from it) reproducible when the
# notebook is restarted and run from the top.
covtype_Xtrain, covtype_Xtest, covtype_Ytrain, covtype_Ytest = train_test_split(
    covtype_X, covtype_Y, test_size=0.2, random_state=42
)
print(f'size of data : {covtype_X.shape}')
print(f'size of training dataset = {covtype_Xtrain.shape}')
print(f'size of test dataset = {covtype_Xtest.shape}')

size of data : (581012, 54)
size of training dataset = (464809, 54)
size of test dataset = (116203, 54)


In [23]:
# Wrap the full feature matrix in a DataFrame so that describe() can report
# per-column summary statistics (count, mean, std, min, quartiles, max).
covtype_df = pd.DataFrame(covtype_X)
covtype_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,2959.365301,155.656807,14.103704,269.428217,46.418855,2350.146611,212.146049,223.318716,142.528263,1980.291226,...,0.044175,0.090392,0.077716,0.002773,0.003255,0.000205,0.000513,0.026803,0.023762,0.01506
std,279.984734,111.913721,7.488242,212.549356,58.295232,1559.25487,26.769889,19.768697,38.274529,1324.19521,...,0.205483,0.286743,0.267725,0.052584,0.056957,0.01431,0.022641,0.161508,0.152307,0.121791
min,1859.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2996.0,127.0,13.0,218.0,30.0,1997.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3858.0,360.0,66.0,1397.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
# Summary statistics of the training split, for comparison with the full data.
covtype_train_df = pd.DataFrame(covtype_Xtrain)
covtype_train_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,...,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0
mean,2959.576794,155.685847,14.099916,269.337326,46.418809,2350.432653,212.170928,223.33801,142.521292,1979.992943,...,0.044063,0.090745,0.077701,0.00279,0.003281,0.000215,0.000447,0.026893,0.02373,0.015041
std,279.790905,111.867186,7.485995,212.415502,58.190296,1559.683211,26.77605,19.751468,38.302085,1324.642919,...,0.205236,0.287246,0.2677,0.05275,0.057185,0.014666,0.021149,0.16177,0.152207,0.121714
min,1859.0,0.0,0.0,0.0,-166.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2996.0,127.0,13.0,218.0,30.0,1998.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3858.0,360.0,66.0,1390.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
# Summary statistics of the test split, for comparison with the training split.
covtype_test_df = pd.DataFrame(covtype_Xtest)
covtype_test_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,...,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0,116203.0
mean,2958.519333,155.540649,14.118852,269.791778,46.419043,2349.002453,212.046531,223.241543,142.556147,1981.484351,...,0.04462,0.088982,0.077778,0.002702,0.00315,0.000164,0.000775,0.026445,0.023889,0.015137
std,280.758321,112.100072,7.497234,213.084455,58.713347,1557.54652,26.745114,19.83736,38.164258,1322.40789,...,0.206469,0.284719,0.267823,0.051912,0.056034,0.012786,0.027819,0.160456,0.152705,0.1221
min,1866.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2808.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1025.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2995.0,127.0,13.0,218.0,29.0,1995.0,218.0,226.0,143.0,1712.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3163.0,261.0,18.0,390.0,68.0,3328.0,231.0,237.0,168.0,2552.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3851.0,360.0,62.0,1397.0,597.0,7082.0,254.0,254.0,253.0,7112.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# Standardize the features (zero mean, unit variance per column).
# StandardScaler and MinMaxScaler are the two most common scaling schemes;
# the standard scaler is used here.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test data are used for the scaling.
scaler = StandardScaler()
scaler.fit(covtype_Xtrain)
covtype_Xtrain_scale = scaler.transform(covtype_Xtrain)
covtype_Xtest_scale = scaler.transform(covtype_Xtest)

In [33]:
# Sanity check of the scaled training features: after standardization every
# column should have a mean of ~0 and a standard deviation of ~1.
covtype_Xtrain_df = pd.DataFrame(data=covtype_Xtrain_scale)
covtype_Xtrain_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,...,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0
mean,-2.398341e-16,-2.4749280000000002e-17,1.027882e-16,1.22875e-16,-2.4061370000000003e-17,-1.029105e-16,-5.247795e-16,-5.24275e-16,-3.349025e-16,5.975598e-17,...,-5.220431e-18,-9.115500000000001e-17,1.900145e-17,-2.6598980000000002e-17,-2.060656e-17,1.0395e-18,1.8344119999999997e-19,2.8387530000000005e-17,7.031913e-18,5.01406e-17
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-3.933573,-1.391704,-1.883508,-1.267975,-3.65042,-1.506995,-7.923916,-11.30743,-3.720984,-1.494739,...,-0.2146959,-0.3159133,-0.2902529,-0.05289803,-0.05737349,-0.0146693,-0.02115883,-0.1662407,-0.155907,-0.1235729
25%,-0.5381767,-0.8732315,-0.6812617,-0.7595373,-0.6774128,-0.7978761,-0.5292395,-0.5234052,-0.6141002,-0.7216994,...,-0.2146959,-0.3159133,-0.2902529,-0.05289803,-0.05737349,-0.0146693,-0.02115883,-0.1662407,-0.155907,-0.1235729
50%,0.1301802,-0.256428,-0.14693,-0.2416838,-0.2821575,-0.2259645,0.2176975,0.1347744,0.01249825,-0.2038234,...,-0.2146959,-0.3159133,-0.2902529,-0.05289803,-0.05737349,-0.0146693,-0.02115883,-0.1662407,-0.155907,-0.1235729
75%,0.7270552,0.9324831,0.5209845,0.5398043,0.3880581,0.6267737,0.7032065,0.6916957,0.6652049,0.4303104,...,-0.2146959,-0.3159133,-0.2902529,-0.05289803,-0.05737349,-0.0146693,-0.02115883,-0.1662407,-0.155907,-0.1235729
max,3.211056,1.826401,6.932964,5.27581,9.530486,3.056116,1.562184,1.552392,2.910516,3.920311,...,4.657751,3.165425,3.445271,18.9043,17.42965,68.16957,47.2616,6.015374,6.414081,8.09239


In [34]:
# 20 Newsgroups text dataset.
# Task: classify each newsgroup post into one of the 20 topics.
# NOTE(review): called with default arguments; per the scikit-learn docs this
# loads only the training subset, not all 18846 posts - confirm that is intended.
newsgroup = fetch_20newsgroups()
print(newsgroup.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [42]:
# The raw text needs further preprocessing (e.g. vectorization) before modelling.
# Print the number of target classes in the dataset.
print(len(newsgroup.target_names))

20
