https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn

In [1]:
import pandas as pd
import numpy as np

#### Use naive Bayes

#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset


In [2]:
# Import train_test_split function, use model_selection instead of cross_validation
from sklearn.model_selection import train_test_split

# Import preprocessing modules
from sklearn import preprocessing

In [29]:
from sklearn.preprocessing import OneHotEncoder

# Dense (non-sparse) one-hot encoder; the same object is refit for each
# feature column further down.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the
# legacy one on older releases, which reject the unknown keyword with a
# TypeError.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

#### Datasets and DataFrames are different. Convert the wine dataset to a wine DataFrame below
##### np.c_ is numpy's column-wise concatenation helper (it is indexed with square brackets, not called like a function),
##### which is used to join the wine['data'] and wine['target'] arrays side by side.
##### For the pandas `columns` argument: concatenate the wine['feature_names'] list
##### with a list of strings (in this case a single string); you can name this column anything you'd like.
##### The original dataset would probably call this ['something...']

In [8]:
# Load the example input file (five columns at the moment).
# The frame holds every row; nothing has been split into train/test yet.
rdata = pd.read_csv('exampleinput.csv')
rdata.dtypes.iloc[:5]  # dtype sanity check on the leading columns

year          int64
quarter       int64
yield       float64
fed_rate    float64
R             int64
dtype: object

In [9]:
# R is the target: remove it from the feature frame and keep its values.
# pop() mutates rdata in place, so a second run of this cell would raise
# KeyError; the guard makes the cell safe to re-run (r_value from the first
# run is simply kept).
if 'R' in rdata.columns:
    r_value = rdata.pop('R').values

#### For now just working with one column of 'yield'

In [10]:
# Frequency table of the yield column (one entry per distinct value)
yvalues = rdata.loc[:, 'yield'].value_counts()
# yvalues  # uncomment to inspect; the author noted 149 unique values

In [11]:
# Keep yield as a single-column DataFrame (2-D) instead of a 1-D Series
y_data = rdata.loc[:, ['yield']].copy()
# y_data.ndim  # uncomment to check; evaluates to 2

#### Import the estimator we want from the module it’s located in
#### Instantiate the estimator, possibly changing its defaults
#### Fit the estimator to the data. Possibly transform the data to its new space if need be

In [12]:
# Learn the one-hot categories from the yield column, then encode it
y_data_transformed = ohe.fit(y_data).transform(y_data)
# y_data_transformed  # uncomment to inspect the encoded matrix

In [13]:
y_data_transformed.shape # (rows, one-hot columns): 149 rows, 4 columns in the recorded run

(149, 4)

In [14]:
# First encoded row, kept for a quick spot check
row0 = y_data_transformed[0, :]
# row0  # uncomment to display

In [15]:
y_data.values[0] # first row of the untransformed yield column (presumably to compare with row0)

array([ 0.92866667])

Let's analyze on the basis of just one set of values (the yield column)

In [16]:
# Partition the encoded yield feature and the target into
# 70% training / 30% test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    y_data_transformed,
    r_value,
    test_size=0.3,
    random_state=109,
)

In [17]:
# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Build the classifier and train it on the training split in one step
# (fit() returns the estimator itself)
gnb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test rows
y_pred = gnb.predict(X_test)

In [18]:
# scikit-learn's metrics module provides accuracy_score
from sklearn import metrics

# Fraction of test rows whose predicted label matches the true label
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.644444444444


In [55]:
# So the yield curve predicts R with about 64% accuracy on the test split?
# Let's try the fed funds rate next...

#### Do transform for fed_rate

In [25]:
# Single-column (2-D) copy of the fed_rate feature
fed_data = rdata.loc[:, ['fed_rate']].copy()
fed_data.ndim  # evaluates to 2

2

In [26]:
# Preview the first ten rows of the fed_rate feature
fed_data[:10]

Unnamed: 0,fed_rate
0,14.226667
1,14.513333
2,11.006667
3,9.286667
4,8.653333
5,8.803333
6,9.46
7,9.43
8,9.686667
9,10.556667


In [30]:
# Re-read the CSV into rdata: the earlier rdata.pop('R') removed the target
# column in place, and the next cell pops 'R' again.
rdata = pd.read_csv('exampleinput.csv')

In [31]:
# Take the target column out of the freshly loaded frame. pop() mutates
# rdata in place, so the guard keeps this cell from raising KeyError when
# it is run a second time (the existing r_value is kept in that case).
if 'R' in rdata.columns:
    r_value = rdata.pop('R').values

In [32]:
# Refit the shared encoder on fed_rate (this replaces whatever categories
# it learned before) and encode the column
fed_data_transformed = ohe.fit(fed_data).transform(fed_data)

In [33]:
fed_data_transformed.shape # 149 rows, 13 columns in the recorded run

(149, 13)

In [34]:
# Display the one-hot encoded fed_rate matrix
fed_data_transformed

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       ..., 
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

In [35]:
# 70/30 train/test partition of the encoded fed_rate feature and the target,
# using the same seed as the yield split
A_train, A_test, b_train, b_test = train_test_split(
    fed_data_transformed,
    r_value,
    test_size=0.3,
    random_state=109,
)

In [36]:
# Refit the existing classifier on the fed_rate training split and predict
# labels for the fed_rate test split (fit() returns the estimator itself)
b_pred = gnb.fit(A_train, b_train).predict(A_test)

In [37]:
# Share of fed_rate test rows that were classified correctly
print("Accuracy:", metrics.accuracy_score(y_true=b_test, y_pred=b_pred))

Accuracy: 0.355555555556


#### Let's try BDI

In [42]:
# Load the BDI dataset
bdata = pd.read_csv('bdi_data.csv')
bdata.dtypes.iloc[:5]  # dtype sanity check on the leading columns

year     int64
month    int64
bdi      int64
r        int64
dtype: object

In [43]:
# Target values that go with the BDI rows. pop() removes the 'r' column from
# bdata in place, so the guard keeps this cell from raising KeyError when it
# is run a second time (the existing rforb is kept in that case).
if 'r' in bdata.columns:
    rforb = bdata.pop('r').values

In [45]:
# Single-column (2-D) copy of the bdi feature
bdi = bdata.loc[:, ['bdi']].copy()
bdi.ndim  # evaluates to 2

2

In [46]:
# Refit the shared encoder on the bdi column and encode it
bdi_transformed = ohe.fit(bdi).transform(bdi)

In [47]:
bdi_transformed.shape # 169 rows, 164 one-hot columns in the recorded run -- almost one column per row

(169, 164)

In [48]:
#Import Gaussian Naive Bayes model
# (already imported in an earlier cell; repeating the import is harmless)
from sklearn.naive_bayes import GaussianNB
#Create a Gaussian Classifier
# Rebinds gnb to a fresh, unfitted classifier, discarding the previously fitted one
gnb = GaussianNB()

In [52]:
# 70/30 train/test partition of the encoded bdi feature and its targets,
# with a fixed seed so the split is repeatable
BDI_train, BDI_test, r_train, r_test = train_test_split(
    bdi_transformed,
    rforb,
    test_size=0.3,
    random_state=109,
)

In [53]:
# Fit the classifier on the BDI training split and predict labels for the
# BDI test split (fit() returns the estimator itself)
bdi_pred = gnb.fit(BDI_train, r_train).predict(BDI_test)

In [54]:
# scikit-learn's metrics module provides accuracy_score
from sklearn import metrics

# Share of BDI test rows that were classified correctly
print("Accuracy:", metrics.accuracy_score(y_true=r_test, y_pred=bdi_pred))

Accuracy: 0.313725490196


In [None]:
#### BDI is only a 31% correct predictor of R

# IGNORE BELOW

#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
wine = datasets.load_wine()

# Build a DataFrame from the dataset: np.c_ places the feature matrix and
# the target vector side by side, and the column labels are the feature
# names followed by 'target'.
wine_data = pd.DataFrame(data=np.c_[wine['data'], wine['target']],
                         columns=wine['feature_names'] + ['target'])

# Write the frame to an Excel file without the row index. to_excel() returns
# None, so its result is not assigned to anything.
wine_data.to_excel(r'wine_testing.xlsx', index=False, header=True)

# print the names of the 13 features
print("Features: ", wine.feature_names)

# print the label type of wine(class_0, class_1, class_2)
print("Labels: ", wine.target_names)
# dimensions of the feature matrix, its first two rows, and the full target vector
print(wine.data.shape)
print(wine.data[0:2])
print(wine.target)

# train_test_split lives in model_selection (cross_validation is the old location)
from sklearn.model_selection import train_test_split

# 70/30 train/test partition of the wine features and labels, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=109,
)

# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Create the classifier and fit it on the training split in one step
# (fit() returns the estimator itself)
gnb = GaussianNB().fit(X_train, y_train)

# Predicted classes for the test split
y_pred = gnb.predict(X_test)