In [51]:
import os 
import json
import numpy as np
import pandas as pd
import dill as pickle
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [46]:
# Load the warranty data and keep only the columns used for modelling.
# The explicit .copy() is taken right at the column selection so that the
# column assignments below operate on an independent frame (avoids pandas'
# SettingWithCopyWarning on the first assignment).
data = pd.read_csv('./data/WarrantyData.csv')
data = data[['DateTime', 'DeviceCategory', 'Noofdevices']].copy()

# Parse the date string and expand it into numeric calendar features.
data['DateTime'] = pd.to_datetime(data['DateTime'], format='%Y-%m-%d')
data['Year'] = data['DateTime'].dt.year
data['Month'] = data['DateTime'].dt.month
data['Day'] = data['DateTime'].dt.day

# The raw timestamp is no longer needed once its parts are extracted.
data = data.drop(columns=['DateTime'])

# Replace the category labels with integer codes; `encoder` is kept so the
# same mapping can be reused (e.g. encoder.inverse_transform) later.
encoder = preprocessing.LabelEncoder()
data['DeviceCategory'] = encoder.fit_transform(data['DeviceCategory'])

data.head()

Unnamed: 0,DeviceCategory,Noofdevices,Year,Month,Day
0,1,42,2016,12,8
1,2,29,2016,12,8
2,4,33,2016,12,8
3,0,23,2016,12,8
4,3,0,2016,12,8


In [47]:
# Split data into roughly 80% train / 20% test using a random boolean row mask.
SPLIT_SEED = 42  # fixed seed so the split is identical on every Restart & Run All
split_rng = np.random.RandomState(SPLIT_SEED)

# The mask must be sized from `data` (the frame prepared above); the previous
# `len(df)` referenced a name that this notebook does not define.
msk = split_rng.rand(len(data)) < 0.8
traindata = data[msk]
testdata = data[~msk]

# Every row lands in exactly one of the two partitions.
assert len(traindata) + len(testdata) == len(data)

print('Traning Data => ',len(traindata))
print('Test Data => ',len(testdata))

Traning Data =>  2200
Test Data =>  540


In [83]:
# Features / target for the regression model.
feature_names = ['DeviceCategory', 'Year', 'Month', 'Day']
X_train = traindata[feature_names]
y_train = traindata['Noofdevices']

X_test = testdata[feature_names]
y_test = testdata['Noofdevices']

# Apply min-max scaling. The scaler is fitted on the training split only and
# then applied to the test split, so no test information leaks into the fit.
# (These lines were previously indented at top level, which is a syntax error.)
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit Bayesian ridge regression and predict on the held-out rows.
classifier = BayesianRidge(compute_score=True)
y_pred = classifier.fit(X_train, y_train).predict(X_test)
# print('Accuracy of Bayesian regression classifier on training set: {:.2f}'.format(classifier.score(X_train, y_train)))
# print('Accuracy of Bayesian regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)), end="\n\n")
print(y_pred)
y_test.head()

[30.00463046 29.92203616 30.07830543 29.99571112 30.19327755 30.1519804
 30.14306106 30.29933033 30.38192464 30.4555996  30.40538311 30.63532735
 30.62640801 30.70008297 30.8976494  30.80613575 30.97132437 30.8474329
 31.00370218 30.95348568 31.25710489 31.3396992  31.2894827  31.51942694
 31.39553548 31.43683263 31.63439906 28.95924347 28.91794632 28.99162128
 28.98270194 29.14789056 29.13897121 29.13005187 29.3276183  29.31869896
 29.58994035 29.58102101 29.66361532 29.65469598 29.73729028 29.72837094
 29.68707379 29.84334306 30.03199015 30.02307081 30.10566511 30.13804293
 30.17934008 30.1615014  30.32669001 30.23517636 30.39144564 30.42382345
 30.72744266 30.64484835 30.67722616 30.71852331 30.87479259 30.83349543
 30.86587325 30.89825106 31.06343967 28.87931907 29.04450769 28.92061623
 29.0768855  28.99429119 29.17401894 29.66628523 29.70758238 29.77233801
 29.85493232 29.92860728 29.80471582 29.99336291 30.18200999 30.05811853
 30.29698211 30.24676562 30.3528184  30.50908767 30.5

6     12
7      5
11    52
12    39
15    36
Name: Noofdevices, dtype: int64

In [66]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; entry [i, j] is drawn at row i (true label) and
        column j (predicted label).
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, each row is divided by its row sum before printing and
        plotting. Rows whose sum is zero are left as all zeros instead of
        producing NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None. The matrix is printed and drawn; call ``plt.show()`` to display it.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guarded division: empty rows stay 0 rather than becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, col) pair in row-major order; it replaces
    # itertools.product, which relied on an `itertools` import that the
    # notebook's import cell does not provide.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [76]:
def build_and_train():
    """
    Fit a Bayesian ridge regression on the training split and report its fit.

    Reads the notebook-level ``traindata`` and ``testdata`` frames, uses the
    columns DeviceCategory / Year / Month / Day as features and
    ``Noofdevices`` as the target, and prints the model's score on both
    splits.

    Note: for a scikit-learn regressor, ``score`` is the coefficient of
    determination (R^2), not a classification accuracy, even though the
    printed messages say "Accuracy".

    Returns
    -------
    BayesianRidge
        The fitted estimator.
    """
    # Declare features and label.
    feature_names = ['DeviceCategory', 'Year', 'Month', 'Day']
    X_train = traindata[feature_names]
    y_train = traindata['Noofdevices']

    X_test = testdata[feature_names]
    y_test = testdata['Noofdevices']

    # Feature scaling (MinMaxScaler) is intentionally not applied here; the
    # model is fitted on the raw feature values.

    # Bayesian Ridge Regression
    classifier = BayesianRidge(compute_score=True)
    classifier.fit(X_train, y_train)
    print('Accuracy of Bayesian regression classifier on training set: {:.2f}'.format(classifier.score(X_train, y_train)))
    print('Accuracy of Bayesian regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)), end="\n\n")

    return classifier
    
# Train the model when this code is executed as the main module and keep the
# fitted estimator in `model` for later cells.
if __name__ == '__main__':
    model = build_and_train()

# Disabled persistence step: when enabled, it would serialize the trained
# model to ./model/model_v1.pk via `pickle` (this notebook imports dill under
# that name).
#     filename = './model/model_v1.pk'
#     with open(filename, 'wb') as file:
#         pickle.dump(model, file)