In [1]:
#https://medium.com/datadriveninvestor/choosing-the-best-algorithm-for-your-classification-model-7c632c78f38f
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
#https://www.dataquest.io/blog/sci-kit-learn-tutorial/
#https://datascience.stackexchange.com/questions/33256/how-to-apply-machine-learning-model-to-new-dataset

In [2]:
from fastai.tabular import *
import os
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn import svm, tree
import xgboost
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

from sklearn import model_selection
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
#from yellowbrick.classifier import ClassificationReport


In [3]:
path = Path('/Users/bbabu/fastai/DataSamples')

In [4]:
#For V3
data = pd.read_csv(path/'Output-5v4.csv', sep=',', header=0)


In [5]:
le = preprocessing.LabelEncoder()

#convert the categorical columns into numeric
data['GroupDN'] = le.fit_transform(data['GroupDN'])
data['Member'] = le.fit_transform(data['Member'])
data['BusinessGroup'] = le.fit_transform(data['BusinessGroup'])
data['Platform'] = le.fit_transform(data['Platform'])
#data['Title'] = le.fit_transform(data['Title'])
#data['Department'] = le.fit_transform(data['Department'])
data['Country'] = le.fit_transform(data['Country'])
#data['State'] = le.fit_transform(data['State'])
data['City'] = le.fit_transform(data['City'])
#display the initial records
#data.head()

In [6]:
#Create Dependent and Independent Datasets based on our Dependent #and Independent features
X = data[['GroupDN', 'Member', 'Manager', 'BusinessGroup', 'Platform', 'Country', 'City']]
y = data['Owner']
#Split the Data into Training and Testing sets with test size as #30%
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.5, shuffle=True)

In [7]:
classifiers = [
    xgboost.XGBClassifier(),
    svm.SVC(kernel="linear", C=0.025),
    svm.SVC(gamma=2, C=1),
    tree.DecisionTreeClassifier(max_depth=5),
    GaussianNB(priors=None, var_smoothing=1e-01),
    ComplementNB(alpha=3.0, class_prior=None, fit_prior=False, norm=False),
    KNeighborsClassifier(n_neighbors=3),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=6),]

In [8]:
model1 = xgboost.XGBClassifier()
classifiers.append(model1)

model2 = svm.SVC(kernel="linear", C=0.025, gamma=2)
classifiers.append(model2)

model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)

model4 = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=6)
classifiers.append(model4)

model5 = GaussianNB(priors=None, var_smoothing=1e-01)
classifiers.append(model5)

model6 = KNeighborsClassifier()
classifiers.append(model6)

model7 = ComplementNB(alpha=3.0, class_prior=None, fit_prior=False, norm=False)
classifiers.append(model7)

In [9]:
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of %s is %s"%(clf, acc))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix of %s is %s"%(clf, cm))

Accuracy of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1) is 0.9811965811965812
Confusion Matrix of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1) is [[5

Accuracy of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') is 0.9811965811965812
Confusion Matrix of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') is [[572   1]
 [ 10   2]]
Accuracy of ComplementNB(alpha=3.0, class_prior=None, fit_prior=False, norm=False) is 0.5675213675213675
Confusion Matrix of ComplementNB(alpha=3.0, class_prior=None, fit_prior=False, norm=False) is [[325 248]
 [  5   7]]


In [10]:
import pickle
#cart_model = tree.DecisionTreeClassifier() #may be
cart_model = GaussianNB(priors=[0.95, 0.05], var_smoothing=1e-01) #le-01 is final, this is better
#cart_model = GaussianNB(priors=None, var_smoothing=1e-01) #le-01 is final
#cart_model = ComplementNB(alpha=3.0, class_prior=None, fit_prior=True, norm=False) #do not use
#cart_model  = KNeighborsClassifier() #Do not use
# cart_model = RandomForestClassifier(max_depth=10, n_estimators=10, max_features=6, verbose=1) #may be
#cart_model = xgboost.XGBClassifier() #Do not use
#cart_model = svm.SVC(kernel="linear", C=0.025, gamma=2) #Do not use
cart_model.fit(X_train, y_train)

GaussianNB(priors=[0.95, 0.05], var_smoothing=0.1)

In [11]:
# Save model to disk
filename = 'Final_Model-Multi.sav'
pickle.dump(cart_model, open(filename, 'wb'))

In [12]:
# Load model from disk and use it to make new predictions
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
#print(result)

In [13]:
# Load test dataset
#final_predict = np.loadtxt(path/'Output-6-Test-V3.csv', delimiter=",")
df_test = pd.read_csv(path/'Output-6-Test-V4.csv', sep=',', header=0)

In [14]:
le = preprocessing.LabelEncoder()

#convert the categorical columns into numeric
df_test['GroupDN'] = le.fit_transform(df_test['GroupDN'])
df_test['Member'] = le.fit_transform(df_test['Member'])
df_test['BusinessGroup'] = le.fit_transform(df_test['BusinessGroup'])
df_test['Platform'] = le.fit_transform(df_test['Platform'])
#df_test['Title'] = le.fit_transform(df_test['Title'])
#df_test['Department'] = le.fit_transform(df_test['Department'])
df_test['Country'] = le.fit_transform(df_test['Country'])
#df_test['State'] = le.fit_transform(df_test['State'])
df_test['City'] = le.fit_transform(df_test['City'])
#display the initial records
#df_test.head()

In [15]:
X_train = df_test
pred = cart_model.predict(X_train)
print(pred)
print(result)
pred.sum()

[1 0 0 0 ... 1 1 0 0]
0.9794871794871794


2561

In [16]:
# feature_imp = pd.Series(cart_model.feature_importances_,index=df_test.columns.values).sort_values(ascending=False)
# feature_imp

AttributeError: 'GaussianNB' object has no attribute 'feature_importances_'

In [None]:
#df_test['Member']

In [None]:
#df_test['Member'] = le.inverse_transform(df_test['Member'])
final_df = pd.DataFrame({'GroupDN': df_test['GroupDN'],'Member': df_test['Member'], 'Manager': df_test['Manager'], 'Owner': pred})
final_df.to_csv(path/'MLProcessed1-Sklearn.csv', header=True, index=False)