In [225]:
import pandas as pd
from sklearn.utils import shuffle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

# Read Data

In [226]:
df = pd.read_excel("CRDM_Data_Load.xlsx")

## Combining Description and Value

In [227]:
df['Des_Val_combined']=df.apply(lambda x:'%s %s' % (x['DESCRIPTION'],x['VALUE_ID']),axis=1)
#df

# Data prepration for  Class 1

In [228]:
Class_1_df=df.filter(['Des_Val_combined','Class_1'], axis=1)
Class_1_List= Class_1_df.values.tolist()
Class_1_List= shuffle(Class_1_List)
#Class_1_List

# Splitting for training and testing

In [229]:
size = int(len(Class_1_List) * 0.20)
train_set, test_set = Class_1_List[size:], Class_1_List[:size]
print('Train:',len(train_set),'Test:',len(test_set))
train_set=Class_1_List

Train: 1568 Test: 392


# Function for Feature Extraction

In [230]:
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    return " ".join(filtered_words)

def document_features(post):
    cleansed_post=preprocess(post)
    features = {}
    for word in nltk.word_tokenize(cleansed_post):
          features['contains({})'.format(word.lower())] = True
    return features

### Converting data to Feature Vectors

In [231]:
train_featuresets = [(document_features(Value), class1) for (Value,class1) in train_set]
test_featuresets = [(document_features(Value), class1) for (Value,class1) in test_set]

# Train Using NaiveBayesClassifier


In [232]:
NaiveBayesClassifier = nltk.NaiveBayesClassifier.train(train_featuresets)

## Checking acuracy on Testing

In [233]:
print("NaiveBayesClassifier accuracy percent:", (nltk.classify.accuracy(NaiveBayesClassifier, test_featuresets))*100)

NaiveBayesClassifier accuracy percent: 6.887755102040815


## Other Classifier results

In [234]:
#MultinomialNB
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_featuresets)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test_featuresets))*100)

#BernoulliNB
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_featuresets)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, test_featuresets))*100)

#LogisticRegression
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_featuresets)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_featuresets))*100)

#Stochastic Gradient Descent 
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_featuresets)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_featuresets))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(train_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_set))*100)

#Linear Support Vector 
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_featuresets)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_featuresets))*100)

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(train_featuresets)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test_featuresets))*100)

MNB_classifier accuracy percent: 95.40816326530613
BernoulliNB_classifier accuracy percent: 76.78571428571429
LogisticRegression_classifier accuracy percent: 97.1938775510204
SGDClassifier_classifier accuracy percent: 98.21428571428571
LinearSVC_classifier accuracy percent: 98.72448979591837


## Pickle the Classifier

In [235]:
output = open('pickled/C1_NaiveBayesClassifier.pkl', 'wb')
pickle.dump(NaiveBayesClassifier, output)

output = open('pickled/C1_MNB_classifier.pkl', 'wb')
pickle.dump(MNB_classifier, output)

output = open('pickled/C1_BernoulliNB_classifier.pkl', 'wb')
pickle.dump(BernoulliNB_classifier, output)

output = open('pickled/C1_LogisticRegression_classifier.pkl', 'wb')
pickle.dump(LogisticRegression_classifier, output)

output = open('pickled/C1_SGDClassifier_classifier.pkl', 'wb')
pickle.dump(SGDClassifier_classifier, output)

output = open('pickled/C1_LinearSVC_classifier.pkl', 'wb')
pickle.dump(LinearSVC_classifier, output)

### Testing

In [236]:
SampleText='Singapore Dollar'

In [237]:
featuresets_SampleText= document_features(SampleText)

In [238]:
NaiveBayesClassifier.classify(featuresets_SampleText)

'General'

## Data Prepration for next Classifier Calss 2 'Persona'

In [239]:
Persona_df=df.loc[df['Class_1'] == 'Persona']
Class_2_df=Persona_df.filter(['Des_Val_combined','Class_2'], axis=1)
Persona_List= Class_2_df.values.tolist()
Persona_List= shuffle(Persona_List)
#Persona_List

In [240]:
size = int(len(Persona_List) * 0.20)
train_set, test_set = Persona_List[size:], Persona_List[:size]
train_set=Persona_List

In [241]:
train_featuresets = [(document_features(Value), class2) for (Value,class2) in train_set]
test_featuresets = [(document_features(Value), class2) for (Value,class2) in test_set]

# Traingn and Testing

In [242]:
NaiveBayesClassifier = nltk.NaiveBayesClassifier.train(train_featuresets)
print("NaiveBayesClassifier accuracy percent:", (nltk.classify.accuracy(NaiveBayesClassifier, test_featuresets))*100)

NaiveBayesClassifier accuracy percent: 36.734693877551024


In [243]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_featuresets)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test_featuresets))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_featuresets)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, test_featuresets))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_featuresets)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_featuresets))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_featuresets)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_featuresets))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(train_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_featuresets)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_featuresets))*100)

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(train_featuresets)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test_featuresets))*100)

MNB_classifier accuracy percent: 91.83673469387756
BernoulliNB_classifier accuracy percent: 73.46938775510205
LogisticRegression_classifier accuracy percent: 93.87755102040816
SGDClassifier_classifier accuracy percent: 91.83673469387756
LinearSVC_classifier accuracy percent: 95.91836734693877


In [244]:
output = open('pickled/C2_Persona_NaiveBayesClassifier.pkl', 'wb')
pickle.dump(NaiveBayesClassifier, output)

output = open('pickled/C2_Persona_MNB_classifier.pkl', 'wb')
pickle.dump(MNB_classifier, output)

output = open('pickled/C2_Persona_BernoulliNB_classifier.pkl', 'wb')
pickle.dump(BernoulliNB_classifier, output)

output = open('pickled/C2_Persona_LogisticRegression_classifier.pkl', 'wb')
pickle.dump(LogisticRegression_classifier, output)

output = open('pickled/C2_Persona_SGDClassifier_classifier.pkl', 'wb')
pickle.dump(SGDClassifier_classifier, output)

output = open('pickled/C2_Persona_LinearSVC_classifier.pkl', 'wb')
pickle.dump(LinearSVC_classifier, output)

# For General

In [245]:
General_df=df.loc[df['Class_1'] == 'General']
Class_2_df=General_df.filter(['Des_Val_combined','Class_2'], axis=1)
General_List= Class_2_df.values.tolist()
General_List= shuffle(General_List)
#Persona_List
size = int(len(General_List) * 0.20)
train_set, test_set = General_List[size:], General_List[:size]
train_set=General_List
train_featuresets = [(document_features(Value), class2) for (Value,class2) in train_set]
test_featuresets = [(document_features(Value), class2) for (Value,class2) in test_set]
NaiveBayesClassifier = nltk.NaiveBayesClassifier.train(train_featuresets)
print("NaiveBayesClassifier accuracy percent:", (nltk.classify.accuracy(NaiveBayesClassifier, test_featuresets))*100)
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_featuresets)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test_featuresets))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_featuresets)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, test_featuresets))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_featuresets)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_featuresets))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_featuresets)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_featuresets))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(train_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_featuresets)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_featuresets))*100)

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(train_featuresets)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test_featuresets))*100)

NaiveBayesClassifier accuracy percent: 92.5925925925926
MNB_classifier accuracy percent: 91.85185185185185
BernoulliNB_classifier accuracy percent: 57.03703703703704
LogisticRegression_classifier accuracy percent: 97.77777777777777
SGDClassifier_classifier accuracy percent: 100.0
LinearSVC_classifier accuracy percent: 100.0


In [246]:
output = open('pickled/C2_General_NaiveBayesClassifier.pkl', 'wb')
pickle.dump(NaiveBayesClassifier, output)

output = open('pickled/C2_General_MNB_classifier.pkl', 'wb')
pickle.dump(MNB_classifier, output)

output = open('pickled/C2_General_BernoulliNB_classifier.pkl', 'wb')
pickle.dump(BernoulliNB_classifier, output)

output = open('pickled/C2_General_LogisticRegression_classifier.pkl', 'wb')
pickle.dump(LogisticRegression_classifier, output)

output = open('pickled/C2_General_SGDClassifier_classifier.pkl', 'wb')
pickle.dump(SGDClassifier_classifier, output)

output = open('pickled/C2_General_LinearSVC_classifier.pkl', 'wb')
pickle.dump(LinearSVC_classifier, output)

# Testing and training Class 3

In [247]:
Personal_df=df.loc[df['Class_2']=='Personal']
#Personal_df=df.loc[df['Class_1']!='Domine_Specific']
Class_2_df=Personal_df.filter(['Des_Val_combined','Class_3'], axis=1)
Personal_List= Class_2_df.values.tolist()
Personal_List= shuffle(Personal_List)
size = int(len(Personal_List) * 0.20)
train_set, test_set = Personal_List[size:], Personal_List[:size]
train_set=Personal_List

In [248]:
train_featuresets = [(document_features(Value), class2) for (Value,class2) in train_set]
test_featuresets = [(document_features(Value), class2) for (Value,class2) in test_set]

In [249]:
NaiveBayesClassifier = nltk.NaiveBayesClassifier.train(train_featuresets)
print("NaiveBayesClassifier accuracy percent:", (nltk.classify.accuracy(NaiveBayesClassifier, test_featuresets))*100)

NaiveBayesClassifier accuracy percent: 94.11764705882352


In [250]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_featuresets)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test_featuresets))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_featuresets)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, test_featuresets))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_featuresets)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_featuresets))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_featuresets)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_featuresets))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(train_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_featuresets)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_featuresets))*100)

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(train_featuresets)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test_featuresets))*100)

MNB_classifier accuracy percent: 94.11764705882352
BernoulliNB_classifier accuracy percent: 88.23529411764706
LogisticRegression_classifier accuracy percent: 94.11764705882352
SGDClassifier_classifier accuracy percent: 94.11764705882352
LinearSVC_classifier accuracy percent: 94.11764705882352


In [251]:
output = open('pickled/C3_Personal_NaiveBayesClassifier.pkl', 'wb')
pickle.dump(NaiveBayesClassifier, output)

output = open('pickled/C3_Personal_MNB_classifier.pkl', 'wb')
pickle.dump(MNB_classifier, output)

output = open('pickled/C3_Personal_BernoulliNB_classifier.pkl', 'wb')
pickle.dump(BernoulliNB_classifier, output)

output = open('pickled/C3_Personal_LogisticRegression_classifier.pkl', 'wb')
pickle.dump(LogisticRegression_classifier, output)

output = open('pickled/C3_Personal_SGDClassifier_classifier.pkl', 'wb')
pickle.dump(SGDClassifier_classifier, output)

output = open('pickled/C3_Personal_LinearSVC_classifier.pkl', 'wb')
pickle.dump(LinearSVC_classifier, output)