# Text Classification

This program allows you to train 3 different Naive Bayes classifiers to do the chunk text classification.

In [29]:
import numpy as np
import pandas as pd
import csv
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.model_selection import cross_val_score


def SampleData(dataset):
    import pandas as pd
    df=pd.read_csv(dataset, encoding = "ISO-8859-1")
    #with pd.option_context('max_colwidth',160):
        #display(df.head())
    return df.head()


def createVocabList(dataSet):
    vocabSet=set([])
    for document in dataSet:
        vocabSet=vocabSet|set(document)
    return list(vocabSet)

def vecVocab(vocabList,inputSet):
    returnVec=[0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)]=1
        else: print('the word: %s is not in my Vocabulary' % word)
    return returnVec

def textParse(bigString):
    import re
    #listOfTokens=re.split(r'\W*',bigString)
    listOfTokens=re.split(r'[^A-Za-z]+',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2]

def featDataset(dataset):

    output=dataset[:-4]+'_Vectorized.txt'
    with open(output,"w") as w:
        with open(dataset,'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f, (line.replace('\r', '') for line in f))
            labels = [row[0] for row in rows]
            f.seek(0)
            text = [row[1] for row in rows]
            
            #print(text)
            #print(labels)
            parsedText=list(map(textParse,text))            
            vocabList=createVocabList(parsedText)
            for word in vocabList:
                w.write(word+',')
            w.write('class\n')
            for i in range(len(labels)):
                returnVec=vecVocab(vocabList,parsedText[i])
                for num in returnVec:
                    w.write(str(num)+',')
                w.write(labels[i]+"\n")
            return(vocabList)        

def loadDataSet(dataset): 
    with open(dataset) as f:
        data=f.readlines()
        attributes=data[0].rstrip().split(',')[:-1]
        #print("attributes",len(attributes))
        instances=[entry.rstrip().split(',')[:-1] for entry in data[1:]]
        dataArray=[]
        for i in range(len(instances[0])):
            try:
                dataArray.append([float(instance[i]) for instance in instances])
            except:
                encodedData,codeBook=encode([instance[i] for instance in instances])
                dataArray.append(encodedData)
                print(attributes[i],': ',list(codeBook.items()))
        instances=np.array(dataArray).T
        labels=[entry.rstrip().split(',')[-1] for entry in data[1:]]
        #print(instances,labels)
        return instances,labels

def predict(testset):
    if "clf_B" in globals():
        prediction=clf_B.predict(testset)
        print("BernoulliNB: ",prediction)
    if "clf_G" in globals():
        prediction=clf_G.predict(testset)
        print("GaussianNB: ",prediction)
    if "clf_M" in globals():
        prediction=clf_M.predict(testset)
        print("MultinomialNB: ",prediction)
        
        
def predict_csv(testset):
    dict_list=[]
    fnames = ['Text','BernoulliNB_result','GaussianNB_Result','MultinomialNB_result']

    with open(testset,'r', encoding='utf-8-sig') as f:
        rows = csv.reader(f, (line.replace('\r', '') for line in f))
        text = [row[0] for row in rows]
        #print(text)
        parsedText=list(map(textParse,text))
        for t in text:
            returnVec=vecVocab(vocabList,textParse(t))
            feat_t=np.array(returnVec).reshape(1, -1)
            B_result = str(clf_B.predict(feat_t)).strip("['']")
            G_result = str(clf_G.predict(feat_t)).strip("['']")
            M_result = str(clf_M.predict(feat_t)).strip("['']")
                
            result_dict={'Text': t, 'BernoulliNB_result': B_result, 'GaussianNB_Result': G_result, 'MultinomialNB_result': M_result}
            dict_list.append(result_dict)
            
            
        with open(outFile, 'w') as of:
            writer = csv.DictWriter(of, fieldnames=fnames)
            writer.writeheader()
            
            for result_dict in dict_list:
                writer.writerow(result_dict)
                
                

In [30]:
#Sample the train dataset
dataset = 'train_set.csv'
sample = SampleData(dataset)
print(sample)

#generate the vocab list
text=[instance.rstrip() for instance in sample.iloc[:,1]]
parsedText=list(map(textParse,text))
vocabList=createVocabList(parsedText)
#print('Vocabulary List: \n',vocabList)

#featurize(vectorize) dataset
vocabList=featDataset(dataset)
print('The featurization is done!')

  class                                            content
0    No                            Landmark Center, 8th Fl
1    No  Contact: The C3 team at MakemeC3@cic.us -- Add...
2    No  A powerful tool for developers, the MySQL Data...
3    No              Easy access to T, Hubway, and parking
4   Yes                      Check out our Private Offices
The featurization is done!


# Evaluate 3 different classifiers

We used 3 different algorithms to do the classification. 

The block below uses k-fold cross validation to calculate the accuracy of each algorithm, so that we could later adopt the algorithm that performs the best in this case.

In [35]:
#Evaluate different classifiers

instances, labels = loadDataSet(dataset[:-4]+'_Vectorized.txt')

clf_B = BernoulliNB()
clf_B.fit(instances, labels)

clf_G = GaussianNB()
clf_G.fit(instances, labels)

clf_M = MultinomialNB()
clf_M.fit(instances, labels)

n_foldCV = 5
B_scores = cross_val_score(clf_B, instances, labels, cv=n_foldCV)

print("======BernoulliNB======")
print(B_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (B_scores.mean(), B_scores.std() * 2))

G_scores = cross_val_score(clf_G, instances, labels, cv=n_foldCV)
print("======GaussianNB======")
print(G_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (G_scores.mean(), G_scores.std() * 2))
        
M_scores = cross_val_score(clf_M, instances, labels, cv=n_foldCV)
print("======MultinomialNB======")
print(M_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (M_scores.mean(), M_scores.std() * 2))


[0.58823529 0.8        0.8        0.8        0.71428571]
Accuracy: 0.74 (+/- 0.17)
[0.64705882 0.73333333 0.66666667 0.66666667 0.92857143]
Accuracy: 0.73 (+/- 0.21)
[0.76470588 0.66666667 0.8        0.6        0.78571429]
Accuracy: 0.72 (+/- 0.15)




# Predict results

The block below performs all three classifiers and generate 3 columns in the output csv file to present the results of the prediction.

For 'testset', input the csv file which contains the the text that you want to predict.

For 'outFile', enter the out put file name. 
For the sample output, see test_set_predict.csv

In [32]:
#predict results

testset = 'test_set.csv'
outFile = 'test_set_predict.csv'

predict_csv(testset)
print('Prediction finished')

the word: workshop is not in my Vocabulary
the word: brookline is not in my Vocabulary
the word: lots is not in my Vocabulary
the word: micro is not in my Vocabulary
the word: off is not in my Vocabulary
the word: beacon is not in my Vocabulary
the word: solopreneurs is not in my Vocabulary
the word: telecommuters is not in my Vocabulary
the word: required is not in my Vocabulary
the word: notice is not in my Vocabulary
the word: leave is not in my Vocabulary
the word: tell is not in my Vocabulary
the word: deposit is not in my Vocabulary
the word: required is not in my Vocabulary
the word: insurance is not in my Vocabulary
the word: certificate is not in my Vocabulary
the word: required is not in my Vocabulary
the word: default is not in my Vocabulary
the word: theme is not in my Vocabulary
the word: presents is not in my Vocabulary
the word: clear is not in my Vocabulary
the word: concise is not in my Vocabulary
the word: beautiful is not in my Vocabulary
the word: form is not in my 