In [87]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [88]:
## Reading the given dataset
sentiment_train = pd.read_csv("Export_loop-sentiment-pos-neg-train_05112020000000.csv")
sentiment_test = pd.read_csv("sentiment-eval.csv")

In [89]:
print(sentiment_train.head())

      label                                               text
0  Negative  No one cares about marketing slides - a techni...
1  Positive  Are all three hosts providing storage/capacity...
2  Negative  would loved to had managed to get down to the ...
3  Negative  Vending machine at work is out of Dasani water...
4  Positive  RT @VMwareEdu: Paul Maritz, CEO and President ...


In [90]:
## Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label
data_set_train = []
for index,row in sentiment_train.iterrows():
    data_set_train.append((row['text'], row['label']))

In [91]:
data_set_test = []
for index,row in sentiment_test.iterrows():
    data_set_test.append(row['text'])

In [92]:
print(data_set_train[:5])

[('No one cares about marketing slides - a technical HOW TO is a lot better', 'Negative'), ("Are all three hosts providing storage/capacity? Or is one a witness? If all three have capacity drives, what'd you do with the witness? (and with them being NUCs, just how are you handling capacity and cache?)  I'm really wondering about two-node performance, as I currently have three hosts but one of them has FreeNAS virtualized and all the disk slots on that host are dedicated for that. I could add a DAS shelf or something, but I am very curious what it's like running v SAN with both capacity/cache only attached to two nodes and as far as the cluster is aware the third host is strictly only a VM host. And if that's where I'm heading, is the StarWind route going to offer something for performance and reliability with two storage nodes that VMware vSAN wouldn't be able to match with only two storage nodes?", 'Positive'), ("would loved to had managed to get down to the Camden Crawl show, but it'

In [93]:
print(data_set_test[:5])

["it's not like I haven't been talking about it for 6-8 years on my own blog.", 'With a modern data center, you can cut costs and improve #security. Join us in Houston tomorrow to learn more. http://bit.ly/2tH4MXR', "In today's storage landscape, there is a myriad of choices! Find out why #VMware managed storage is your answer:  http://bit.ly/2VuMzKn", "I hoped that it could be done via GUI.  I didn't even know about PowerCLI, but I'll try it [I'm a total newb, so not sure if I'll succeed].  I just can't understand why such an obviously demanded feature is not in the box...  Thanks for your help.", 'math final tomorrow...im getting an ulcer over it']


In [94]:
print(len(data_set_train))

1900


In [95]:
print(len(data_set_test))

211


### Preprocessing

In [96]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer() 
wordnet_lemmatizer = WordNetLemmatizer()


In [97]:
def preprocess(document, stem=True):
    'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

In [98]:
## - Performing the preprocessing steps on all messages
messages_set_train = []
for (message, label) in data_set_train:
    words_filtered = [e.lower() for e in preprocess(message, stem=False).split() if len(e) >= 3]
    messages_set_train.append((words_filtered, label))

In [99]:
messages_set_test = []
for (message) in data_set_test:
    words_filtered = [e.lower() for e in preprocess(message, stem=False).split() if len(e) >= 3]
    messages_set_test.append((words_filtered))

In [100]:
print(messages_set_test[:20])

[['like', "n't", 'talk', '6-8', 'years', 'blog'], ['modern', 'data', 'center', 'cut', 'cost', 'improve', 'security', 'join', 'houston', 'tomorrow', 'learn', 'http', '//bit.ly/2th4mxr'], ['today', 'storage', 'landscape', 'myriad', 'choices', 'find', 'vmware', 'manage', 'storage', 'answer', 'http', '//bit.ly/2vumzkn'], ['hop', 'could', 'via', 'gui', "n't", 'even', 'know', 'powercli', "'ll", 'try', 'total', 'newb', 'sure', "'ll", 'succeed', "n't", 'understand', 'obviously', 'demand', 'feature', 'box', '...', 'thank', 'help'], ['math', 'final', 'tomorrow', '...', 'get', 'ulcer'], ['huh', '...', "n't", 'expect', 'show', 'twitter', 'time', 'fix'], ['labs', 'available', 'online', 'take', 'approach', 'world', 'use', 'time', 'sessions', 'last', 'year'], ['find', 'mismatch', 'mtu', 'settings', 'switch', 'storage', "n't", 'time', 'figure', 'exact', 'culprit', 'part', "n't", 'responsibility', 'scale', '1500', 'resolve', 'issue', 'immediately', 'would', 'suggest', 'use', 'jumbo', 'frame', 'dedicate

In [101]:
print(messages_set_train[:20])

[(['one', 'care', 'market', 'slide', 'technical', 'lot', 'better'], 'Negative'), (['three', 'host', 'provide', 'storage/capacity', 'one', 'witness', 'three', 'capacity', 'drive', 'witness', 'nucs', 'handle', 'capacity', 'cache', 'really', 'wonder', 'two-node', 'performance', 'currently', 'three', 'host', 'one', 'freenas', 'virtualized', 'disk', 'slot', 'host', 'dedicate', 'could', 'add', 'das', 'shelf', 'something', 'curious', 'like', 'run', 'san', 'capacity/cache', 'attach', 'two', 'nod', 'far', 'cluster', 'aware', 'third', 'host', 'strictly', 'host', 'head', 'starwind', 'route', 'offer', 'something', 'performance', 'reliability', 'two', 'storage', 'nod', 'vmware', 'vsan', 'would', "n't", 'able', 'match', 'two', 'storage', 'nod'], 'Positive'), (['would', 'love', 'manage', 'get', 'camden', 'crawl', 'show', 'leetle', 'far', 'away'], 'Negative'), (['vend', 'machine', 'work', 'dasani', 'water', '...', 'boo'], 'Negative'), (['vmwareedu', 'paul', 'maritz', 'ceo', 'president', 'vmware', 'nam

### Preparing to create features

In [102]:
## - creating a single list of all words in the entire dataset for feature list creation

def get_words_in_messages(messages):
    all_words = []
    for (message, label) in messages:
      all_words.extend(message)
    return all_words

In [103]:
def get_words_in_messages_2(messages):
    all_words = []
    for (message) in messages:
      all_words.extend(message)
    return all_words

In [104]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words
## Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.

def get_word_features(wordlist):

    #print(wordlist[:10])
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [69]:
## - creating the word features for the entire dataset
train_word_features = get_word_features(get_words_in_messages(messages_set_train))
print(len(train_word_features))

5862


In [70]:
test_word_features = get_word_features(get_words_in_messages_2(messages_set_test))
print(len(test_word_features))

1439


### Preparing to create a train and test set

In [52]:
## - creating slicing index at 80% threshold
#sliceIndex = int((len(messages_set)*.8))

In [53]:
## - shuffle the pack to create a random and unbiased split of the dataset
#random.shuffle(messages_set)

In [54]:
train_messages = messages_set_train

In [71]:
tes2_messages = messages_set_test

### Preparing to create feature maps for train and test data

In [55]:
## creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [72]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [73]:
print(training_set[:10])

[({'contains(one)': True, 'contains(care)': True, 'contains(market)': True, 'contains(slide)': True, 'contains(technical)': True, 'contains(lot)': True, 'contains(better)': True, 'contains(three)': False, 'contains(host)': False, 'contains(provide)': False, 'contains(storage/capacity)': False, 'contains(witness)': False, 'contains(capacity)': False, 'contains(drive)': False, 'contains(nucs)': False, 'contains(handle)': False, 'contains(cache)': False, 'contains(really)': False, 'contains(wonder)': False, 'contains(two-node)': False, 'contains(performance)': False, 'contains(currently)': False, 'contains(freenas)': False, 'contains(virtualized)': False, 'contains(disk)': False, 'contains(slot)': False, 'contains(dedicate)': False, 'contains(could)': False, 'contains(add)': False, 'contains(das)': False, 'contains(shelf)': False, 'contains(something)': False, 'contains(curious)': False, 'contains(like)': False, 'contains(run)': False, 'contains(san)': False, 'contains(capacity/cache)': F

In [86]:
print(testing_set[:10])

[({'contains(one)': False, 'contains(care)': False, 'contains(market)': False, 'contains(slide)': False, 'contains(technical)': False, 'contains(lot)': False, 'contains(better)': False, 'contains(three)': False, 'contains(host)': False, 'contains(provide)': False, 'contains(storage/capacity)': False, 'contains(witness)': False, 'contains(capacity)': False, 'contains(drive)': False, 'contains(nucs)': False, 'contains(handle)': False, 'contains(cache)': False, 'contains(really)': False, 'contains(wonder)': False, 'contains(two-node)': False, 'contains(performance)': False, 'contains(currently)': False, 'contains(freenas)': False, 'contains(virtualized)': False, 'contains(disk)': False, 'contains(slot)': False, 'contains(dedicate)': False, 'contains(could)': False, 'contains(add)': False, 'contains(das)': False, 'contains(shelf)': False, 'contains(something)': False, 'contains(curious)': False, 'contains(like)': False, 'contains(run)': False, 'contains(san)': False, 'contains(capacity/cac

In [74]:
print('Training set size : ', len(training_set))
#print('Test set size : ', len(testing_set))

Training set size :  1900


### Training

In [75]:
## Training the classifier with NaiveBayes algorithm
SentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)
#SentimentClassifier_test = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation

In [78]:
## - Analyzing the accuracy of the test set
print(nltk.classify.accuracy(SentimentClassifier, training_set))

0.9278947368421052


In [79]:
## Analyzing the accuracy of the test set
print(nltk.classify.accuracy(SentimentClassifier, testing_set))

0.9315789473684211


In [85]:
m = pd.read_csv("sentiment-eval.csv", sep = ",")

## Testing a example message with our newly trained classifier

print('Classification result : ', SentimentClassifier.classify(extract_features(test_messages)))

TypeError: unhashable type: 'list'

In [155]:
## Priting the most informative features in the classifier
print(spamClassifier.show_most_informative_features(50))

Most Informative Features
          contains(http) = True           Positi : Negati =     32.4 : 1.0
      contains(congrats) = True           Positi : Negati =      8.0 : 1.0
          contains(shit) = True           Negati : Positi =      7.9 : 1.0
         contains(sorry) = True           Negati : Positi =      7.6 : 1.0
          contains(join) = True           Positi : Negati =      7.4 : 1.0
         contains(thank) = True           Positi : Negati =      7.3 : 1.0
        contains(delete) = True           Negati : Positi =      7.2 : 1.0
 contains(documentation) = True           Negati : Positi =      7.2 : 1.0
          contains(poor) = True           Negati : Positi =      7.2 : 1.0
          contains(page) = True           Negati : Positi =      7.0 : 1.0
      contains(consider) = True           Positi : Negati =      6.8 : 1.0
         contains(offer) = True           Positi : Negati =      6.8 : 1.0
         contains(share) = True           Positi : Negati =      6.7 : 1.0

In [128]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
