### SPAM Ham Detection

In [175]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [176]:
## Reading the given dataset
#spam = pd.read_csv("Export_loop-sentiment-pos-neg-train_05112020000000.csv", sep = "\t", names=["label", "text"])
data1 = pd.read_csv("Export_loop-sentiment-pos-neg-train_05112020000000.csv", sep = ",")

In [177]:
print(data1.head())

      label                                               text
0  Negative  No one cares about marketing slides - a techni...
1  Positive  Are all three hosts providing storage/capacity...
2  Negative  would loved to had managed to get down to the ...
3  Negative  Vending machine at work is out of Dasani water...
4  Positive  RT @VMwareEdu: Paul Maritz, CEO and President ...


In [244]:
words_remove = ["i","you","edu","can","lines","what", "there","all","we",
                "one","the","a","an","of","or","in","for","by","on","but","is","in","a","not","with","as",
                "was","if","they","are","this","and","it","have","has","from","at","my","be","by","not","that","to",
                "from","com","org","like","likes","so","said","from","what","told","over","more","other",
                "have","last","with","this","that","such","when","been","says","will","also","where","why",
                "would","today", "in", "you", "r", "d", "u", "hw","wat", "oly", "s", "b", "ht", 
                "rt", "p","the","th", "n", "was","rsvp","http"]

In [219]:
def cleanlist(df, words_to_remove = words_remove): 
    'removing re-tweet, user mention, hashtags and urls in dataframe level'
   
    # remove emoticons form the dataframe if any
    df['text'] = df['text'].replace(r'<ed>','', regex = True)
    df['text'] = df['text'].replace(r'\B<U+.*>|<U+.*>\B|<U+.*>','', regex = True)
    
    # convert all strings to lowercase
    df['text'] = df['text'].str.lower()
    
    #remove user mentions
    df['text'] = df['text'].replace(r'^(@\w+)',"", regex=True)
    
    #remove 'rt' in the beginning
    #df['text'] = df['text'].replace(r'^(rt @)',"", regex=True)
    
    #remove_symbols 
    df['text'] = df['text'].replace(r'[^a-zA-Z0-9]', " ", regex=True)

    #remove punctuations 
    df['text'] = df['text'].replace(r'[[]!"#$%\'()\*+,-./:;<=>?^_`{|}]+',"", regex = True)

    #remove_URL(x):
    df['text'] = df['text'].replace(r'https.*$', "", regex = True)
    
    #remove words of length 1 or 2 
    df['text'] = df['text'].replace(r'\b[a-zA-Z]{1,2}\b','', regex=True)

    #remove numbers of length 1 or 2 and years
    df['text'] = df['text'].replace(r'\b[0-9]{2,4}\b','', regex=True)
    
    #remove extra spaces in the feedbacks
    df['text'] = df['text'].replace(r'^\s+|\s+$'," ", regex=True)
     
    
    #remove stopwords and words_to_remove
    stop_words = set(stopwords.words('english'))
    mystopwords = [stop_words, "via", words_to_remove]
    
    df['full_text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in mystopwords]))
    

    return df

#get the processed feedback data
feedback = cleanlist(data1)

In [220]:
print(feedback.head(200))

        label                                               text  \
0    Negative   one cares about marketing slides    technical...   
1    Positive  are all three hosts providing storage capacity...   
2    Negative  would loved  had managed  get down  the camden...   
3    Negative    vending machine  work  out  dasani water    boo   
4    Positive  vmwareedu  paul maritz  ceo and president  vmw...   
..        ...                                                ...   
195  Positive  the vmware vcloud air disaster recovery campai...   
196  Negative   the everything admin unfortunately   looked t...   
197  Positive  only  you remove and  add from the gui   power...   
198  Negative  yup  had this happen twice  both times  had   ...   
199  Positive   done both   get more out   person  some  the ...   

                                             full_text  
0    one cares about marketing slides technical how...  
1    are all three hosts providing storage capacity...  
2    would l

In [221]:
## Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label
data_set = []
for index,row in feedback.iterrows():
    data_set.append((row['text'], row['label']))

In [222]:
print(data_set[:50])

[(' one cares about marketing slides    technical how    lot better', 'Negative'), ('are all three hosts providing storage capacity    one  witness   all three have capacity drives  what  you  with the witness   and with them being nucs  just how are you handling capacity and cache      really wondering about two node performance    currently have three hosts but one  them has freenas virtualized and all the disk slots  that host are dedicated for that   could add  das shelf  something  but   very curious what   like running  san with both capacity cache only attached  two nodes and  far  the cluster  aware the third host  strictly only   host  and  that  where   heading   the starwind route going  offer something for performance and reliability with two storage nodes that vmware vsan wouldn   able  match with only two storage nodes ', 'Positive'), ('would loved  had managed  get down  the camden crawl show  but    leetle too far away', 'Negative'), ('vending machine  work  out  dasani

In [223]:
print(type(data_set))

<class 'list'>


In [224]:
print(len(data_set))

1900


### Preprocessing

In [225]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [226]:
def preprocess(document, stem=True):
    'changes document to lower case, and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)
    



    return document

In [245]:
## - Performing the preprocessing steps on all messages
messages_set = []
for (message, label) in data_set:
    words_filtered = [e for e in preprocess(message, stem=False).split() if len(e) >= 3]
    messages_set.append((words_filtered, label))

In [246]:
print(messages_set[:50])

[(['one', 'care', 'about', 'market', 'slide', 'technical', 'how', 'lot', 'better'], 'Negative'), (['all', 'three', 'host', 'provide', 'storage', 'capacity', 'one', 'witness', 'all', 'three', 'have', 'capacity', 'drive', 'what', 'you', 'with', 'the', 'witness', 'and', 'with', 'them', 'nucs', 'just', 'how', 'you', 'handle', 'capacity', 'and', 'cache', 'really', 'wonder', 'about', 'two', 'node', 'performance', 'currently', 'have', 'three', 'host', 'but', 'one', 'them', 'have', 'freenas', 'virtualized', 'and', 'all', 'the', 'disk', 'slot', 'that', 'host', 'dedicate', 'for', 'that', 'could', 'add', 'das', 'shelf', 'something', 'but', 'very', 'curious', 'what', 'like', 'run', 'san', 'with', 'both', 'capacity', 'cache', 'only', 'attach', 'two', 'nod', 'and', 'far', 'the', 'cluster', 'aware', 'the', 'third', 'host', 'strictly', 'only', 'host', 'and', 'that', 'where', 'head', 'the', 'starwind', 'route', 'offer', 'something', 'for', 'performance', 'and', 'reliability', 'with', 'two', 'storage', 

### Preparing to create features

In [229]:
## - creating a single list of all words in the entire dataset for feature list creation

def get_words_in_messages(messages):
    all_words = []
    for (message, label) in messages:
      all_words.extend(message)
    return all_words

In [230]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words
## Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.

def get_word_features(wordlist):

    #print(wordlist[:10])
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [231]:
## - creating the word features for the entire dataset
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

5199


### Preparing to create a train and test set

In [232]:
## - creating slicing index at 80% threshold
sliceIndex = int((len(messages_set)*.8))

In [233]:
## - shuffle the pack to create a random and unbiased split of the dataset
random.shuffle(messages_set)

In [234]:
train_messages, test_messages = messages_set[:sliceIndex], messages_set[sliceIndex:]

In [235]:
len(train_messages)
len(test_messages)

380

### Preparing to create feature maps for train and test data

In [236]:
## creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [237]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [238]:
print(training_set[:5])

[({'contains(one)': True, 'contains(care)': False, 'contains(about)': False, 'contains(market)': False, 'contains(slide)': False, 'contains(technical)': False, 'contains(how)': False, 'contains(lot)': False, 'contains(better)': False, 'contains(all)': False, 'contains(three)': False, 'contains(host)': True, 'contains(provide)': False, 'contains(storage)': False, 'contains(capacity)': False, 'contains(witness)': False, 'contains(have)': True, 'contains(drive)': False, 'contains(what)': True, 'contains(you)': True, 'contains(with)': True, 'contains(the)': True, 'contains(and)': True, 'contains(them)': False, 'contains(nucs)': False, 'contains(just)': True, 'contains(handle)': True, 'contains(cache)': False, 'contains(really)': False, 'contains(wonder)': False, 'contains(two)': False, 'contains(node)': False, 'contains(performance)': False, 'contains(currently)': False, 'contains(but)': False, 'contains(freenas)': False, 'contains(virtualized)': False, 'contains(disk)': False, 'contains(s

In [239]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  1520
Test set size :  380


### Training

In [240]:
## Training the classifier with NaiveBayes algorithm
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation

In [241]:
## - Analyzing the accuracy of the training set
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9223684210526316


In [242]:
## Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.7052631578947368


In [243]:
m = pd.read_csv("sentiment-eval.csv", sep = ",")

## Testing a example message with our newly trained classifier

print('Classification result : ', spamClassifier.classify(extract_features(m)))

Classification result :  Negative


In [82]:
## Testing a example message with our newly trained classifier
#m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
#print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

In [88]:
## Priting the most informative features in the classifier
print(spamClassifier.show_most_informative_features(30))

Most Informative Features
          contains(http) = True           Positi : Negati =     44.9 : 1.0
         contains(error) = True           Negati : Positi =      9.6 : 1.0
           contains(own) = True           Positi : Negati =      9.0 : 1.0
         contains(sleep) = True           Negati : Positi =      8.8 : 1.0
      contains(congrats) = True           Positi : Negati =      8.4 : 1.0
          contains(damn) = True           Negati : Positi =      8.1 : 1.0
 contains(documentation) = True           Negati : Positi =      8.1 : 1.0
          contains(join) = True           Positi : Negati =      7.8 : 1.0
       contains(vmworld) = True           Positi : Negati =      6.8 : 1.0
          contains(free) = True           Positi : Negati =      6.8 : 1.0
          contains(most) = True           Positi : Negati =      6.8 : 1.0
        contains(easier) = True           Positi : Negati =      6.7 : 1.0
    contains(enterprise) = True           Positi : Negati =      6.7 : 1.0

In [28]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
