In [9]:
import pandas as pd
data = pd.read_csv('EK0bZ7sBRRiKU2kKs6fx_spamdata-1562916291191.csv', encoding = 'latin-1')
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [16]:
#it is time to clean the text data
import string
punctuations = string.punctuation

from nltk.corpus import stopwords
stopword = stopwords.words("english")

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()


def cleanText(text):
    #first convert to lower case
    cleanText = text.lower()
    #removing the punctuations
    cleanText = "".join(word for word in cleanText if word not in punctuations)
    #time to remove the stopwords as they are basically noise
    words = cleanText.split()
    words = [word for word in words if word not in stopword]
    cleanText = " ".join(words)
    #time to perform lemmatization
    words = [lem.lemmatize(word, "v") for word in words]
    words = [lem.lemmatize(word, "n") for word in words]
    return cleanText

#testing the function out
cleanText("I will, be playing a game today!")

'playing game today'

In [17]:
data["cleanText"] = data["text"].apply(cleanText)
data.head()

Unnamed: 0,label,text,cleanText
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [20]:
#Time to implement feature engineering
#We can generate some meta features such as word count, character count

data["rawWordCount"] = data["text"].apply(lambda x : len(x.split()))
data["cleanWordCount"] = data["cleanText"].apply(lambda x : len(x.split()))
data['characterCount'] = data['text'].apply(lambda x :len(x))
data['characterCountWithoutSpace'] = data['text'].apply(lambda x :len(x.replace(" ", "")))
data["digitOccurrece"] = data["text"].apply(lambda x: sum([1 if word.isdigit() else 0 for word in x.split()]))

In [21]:
data

Unnamed: 0,label,text,cleanText,rawWordCount,cleanWordCount,characterCount,characterCountWithoutSpace,digitOccurrece
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,29,24,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though,13,8,61,49,0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling 3 weeks word back id like ...,32,19,150,119,1
6,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent,16,8,77,62,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...,26,16,160,135,0
8,spam,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...,26,18,160,135,1
9,spam,Had your mobile 11 months or more? U R entitle...,mobile 11 months u r entitled update latest co...,29,18,154,126,2


In [23]:
#now we can focus on natural language language based features
#such as number of nouns present, number of words present

pos_dic = {"noun": ["NNP" , "NN", "NNS", "NNPS"], "verb" : ["VBZ", "VB", "VBD", "VBN", "VBG"]}

def partOfSpeechTag(text, family):
    #tokenize the sentence
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    count  = 0
    #get the tags of the tokenized words
    for tag in tags:
        #get the part of speech tag
        tag = tag[1]
        #check if it is present in the predefined dictionary of ours
        if tag in pos_dic[family]:
            count += 1
    return count

#test the function out
partOfSpeechTag("They are playing in the ground", "verb")

1

In [24]:
data["nounCount"] = data["text"].apply(lambda x : partOfSpeechTag(x, "noun"))
data["verbCount"] = data["text"].apply(lambda x : partOfSpeechTag(x, "verb"))
data

Unnamed: 0,label,text,cleanText,rawWordCount,cleanWordCount,characterCount,characterCountWithoutSpace,digitOccurrece,nounCount,verbCount
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,29,24,0,4,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2,13,3
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0,3,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though,13,8,61,49,0,1,4
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling 3 weeks word back id like ...,32,19,150,119,1,9,8
6,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent,16,8,77,62,0,3,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...,26,16,160,135,0,12,4
8,spam,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...,26,18,160,135,1,11,6
9,spam,Had your mobile 11 months or more? U R entitle...,mobile 11 months u r entitled update latest co...,29,18,154,126,2,11,3


In [26]:
#lets look at some advanced features
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#lets create their objects
cvz = CountVectorizer()
#we will fit the clean data over here
cvz.fit(data["cleanText"].values)
count_vectors = cvz.transform(data["cleanText"].values)
count_vectors

<5572x9373 sparse matrix of type '<class 'numpy.int64'>'
	with 47219 stored elements in Compressed Sparse Row format>

In [49]:
word_tfidf = TfidfVectorizer(max_features = 500)
#we will fit the clean data over here
word_tfidf.fit(data["cleanText"].values)
word_vectors_tfidf = word_tfidf.transform(data["cleanText"].values)
word_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 25793 stored elements in Compressed Sparse Row format>

In [50]:
tfidf = dict(zip(word_tfidf.get_feature_names(), word_tfidf.idf_))
tfidf_idf = pd.DataFrame(columns = ["word_tfdif"]).from_dict(tfidf, orient = "index")
tfidf_idf.columns = ["word_tfidf"]
tfidf_idf

Unnamed: 0,word_tfidf
10,6.581166
100,5.962127
1000,5.888019
150,5.888019
150p,6.406813
150ppm,6.070341
16,5.693863
18,5.841499
1st,6.014771
2000,5.988103


In [54]:
#we now havea lot of features to play with. We now have to combine it all
# we will create a sparse matrix
from scipy.sparse import hstack, csr_matrix
meta_features = [ 'rawWordCount', 'cleanWordCount',
       'characterCount', 'characterCountWithoutSpace', 'digitOccurrece',
       'nounCount', 'verbCount']
feature_set1 = data[meta_features]
train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)], "csr")
train

<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 59165 stored elements in Compressed Sparse Row format>

In [55]:
#it is time for classification, but before that we have to label encode the label column
from sklearn.preprocessing import LabelEncoder
target = data["label"].values
target = LabelEncoder().fit_transform(target)
target

array([0, 0, 1, ..., 0, 0, 0])

In [58]:
from sklearn.model_selection import train_test_split
train_x, validation_x, train_y, validation_y = train_test_split(train, target)

In [59]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import accuracy_score

In [60]:
model = naive_bayes.MultinomialNB()
model.fit(train_x, train_y)
preds = model.predict(validation_x)
accuracy_score(preds, validation_y)

0.9698492462311558

In [61]:
model = LogisticRegression()
model.fit(train_x, train_y)
preds = model.predict(validation_x)
accuracy_score(preds, validation_y)



0.9633883704235463

In [62]:
model = svm.SVC()
model.fit(train_x, train_y)
preds = model.predict(validation_x)
accuracy_score(preds, validation_y)
#SVM works well for text classification when there is a large data set



0.9289303661162958

In [63]:
#ensemble methods
model = ensemble.ExtraTreesClassifier()
model.fit(train_x, train_y)
preds = model.predict(validation_x)
accuracy_score(preds, validation_y)



0.9777458722182341