In [None]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

# Load the tab-separated SMS corpus; the file has no header row, so the
# columns are labelled 0 (class) and 1 (message text).
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, encoding='utf-8')

In [None]:
# df.info() writes its summary to stdout itself and returns None, so it is
# called bare: wrapping it in print() added a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Column 0 carries the class label of each message ('ham' or 'spam').
classes = df.loc[:, 0]
# Show how many messages fall in each class.
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [None]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit(classes).transform(classes)
# Peek at the first ten encoded labels.
print(y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [None]:
# Column 1 holds the raw message text.
text_messages = df.loc[:, 1]
# Preview the first ten messages.
print(text_messages.head(10))

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [None]:
# Normalise the raw SMS text with regular expressions.
#
# regex=True is passed explicitly on every call: since pandas 2.0 the default
# for Series.str.replace is a literal match, which would silently turn every
# rule below into a no-op.
#
# NOTE(review): the e-mail, URL and phone patterns are anchored with ^...$, so
# they only fire when the *entire* message is such a token -- confirm that is
# the intended behaviour.

# Start from the raw messages: e-mail addresses -> 'emailaddress'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress', regex=True)

# Every later step chains on `processed` (not `text_messages`) so that no
# earlier substitution is thrown away.
# URLs -> 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)$', 'webaddress', regex=True)
# Dollar signs -> 'moneysym'
processed = processed.str.replace(r'\$', 'moneysym', regex=True)
# Seven-digit phone numbers, optional parentheses/separator -> 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{4}$', 'phonenumber', regex=True)
# Integers and decimals -> 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number', regex=True)
# Punctuation (anything that is not a word character or whitespace) -> space
processed = processed.str.replace(r'[^\w\d\s]', ' ', regex=True)
# Collapse runs of whitespace into a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)
# Strip leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)


In [None]:
# Lower-case every message so that tokens differing only in case
# (e.g. 'Free' / 'free') are treated as the same word downstream.
processed=processed.str.lower()

# Show the cleaned messages (pandas truncates the printed Series).
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [None]:
# Remove stop words -- step 1: download NLTK's 'stopwords' corpus,
# which the next cell reads via nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Rebuild each message from only those whitespace-separated tokens
# that are not stop words.
processed = processed.apply(
    lambda text: ' '.join([token for token in text.split() if token not in stop_words])
)

In [None]:
# Stem every token with the Porter stemmer so inflected forms share one feature.
ps = nltk.PorterStemmer()
processed = processed.apply(lambda text: ' '.join(map(ps.stem, text.split())))

In [None]:
# Show the messages after stop-word removal and stemming.
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbernd time tri number contact u u number po...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [None]:
# Tokenize -- step 1: download NLTK's 'punkt' tokenizer data
# (presumably what word_tokenize in the next cell relies on -- confirm).
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

# Gather every token from every cleaned message into one flat list ...
all_words = []

# An explicit loop is used on purpose: `message` stays bound at module level
# after the loop finishes.
for message in processed:
  all_words.extend(word_tokenize(message))

# ... then replace the list with a frequency distribution (token -> count).
all_words = nltk.FreqDist(all_words)

In [None]:
# Report the vocabulary size and the 15 most frequent tokens.
print(
    'Number of words:{}'.format(len(all_words)),
    'Most common words:{}'.format(all_words.most_common(15)),
    sep='\n',
)

Number of words:6567
Most common words:[('number', 3071), ('u', 1207), ('call', 679), ('go', 456), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261)]


In [None]:
# Feature vocabulary: the 1500 most frequent tokens.
# keys() iterates in first-seen order, so slicing it kept whichever 1500
# tokens happened to appear earliest in the corpus instead of the most common.
word_features = [word for word, _count in all_words.most_common(1500)]

def find_features(messages):
  """Build the bag-of-words feature dict for one message.

  Args:
    messages: the text of a single (already cleaned) message. The plural
      name is kept so existing callers are unaffected.

  Returns:
    dict mapping every word in the module-level ``word_features`` list to
    True if that word occurs among the message's tokens, else False.
  """
  # Tokenise the *argument*. Reading the module-level `message` left over
  # from an earlier loop would give every message identical features.
  tokens = set(word_tokenize(messages))  # set -> O(1) membership checks
  return {word: (word in tokens) for word in word_features}

# Example: print each vocabulary word that is present in the first message.
features = find_features(processed[0])
for key, value in features.items():
  if value:
    print(key)

In [None]:
# Display the first cleaned message.
# NOTE(review): the output recorded under this cell (5572) is not a message
# string, so it looks stale -- re-run to confirm.
processed[0]

5572

In [None]:

# Pair each cleaned message with its integer label. Materialised as a list so
# the pairs survive more than one pass (a bare zip is exhausted after one).
messages = list(zip(processed, y))

# Reproducibility: seed NumPy's global RNG. This has to be a *call*;
# assigning to np.random.seed replaces the function and seeds nothing.
seed = 1
np.random.seed(seed)

# One (feature-dict, label) tuple per message.
featuresets = [(find_features(text), label) for (text, label) in messages]

In [None]:
from sklearn import model_selection

# Hold out 25% of the feature sets for testing; the fixed random_state keeps
# the split the same on every run.
training, testing = model_selection.train_test_split(
    featuresets,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Report how many examples landed in each split.
print(
    'Training:{}'.format(len(training)),
    'Testing:{}'.format(len(testing)),
    sep='\n',
)

Training:4179
Testing:1393


In [None]:
#deploying scikit-learn classifiers with nltk

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix



In [None]:
# Display names, in the same order as `classifiers` below. zip() pairs the two
# lists positionally and stops at the shorter one, so they must have the same
# length -- the missing 'Logistic Regression' entry mislabelled every model
# from LogisticRegression onward and dropped the SVC.
names = [
    'K Nearest Neighbors',
    'Decision Tree',
    'Random Forest',
    'Logistic Regression',
    'SGD Classifier',
    'naive bayes',
    'SVM Linear',
]
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear'),
]

# A list (not a bare zip) so the pairs print readably and can be iterated
# more than once.
models = list(zip(names, classifiers))
print(models)

<zip object at 0x7f302861cf00>


In [None]:

from nltk.classify.scikitlearn import SklearnClassifier
# Fit each scikit-learn model through NLTK's wrapper and report its accuracy
# on the held-out set as a percentage.
for name, model in models:
  nltk_model = SklearnClassifier(model).train(training)  # train() returns the wrapper
  accuracy = 100 * nltk.classify.accuracy(nltk_model, testing)
  print('{}:Accuracy:{}'.format(name, accuracy))


In [None]:
#voting
from sklearn.ensemble import VotingClassifier

# Display names, in the same order as `classifiers` below. The two lists must
# be the same length: zip() pairs them positionally and stops at the shorter,
# so the missing 'Logistic Regression' entry mislabelled later models and
# dropped the SVC.
names = [
    'K Nearest Neighbors',
    'Decision Tree',
    'Random Forest',
    'Logistic Regression',
    'SGD Classifier',
    'naive bayes',
    'SVM Linear',
]
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear'),
]

# VotingClassifier expects a list of (name, estimator) tuples, so the
# one-shot zip iterator is materialised into a list.
models = list(zip(names, classifiers))


In [None]:
# Hard-voting ensemble over all the models, wrapped for use with NLTK.
# `estimators` must be a list of (name, estimator) tuples; list(...) makes
# that true whether `models` is already a list or still a zip iterator
# (the TypeError recorded for this cell is consistent with a zip being passed).
nltk_ensemble = SklearnClassifier(
    VotingClassifier(estimators=list(models), voting='hard', n_jobs=-1)
)
nltk_ensemble.train(training)

# Accuracy on the held-out set, as a percentage.
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Ensembled accuracy:{}'.format(accuracy))
# Original author's note: 95.5 (presumably the accuracy of an earlier run -- unverified)

TypeError: ignored

In [None]:
# Split the held-out (features, label) pairs into two parallel tuples,
# then have the ensemble label every test message.
txt_features, labels= zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)

AttributeError: ignored

In [None]:
# Per-class precision / recall / F1 on the held-out set.
print(classification_report(labels, prediction))

# Confusion matrix as a labelled table: rows are actual classes, columns are
# predicted classes (label 0 = ham, label 1 = spam).
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index=[['actual', 'actual'], ['ham', 'spam']],
    columns=[['predicted', 'predicted'], ['ham', 'spam']],
)


NameError: ignored