In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [505]:
# start with a simple example
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!','you will be alright']

In [506]:
simple_train

['call you tonight',
 'Call me a cab',
 'please call me... PLEASE!',
 'you will be alright']

In [507]:
# learn the 'vocabulary' of the training data
vect = CountVectorizer()
vect.fit(simple_train)

In [508]:
print(vect.get_feature_names())

['alright', 'be', 'cab', 'call', 'me', 'please', 'tonight', 'will', 'you']




In [509]:
#transform trainig data to documant term matrix
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [510]:
pd.DataFrame(simple_train_dtm.todense(),columns =vect.get_feature_names())

Unnamed: 0,alright,be,cab,call,me,please,tonight,will,you
0,0,0,0,1,0,0,1,0,1
1,0,0,1,1,1,0,0,0,0
2,0,0,0,1,1,2,0,0,0
3,1,1,0,0,0,0,0,1,1


In [511]:
###TFIDF vectorizer
tfvect=TfidfVectorizer()
tfvect.fit(simple_train)

In [512]:
tfvect.get_feature_names()



['alright', 'be', 'cab', 'call', 'me', 'please', 'tonight', 'will', 'you']

In [513]:
simple_train_dtm=tfvect.transform(simple_train)

In [514]:
pd.DataFrame(simple_train_dtm.todense(),columns=tfvect.get_feature_names())

Unnamed: 0,alright,be,cab,call,me,please,tonight,will,you
0,0.0,0.0,0.0,0.4481,0.0,0.0,0.702035,0.0,0.553492
1,0.0,0.0,0.702035,0.4481,0.553492,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.284626,0.35157,0.891844,0.0,0.0,0.0
3,0.525473,0.525473,0.0,0.0,0.0,0.0,0.0,0.525473,0.414289


In [515]:
#### Using n gram values
vect = TfidfVectorizer(ngram_range=(1,2), stop_words=['me','you'])
vect.fit(simple_train)

# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm.todense()

cv = pd.DataFrame(simple_train_dtm.todense())
cv.columns = vect.get_feature_names()
cv
#As we can see the columns names have been sorted in the alphabetical order



Unnamed: 0,alright,be,be alright,cab,call,call cab,call please,call tonight,please,please call,tonight,will,will be
0,0.0,0.0,0.0,0.0,0.411378,0.0,0.0,0.644503,0.0,0.0,0.644503,0.0,0.0
1,0.0,0.0,0.0,0.644503,0.411378,0.644503,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.252159,0.0,0.395056,0.0,0.790112,0.395056,0.0,0.0,0.0
3,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214


In [516]:
####defining the stop words

sw=['me', 'you', 'i', 'am']

In [517]:
## UDF for the lower case conversion
def low_case(x):
    x=x.lower()
    return x

In [518]:
tvect = TfidfVectorizer(analyzer='word',lowercase=False, preprocessor=low_case, ngram_range=(1,1), max_features=10, max_df=3, min_df=1, stop_words=sw)
tvect = tvect.fit(simple_train)

In [519]:
simple_train_tdtm = tvect.transform(simple_train)
pd.DataFrame(simple_train_tdtm.toarray(), columns=tvect.get_feature_names())



Unnamed: 0,alright,be,cab,call,please,tonight,will
0,0.0,0.0,0.0,0.538029,0.0,0.842926,0.0
1,0.0,0.0,0.842926,0.538029,0.0,0.0,0.0
2,0.0,0.0,0.0,0.304035,0.952661,0.0,0.0
3,0.57735,0.57735,0.0,0.0,0.0,0.0,0.57735


In [520]:
#### Testing on the above trained vocabulary

In [521]:
test = ["Are you alright"]
test_tdtm = tvect.transform(test)

In [522]:
test_dtm_df = pd.DataFrame(test_tdtm.toarray(), columns=tvect.get_feature_names())




In [523]:
test_dtm_df

Unnamed: 0,alright,be,cab,call,please,tonight,will
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [524]:
#### Working on SMS data

In [525]:
sms = pd.read_csv('D:/Python/Dataset/sms.csv')

In [526]:
sms

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [527]:
sms.label.value_counts(normalize=True)*100

ham     86.593683
spam    13.406317
Name: label, dtype: float64

In [528]:
### Converting Label to the numeric data
sms['label'] = sms.label.map({'ham':0, 'spam':1})

In [529]:
sms.head(10)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [530]:
##defining the X and Y

In [531]:
x=sms['message']
y=sms['label']

In [532]:
## SPlitting into the train and test split

In [533]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)


In [534]:
### VEctorizing and cleaning the data

In [535]:
##X_train=["Nah I don't think he goes to usf, he lives around here though"]

In [536]:
from nltk.corpus import stopwords

In [537]:
stop=stopwords.words('english')

In [538]:
import re
def pre_process_text(x):
    x = x.lower()
    x = x.strip()
    x = re.sub(r' +', ' ', x)
    x = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", x)
    return(x)

In [539]:
vect = TfidfVectorizer(analyzer='word',lowercase=True, preprocessor=pre_process_text, ngram_range=(1,1), max_features=1000, max_df=0.95, min_df=10, stop_words=stop)



In [540]:
# learn training data vocabulary, then create document-term matrix
vect = vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_train_dtm



<4457x705 sparse matrix of type '<class 'numpy.float64'>'
	with 23266 stored elements in Compressed Sparse Row format>

In [541]:
X_train_dtm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [542]:
### Feature/Token names
X_train_tokens = vect.get_feature_names()



In [543]:
print(X_train_tokens[:50])

['able', 'abt', 'account', 'actually', 'address', 'aft', 'afternoon', 'age', 'ah', 'aight', 'almost', 'alone', 'already', 'alright', 'also', 'always', 'amp', 'ampm', 'angry', 'another', 'answer', 'anyone', 'anything', 'anytime', 'anyway', 'apply', 'ard', 'around', 'ask', 'asked', 'asking', 'ass', 'attempt', 'auction', 'available', 'await', 'award', 'awarded', 'away', 'awesome', 'babe', 'baby', 'back', 'bad', 'beautiful', 'bed', 'believe', 'best', 'better', 'big']


In [544]:
doc_term_mat=pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

In [545]:
doc_term_mat

Unnamed: 0,able,abt,account,actually,address,aft,afternoon,age,ah,aight,...,yes,yesterday,yet,yo,youll,youre,youve,yr,yrs,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322759,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [546]:
### Model Building and training to classify the Spam and Ham

In [547]:
# train a Naive Bayes model using X_train_dtm
from sklearn.naive_bayes import MultinomialNB, GaussianNB
nb = MultinomialNB()
#nb = GaussianNB()
nb.fit(doc_term_mat, y_train)

In [548]:
X_test_dtm = vect.transform(X_test)

In [549]:
# make class predictions for X_test_dtm
y_pred_class_train = nb.predict(doc_term_mat)
y_pred_class = nb.predict(X_test_dtm.toarray())



In [550]:
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

0.9748878923766816


In [551]:
# confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))

[[966   2]
 [ 26 121]]


In [552]:
print(metrics.classification_report(y_train, y_pred_class_train))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3857
           1       0.98      0.84      0.91       600

    accuracy                           0.98      4457
   macro avg       0.98      0.92      0.95      4457
weighted avg       0.98      0.98      0.98      4457



In [553]:
print(metrics.classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       968
           1       0.98      0.82      0.90       147

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [580]:
new_sms=["You have won the cash prize of $100"]

In [581]:
dtm=vect.transform(new_sms).todense()
dtm

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [582]:
nb.predict(pd.DataFrame(dtm,columns=vect.get_feature_names()))

array([1], dtype=int64)

######################################################################################################

#### Earlier we have performed the text classification without using the Lmmatization or stemming ,in this ##### part we will repeat the same steps and 

In [31]:
sms = pd.read_csv('D:/Python/Dataset/sms.csv')

In [32]:
sms.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
#### lemmatization on the message column 

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [34]:
def lemmatization(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word,pos='v') for word in word_tokenize(x)])  ### parts of speech as verb

sms['msg_lemmatized'] = sms['message'].apply(lemmatization)

In [35]:
sms=sms[['label','msg_lemmatized']]

In [36]:
sms.head(5)

Unnamed: 0,label,msg_lemmatized
0,ham,"Go until jurong point , crazy .. Available onl..."
1,ham,Ok lar ... Joking wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor ... U c already then sa...
4,ham,"Nah I do n't think he go to usf , he live arou..."


In [37]:
sms['label']=sms['label'].map({'ham':0,'spam':1})

In [38]:
sms

Unnamed: 0,label,msg_lemmatized
0,0,"Go until jurong point , crazy .. Available onl..."
1,0,Ok lar ... Joking wif u oni ...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor ... U c already then sa...
4,0,"Nah I do n't think he go to usf , he live arou..."
...,...,...
5567,1,This be the 2nd time we have try 2 contact u. ...
5568,0,Will ü b go to esplanade fr home ?
5569,0,"Pity , * be in mood for that . So ... any othe..."
5570,0,The guy do some bitch but I act like i 'd be i...


In [39]:
x=sms['msg_lemmatized']
y=sms['label']

In [40]:
#### Train Test split

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)


In [43]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(4457,)
(4457,)
(1115,)
(1115,)


In [52]:
import re

def pre_process_text(x):
    x = x.lower()
    x = x.strip()
    x = re.sub(r' +', ' ', x)
    x= re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", x)
    return(x)

In [53]:
from nltk.corpus import stopwords

In [54]:
stop=stopwords.words('english')

In [55]:
vect = TfidfVectorizer(analyzer='word',lowercase=True, preprocessor=pre_process_text, ngram_range=(1,1), max_features=1000, max_df=0.95, min_df=10, stop_words=stop)



In [57]:
# learn training data vocabulary, then create document-term matrix
vect = vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_train_dtm

<4457x672 sparse matrix of type '<class 'numpy.float64'>'
	with 24179 stored elements in Compressed Sparse Row format>

In [59]:
X_train_dtm.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [60]:
### Feature Names
X_train_tokens = vect.get_feature_names()



In [61]:
X_train_tokens

['able',
 'abt',
 'account',
 'actually',
 'add',
 'address',
 'aft',
 'afternoon',
 'age',
 'ah',
 'aight',
 'almost',
 'alone',
 'already',
 'alright',
 'also',
 'always',
 'amp',
 'ampm',
 'angry',
 'another',
 'answer',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'apply',
 'ard',
 'around',
 'ask',
 'ass',
 'attempt',
 'auction',
 'available',
 'await',
 'award',
 'away',
 'awesome',
 'babe',
 'baby',
 'back',
 'bad',
 'bath',
 'bcoz',
 'beautiful',
 'bed',
 'believe',
 'best',
 'better',
 'big',
 'bill',
 'birthday',
 'bite',
 'bonus',
 'book',
 'bore',
 'bout',
 'box',
 'boy',
 'boytoy',
 'break',
 'bring',
 'brother',
 'bt',
 'bus',
 'busy',
 'buy',
 'ca',
 'call',
 'camcorder',
 'camera',
 'cancel',
 'cant',
 'car',
 'card',
 'care',
 'carlos',
 'case',
 'cash',
 'catch',
 'cause',
 'chance',
 'change',
 'charge',
 'chat',
 'check',
 'chennai',
 'chikku',
 'choose',
 'claim',
 'class',
 'clean',
 'close',
 'club',
 'code',
 'collect',
 'collection',
 'college',
 'colour',
 

In [62]:
doc_term_mat=pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

In [63]:
doc_term_mat

Unnamed: 0,able,abt,account,actually,add,address,aft,afternoon,age,ah,...,yeah,year,years,yes,yesterday,yet,yo,yr,yrs,yup
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.385692,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.325186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4453,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.319641,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
#### Naive Bayes

In [65]:
# train a Naive Bayes model using X_train_dtm
from sklearn.naive_bayes import MultinomialNB, GaussianNB
nb = MultinomialNB()
#nb = GaussianNB()
nb.fit(doc_term_mat, y_train)

In [66]:
X_test_dtm = vect.transform(X_test)

In [67]:
# make class predictions for X_test_dtm
y_pred_class_train = nb.predict(doc_term_mat)
y_pred_class = nb.predict(X_test_dtm.toarray())



In [68]:
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

0.9757847533632287


In [69]:
### as we can see the little accuracy has been incrased after using the Lemmatization in this data

In [70]:
print(metrics.confusion_matrix(y_test, y_pred_class))

[[968   0]
 [ 27 120]]


In [71]:
print(metrics.classification_report(y_train, y_pred_class_train))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3857
           1       0.97      0.85      0.91       600

    accuracy                           0.98      4457
   macro avg       0.98      0.92      0.95      4457
weighted avg       0.98      0.98      0.98      4457



In [72]:
print(metrics.classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       968
           1       1.00      0.82      0.90       147

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



In [73]:
#### Now we will perform the Same steps with using the stemming

In [74]:
sms = pd.read_csv('D:/Python/Dataset/sms.csv')

In [75]:
sms.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Stemming

In [86]:
from nltk.stem import PorterStemmer
def stemmer(x):
    stemmer = PorterStemmer()
    return ''.join([stemmer.stem(x) for x in x])

In [89]:
sms['message']=sms['message'].apply(stemmer)

In [90]:
sms['label']=sms['label'].map({'ham':0,'spam':1})

In [91]:
x=sms['message']
y=sms['label']

In [92]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)


In [93]:
X_train

1642    hi , where are you? we're at  and they're not ...
2899          if you r @ home then come down within 5 min
480     when're you guys getting back? g said you were...
3485    tell my  bad character which u dnt lik in me. ...
157                           i'm leaving my house now...
                              ...                        
905     we're all getting worried over here, derek and...
5192    oh oh... den muz change plan liao... go back h...
3980    ceri u rebel! sweet dreamz me little buddy!! c...
235     text & meet someone sexy today. u can find a d...
5157                              k k:) sms chat with me.
Name: message, Length: 4457, dtype: object

In [94]:
def pre_process_text(x):
    x = x.lower()
    x = x.strip()
    x = re.sub(r' +', ' ', x)
    x= re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", x)
    return(x)

In [95]:
vect = TfidfVectorizer(analyzer='word',lowercase=True, preprocessor=pre_process_text, ngram_range=(1,1), max_features=1000, max_df=0.95, min_df=10, stop_words=stop)


In [96]:
# learn training data vocabulary, then create document-term matrix
vect = vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_train_dtm



<4457x705 sparse matrix of type '<class 'numpy.float64'>'
	with 23266 stored elements in Compressed Sparse Row format>

In [97]:
X_train_dtm.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [99]:
### Feature Names
X_train_tokens = vect.get_feature_names()
X_train_tokens

['able',
 'abt',
 'account',
 'actually',
 'address',
 'aft',
 'afternoon',
 'age',
 'ah',
 'aight',
 'almost',
 'alone',
 'already',
 'alright',
 'also',
 'always',
 'amp',
 'ampm',
 'angry',
 'another',
 'answer',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'apply',
 'ard',
 'around',
 'ask',
 'asked',
 'asking',
 'ass',
 'attempt',
 'auction',
 'available',
 'await',
 'award',
 'awarded',
 'away',
 'awesome',
 'babe',
 'baby',
 'back',
 'bad',
 'beautiful',
 'bed',
 'believe',
 'best',
 'better',
 'big',
 'birthday',
 'bit',
 'bonus',
 'book',
 'bored',
 'bout',
 'box',
 'boy',
 'boytoy',
 'break',
 'bring',
 'brother',
 'bt',
 'bus',
 'busy',
 'buy',
 'call',
 'called',
 'calling',
 'calls',
 'camcorder',
 'came',
 'camera',
 'cant',
 'car',
 'card',
 'care',
 'carlos',
 'case',
 'cash',
 'cause',
 'chance',
 'change',
 'charge',
 'chat',
 'check',
 'chikku',
 'choose',
 'claim',
 'class',
 'close',
 'club',
 'code',
 'collect',
 'collection',
 'college',
 'colour',
 'come',
 '

In [100]:
doc_term_mat=pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

In [101]:
doc_term_mat

Unnamed: 0,able,abt,account,actually,address,aft,afternoon,age,ah,aight,...,yes,yesterday,yet,yo,youll,youre,youve,yr,yrs,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322759,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# train a Naive Bayes model using X_train_dtm
from sklearn.naive_bayes import MultinomialNB, GaussianNB
nb = MultinomialNB()
#nb = GaussianNB()
nb.fit(doc_term_mat, y_train)

In [103]:
X_test_dtm = vect.transform(X_test)

In [104]:
# make class predictions for X_test_dtm
y_pred_class_train = nb.predict(doc_term_mat)
y_pred_class = nb.predict(X_test_dtm.toarray())



In [105]:
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

0.9748878923766816
