## Import packages and Read data

In [1]:
import pandas as pd
import nltk,re,string
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

stopword=nltk.corpus.stopwords.words('english')
st=nltk.SnowballStemmer('english')

In [3]:
data=pd.read_csv('./SMSCollection.csv',
                 sep='\t',names=['label','text'])#encoding='latin-1',error_bad_lines=False,header=None,

data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Create features and Clean text

In [4]:
#create features for a % of the text which is punctuation

def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)*100
data['text_len']=data['text'].apply(lambda x:len(x)-x.count(" "))
data['punct%']=data['text'].apply(lambda x:count_punct(x))
#data.head()

#function to perform basic cleaning tasks
def clean(text):
    text="".join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W+',text)
    text=[st.stem(word) for word in tokens if word not in stopword]
    return text

## Split into test and train sets

In [5]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(data[['text','text_len','punct%']],data['label'],test_size=0.2)

## Vectorize text

In [6]:
tfidf_vect=TfidfVectorizer(analyzer=clean)
tfidf_vect_fit=tfidf_vect.fit(xtrain['text'])#this vectorizer object has cleaned text

tfidf_train=tfidf_vect_fit.transform(xtrain['text'])#these will be sparse matrices
tfidf_test=tfidf_vect_fit.transform(xtest['text'])

#concat vectorized data with the data coulmns to get features
x_train_vect=pd.concat([xtrain[['text_len','punct%']].reset_index(drop=True),pd.DataFrame(tfidf_train.toarray())],axis=1)
x_test_vect=pd.concat([xtest[['text_len','punct%']].reset_index(drop=True),pd.DataFrame(tfidf_test.toarray())],axis=1)

x_train_vect.head()

Unnamed: 0,text_len,punct%,0,1,2,3,4,5,6,7,...,7125,7126,7127,7128,7129,7130,7131,7132,7133,7134
0,23,17.4,0.484048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55,10.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,51,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Final evaluation of models

In [7]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [8]:
#implement random forest model
rf=RandomForestClassifier(n_estimators=150,max_depth=None,n_jobs=-1)

start=time.time()
rf_model=rf.fit(x_train_vect,ytrain)
end=time.time()
fit_time=end-start

start=time.time()
ypred=rf_model.predict(x_test_vect)
end=time.time()
pred_time=end-start

precision,recall,fscore,support=score(ytest,ypred,pos_label='spam',average='binary')
print('Fit time {},Predict time {}---,precision {},recall {},accuracy {}'.format(round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),
                                                                        round((ytest==ypred).sum()/len(ypred),3)))



Fit time 12.147,Predict time 0.441---,precision 1.0,recall 0.815,accuracy 0.973


In [9]:
#implement gradient boost
gb=GradientBoostingClassifier(n_estimators=150,max_depth=11)

start=time.time()
gb_model=gb.fit(x_train_vect,ytrain)
end=time.time()
fit_time=end-start

start=time.time()
ypred=gb_model.predict(x_test_vect)
end=time.time()
pred_time=end-start

precision,recall,fscore,support=score(ytest,ypred,pos_label='spam',average='binary')
print('Fit time {},Predict time {}---,precision {},recall {},accuracy {}'.format(round(fit_time,3),round(pred_time,3),round(precision,3),round(recall,3),
                                                                        round((ytest==ypred).sum()/len(ypred),3)))



Fit time 5017.386,Predict time 0.25---,precision 0.953,recall 0.87,accuracy 0.975
