### Importing Libraries and Loading Data

In [1]:
import pandas as pd
import re
import string
import nltk

pd.set_option('display.max_colwidth', 100)

ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

stopword = nltk.corpus.stopwords.words('english')

df = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
df.columns = ['label', 'email']

df.head()

Unnamed: 0,label,email
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


### Cleaning Dataframe Email(text)

In [2]:
def clean_text(text):
    
    ## Removing punctuation like ' !"#$%&\'()*+, '
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    
    ## Tokenizing the email text
    tokens = re.split('\W+', text)
    
    ## Removing Stopwords like 'he,him,is,a,for,the,as,but,do etc'
    text = [wn.lemmatize(word) for word in tokens if word not in stopword]
    return text

### Coverting raw text to Vectors

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['email'])

In [4]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature Creation and Spliting into Features & labels

In [5]:
import string

def punct_count(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

df['email_len'] = df['email'].apply(lambda x: len(x) - x.count(" "))
df["%punct"] = df['email'].apply(lambda x: punct_count(x))
df.head()

Unnamed: 0,label,email,email_len,%punct
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,128,4.7
1,ham,"Nah I don't think he goes to usf, he lives around here though",49,4.1
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,62,3.2
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,135,4.4


In [6]:
## Features and Labels

X_features = pd.concat([df['email_len'], df['%punct'], X_tfidf_df], axis=1)
labels = df['label']

### Coverting Data into train and test split

In [7]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_features, labels, test_size=0.4)

### Using RandomForest Classifier to Train and Evaluate

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
model = rf.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)
print(y_pred)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


### Evaluating Model and Scores

In [11]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

In [12]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred), 3)))

Precision: 1.0 / Recall: 0.539 / Accuracy: 0.941


### Checking Models trained for Overfitting on Train and Test Split

In [13]:
X_features_pred = model.predict(X_train)
print(X_features_pred)

['ham' 'spam' 'ham' ... 'ham' 'ham' 'ham']


In [14]:
precision, recall, fscore, support = score(y_train, X_features_pred, pos_label='spam', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((X_features_pred==y_train).sum() / len(X_features_pred), 3)))

Precision: 1.0 / Recall: 0.686 / Accuracy: 0.957


### Basically The Range Of Accuracy for Both X_features traing data and Labels Test Data is very Similar, which concludes that the Spliting of data is not "overfitting" by any means.

### Evaluating RandomForest Classifier Using Cross-Validation

In [15]:
from sklearn.model_selection import KFold, cross_val_score

In [None]:
rfc = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rfc, X_features, labels, cv = k_fold, scoring='accuracy', n_jobs=-1)