In [36]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from text_cleaner import clean_text

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))



  tokens = re.split('\W+', text)


In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

In [38]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7110,7111,7112,7113,7114,7115,7116,7117,7118,7119
0,36,30.6,0.254656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,129,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,84,10.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [40]:
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

rf_model = rf.fit(X_train_vect, y_train)
y_pred = rf_model.predict(X_test_vect)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.812 / Accuracy: 0.978


In [43]:
import joblib

import dill as pickle
import pandas as pd
from text_cleaner import clean_text
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Save the TfidfVectorizer
with open('tfidf_vect.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vect, vectorizer_file)


In [48]:
import dill as pickle
import pandas as pd
from text_cleaner import clean_text


model_filename = 'rf_model.pkl'
vectorizer_filename = 'tfidf_vect.pkl'

with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open(vectorizer_filename, 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

sen = "I've been searching for the right words to thank you..."
input_df = pd.DataFrame({
    'body_text': [sen],
    'body_len': [len(sen) - sen.count(" ")],
    'punct%': [count_punct(sen)] 
})
print(f"TF-IDF train shape: {tfidf_train.shape}")
print(f"TF-IDF test shape: {tfidf_test.shape}")

tfidf_input_tf = loaded_vectorizer.transform(input_df['body_text'])
print(f"TF-IDF input shape: {tfidf_input_tf.shape}")

input_vect = pd.concat([input_df[['body_len', 'punct%']].reset_index(drop=True), pd.DataFrame(tfidf_input_tf.toarray())], axis=1)
input_vect.columns = input_vect.columns.astype(str)

print(f"Combined input shape: {input_vect.shape}")
print(f"Training data feature count: {X_train_vect.shape[1]}")
print(f"New input data feature count: {input_vect.shape[1]}")




TF-IDF train shape: (4453, 7120)
TF-IDF test shape: (1114, 7120)
TF-IDF input shape: (1, 7120)
Combined input shape: (1, 7122)
Training data feature count: 7122
New input data feature count: 7122


In [49]:
final_prediction = loaded_model.predict(input_vect)
print("Prediction:", final_prediction)

Prediction: ['ham']
