# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the four train/test CSV files in one pass; the order of the paths
# matches the order of the names they are unpacked into.
X_train, X_test, y_train, y_test = map(
    pd.read_csv,
    (
        '../../../data/X_train.csv',
        '../../../data/X_test.csv',
        '../../../data/y_train.csv',
        '../../../data/y_test.csv',
    ),
)

# Preview the first rows of the training frame
X_train.head()

Unnamed: 0,clean_text
0,"['brainless', 'baby', 'dolld', 'vehicle', 'sar..."
1,"['nimbomsons', 'yep', 'phone', 'knows', 'one',..."
2,"['record', 'one', 'night', '']"
3,"['went', 'attend', 'another', 'two', 'rounds',..."
4,"['hi', 'juan', 'im', 'coming', 'home', 'fri', ..."


### Create TF-IDF Vectors

In [2]:
# Instantiate a TF-IDF vectorizer, fit it on the training messages only, and
# use that fitted vectorizer to transform both the training and test sets.
tfidf_vect = TfidfVectorizer()
# fit_transform() learns the vocabulary/IDF weights and vectorizes the training
# text in a single pass; it returns the same matrix as fit() followed by
# transform() without tokenizing the training corpus twice.
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])
# The test set is only transformed (never fitted), so it is projected onto the
# vocabulary learned from the training data.
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
# vocabulary_ is the fitted mapping from each term to its integer feature
# index; as the last expression in the cell, the whole mapping is displayed.
tfidf_vect.vocabulary_

{'brainless': 1540,
 'baby': 1255,
 'dolld': 2523,
 'vehicle': 7678,
 'sariyag': 6253,
 'drive': 2577,
 'madoke': 4538,
 'barolla': 1297,
 'nimbomsons': 5049,
 'yep': 8153,
 'phone': 5492,
 'knows': 4175,
 'one': 5236,
 'obviously': 5176,
 'cos': 2119,
 'thats': 7207,
 'real': 5928,
 'word': 8000,
 'record': 5966,
 'night': 5041,
 'went': 7873,
 'attend': 1192,
 'another': 1046,
 'two': 7506,
 'rounds': 6169,
 'todaybut': 7339,
 'still': 6864,
 'didt': 2421,
 'reach': 5916,
 'home': 3640,
 'hi': 3583,
 'juan': 4056,
 'im': 3790,
 'coming': 2010,
 'fri': 3133,
 'hey': 3578,
 'course': 2140,
 'expect': 2841,
 'welcome': 7863,
 'party': 5401,
 'lots': 4436,
 'presents': 5711,
 'ill': 3787,
 'get': 3253,
 'back': 1260,
 'loads': 4385,
 'love': 4447,
 'nicky': 5037,
 'kallis': 4083,
 'ready': 5926,
 'bat': 1308,
 '2nd': 430,
 'innings': 3861,
 'arun': 1146,
 'transfr': 7428,
 'amt': 1021,
 'short': 6471,
 'cute': 2226,
 'good': 3325,
 'person': 5463,
 'nah': 4943,
 'dub': 2601,
 'je': 3998,

In [4]:
# How are these vectors stored?
# Indexing row 0 of the transformed test matrix returns a single-row sparse
# matrix (only the non-zero entries are stored), not a dense array.
X_test_vect[0]                  # very sparse vector

<1x8240 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [6]:
# Can we convert the vectors to arrays?
# toarray() expands the sparse row into a dense NumPy array, materializing
# every zero entry explicitly.
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [7]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap sampling and random feature
# selection), so pin the seed; without it the metrics reported further down
# change on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)
# y_train comes from read_csv as a DataFrame; .values.ravel() flattens it into
# the 1-D label array passed to fit().
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [8]:
# Use the trained model to make predictions on the test data
# X_test_vect was produced by the same fitted vectorizer as the training
# matrix, so its columns line up with the features the model was trained on.
y_pred = rf_model.predict(X_test_vect)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Pull the label column out once so all three metrics are computed from the
# same 1-D vector of true labels.
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score replaces the hand-rolled (y_pred == labels).sum() / len(y_pred);
# it computes the same fraction of correctly classified samples.
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.807 / Accuracy: 0.97


In [None]:
# 100% precision -> when it identified an item as spam, it was always correct
# ~81% recall -> correctly identifies about 81% of the spam that exists
# 97% accuracy -> correctly labels 97% of all items