# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [9]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['hello', 'love', 'get', 'interview', 'today',..."
1,"['know', 'pls', 'open', 'back']"
2,"['case', 'wake', 'wondering', 'forgot', 'take'..."
3,"['gran', 'onlyfound', 'afew', 'days', 'agocuso..."
4,"['please', 'call', 'customer', 'service', 'rep..."


### Create TF-IDF Vectors

In [10]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [11]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'hello': 3584,
 'love': 4488,
 'get': 3296,
 'interview': 3934,
 'today': 7326,
 'happy': 3528,
 'good': 3364,
 'boy': 1575,
 'think': 7235,
 'meare': 4689,
 'missing': 4808,
 'know': 4218,
 'pls': 5597,
 'open': 5286,
 'back': 1291,
 'case': 1797,
 'wake': 7760,
 'wondering': 7987,
 'forgot': 3125,
 'take': 7076,
 'care': 1773,
 'something': 6673,
 'grandma': 3411,
 'done': 2565,
 'parade': 5403,
 'gran': 3408,
 'onlyfound': 5279,
 'afew': 918,
 'days': 2332,
 'agocusoon': 948,
 'honi': 3689,
 'please': 5589,
 'call': 1714,
 'customer': 2267,
 'service': 6398,
 'representative': 6074,
 '0800': 43,
 '169': 342,
 '6031': 633,
 '10am9pm': 266,
 'guaranteed': 3451,
 '1000': 252,
 'cash': 1798,
 '5000': 594,
 'prize': 5773,
 'yeah': 8146,
 'worse': 8023,
 'great': 3423,
 'trip': 7438,
 'india': 3879,
 'bring': 1616,
 'light': 4373,
 'everyone': 2835,
 'project': 5802,
 'lucky': 4526,
 'see': 6353,
 'smile': 6620,
 'bye': 1696,
 'abiola': 835,
 'ic': 3806,
 'lotta': 4480,
 'childporn': 192

In [12]:
# How are these vectors stored?
X_test_vect[0]

<1x8247 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [13]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [14]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [15]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [16]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.826 / Accuracy: 0.978
