# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['dear', 'call']"
1,"['jus', 'came', 'back', 'fr', 'lunch', 'wif', ..."
2,"['played', 'smash', 'bros', 'ltgt', 'religious..."
3,"['asked', 'hows', 'anthony', 'dad', 'bf']"
4,"['slow', 'using', 'biolas', 'fne']"


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [6]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'dear': 2333,
 'call': 1701,
 'jus': 4125,
 'came': 1731,
 'back': 1263,
 'fr': 3135,
 'lunch': 4552,
 'wif': 7968,
 'sis': 6579,
 'leh': 4351,
 'played': 5606,
 'smash': 6644,
 'bros': 1610,
 'ltgt': 4539,
 'religiously': 6063,
 'asked': 1161,
 'hows': 3735,
 'anthony': 1055,
 'dad': 2267,
 'bf': 1410,
 'slow': 6634,
 'using': 7680,
 'biolas': 1431,
 'fne': 3080,
 'urgent': 7659,
 'costa': 2143,
 'del': 2375,
 'sol': 6690,
 'holiday': 3677,
 '5000': 597,
 'await': 1233,
 'collection': 2010,
 '09050090044': 162,
 'toclaim': 7383,
 'sae': 6263,
 'tc': 7164,
 'pobox334': 5641,
 'stockport': 6914,
 'sk38xh': 6596,
 'costå': 2148,
 '150pm': 315,
 'max10mins': 4685,
 'dunno': 2637,
 'lei': 4353,
 'might': 4786,
 'eatin': 2669,
 'frens': 3165,
 'wan': 7833,
 'eat': 2667,
 'wait': 7811,
 'lar': 4297,
 'hiya': 3650,
 'comin': 2029,
 'bristol': 1602,
 'st': 6842,
 'week': 7897,
 'april': 1103,
 'les': 4362,
 'got': 3385,
 'rudi': 6234,
 'new': 5081,
 'yrs': 8261,
 'eve': 2812,
 'snoringthey': 

In [7]:
# How are these vectors stored?
X_test_vect[0]

<1x8296 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [8]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [9]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [10]:
# Use the trained model to make predictions on the test data
y_pred=rf_model.predict(X_test_vect)

In [11]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.752 / Accuracy: 0.968
