# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Single place to edit if the data directory moves; the four reads below
# resolve to exactly the same relative paths as before.
DATA_DIR = '../../../data'

# Features and labels for the training and test splits
X_train = pd.read_csv(f'{DATA_DIR}/X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y_train.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv')

# Preview the first rows of the training features
X_train.head()

Unnamed: 0,clean_text
0,"['meet', 'lunch', 'la']"
1,"['lovely', 'smell', 'bus', 'aint', 'tobacco', '']"
2,"['rose', 'redred', 'bloodblood', 'heartheart',..."
3,"['yeah', 'usual', 'guys', 'town', 'therere', '..."
4,"['call', 'meet']"


### Create TF-IDF Vectors

In [3]:
# Instantiate a TF-IDF vectorizer, learn the vocabulary and IDF weights from the
# training messages only, then use that fitted vectorizer on both sets.
# Note: `clean_text` holds each token list as a string (e.g. "['call', 'meet']");
# the vectorizer's default token pattern extracts the words from that string.
tfidf_vect = TfidfVectorizer()
# fit_transform returns the same matrix as fit() followed by transform(), but
# only analyses the training corpus once instead of twice.
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])
# The test set is only transformed (never fitted), so it is encoded with the
# vocabulary learned from the training data.
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [5]:
# What words did the vectorizer learn?
# vocabulary_ is a dict mapping each learned term to an integer feature index.
# Displaying it bare shows the whole mapping, so the output below is long.
tfidf_vect.vocabulary_

{'meet': 4712,
 'lunch': 4543,
 'la': 4250,
 'lovely': 4503,
 'smell': 6658,
 'bus': 1675,
 'aint': 969,
 'tobacco': 7385,
 'rose': 6219,
 'redred': 6038,
 'bloodblood': 1492,
 'heartheart': 3566,
 'send': 6412,
 'tis': 7363,
 'ur': 7662,
 'friends': 3176,
 'including': 3858,
 'like': 4381,
 'get': 3291,
 'back': 1280,
 '1u': 366,
 'poor': 5682,
 'relation': 6063,
 '2u': 457,
 'need': 5052,
 'support': 7058,
 '3u': 520,
 'frnd': 3184,
 'many': 4638,
 '4some1': 577,
 'luvs': 4551,
 'some1': 6709,
 'praying': 5743,
 'god': 3338,
 'marry': 4656,
 'try': 7519,
 'yeah': 8216,
 'usual': 7693,
 'guys': 3470,
 'town': 7463,
 'therere': 7279,
 'definitely': 2395,
 'people': 5512,
 'around': 1149,
 'know': 4217,
 'call': 1715,
 'nothing': 5174,
 'splwat': 6816,
 'abt': 838,
 'whr': 7981,
 'ru': 6239,
 'real': 5990,
 'tho': 7308,
 'sucks': 7015,
 'cant': 1764,
 'even': 2824,
 'cook': 2139,
 'whole': 7978,
 'electricity': 2732,
 'im': 3829,
 'hungry': 3766,
 'donno': 2573,
 'genes': 3280,
 'someth

In [9]:
from itertools import islice

# Peek at the first ten (term, index) pairs of the learned vocabulary without
# materialising or printing the entire dictionary.
vocab_preview = islice(tfidf_vect.vocabulary_.items(), 10)
for term, index in vocab_preview:
    print(term, index)

meet 4712
lunch 4543
la 4250
lovely 4503
smell 6658
bus 1675
aint 969
tobacco 7385
rose 6219
redred 6038


In [11]:
# How are these vectors stored?
# [0] selects the row for the first test message; the bare expression shows the
# sparse-matrix repr (shape, dtype, number of stored entries), not the values.
X_test_vect[0]

<1x8321 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [12]:
# Can we convert the vectors to arrays?
# toarray() expands the sparse row into a dense NumPy array, zeros included.
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [13]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# Fix the seed: a random forest is stochastic, so without random_state the
# fitted model -- and the metrics reported from it -- change on every run.
rf = RandomForestClassifier(random_state=42)
# y_train is a DataFrame; .values.ravel() flattens it to the 1-D label array
# that fit() expects.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [14]:
# Use the trained model to make predictions on the test data
# (X_test_vect was encoded with the vectorizer fitted on the training set)
y_pred = rf_model.predict(X_test_vect)

In [16]:
# Inspect the training labels as loaded from CSV (a DataFrame with a `label` column)
y_train

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
4452,0
4453,0
4454,1
4455,0


In [15]:
# Show the flattened 1-D label array, i.e. the form that was passed to rf.fit()
y_train.values.ravel()

array([0, 0, 0, ..., 1, 0, 0])

In [17]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Use the label column as a 1-D Series so all three metrics receive the same
# ground-truth object (previously precision/recall got the whole DataFrame
# while accuracy used the column).
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score replaces the hand-rolled (matches / total) computation
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.76 / Accuracy: 0.968
