# Bag of Words Meets Bags of Popcorn
## 2. Bag of Words
#### Kaggle NLP Training

In [43]:
import datetime as dt
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Load the cleaned, labeled training reviews written by the Step 1 notebook
# (Prep Data). The file is tab-separated.
train = pd.read_csv('clean_labeled_train.tsv', sep='\t')

In [24]:
# Preview the first few rows of the training data
train.head()

Unnamed: 0,id,sentiment,review,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff go moment mj start listen music watch od...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war worlds timothy hines entertain fil...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film start manager nicholas bell give welcome ...
3,3630_4,0,It must be assumed that those who praised this...,must assume praise film greatest film opera ev...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...


In [25]:
# Build the bag-of-words vectorizer with scikit-learn's CountVectorizer.
# max_features caps the vocabulary at the 5000 most frequent terms.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [26]:
# Learn the vocabulary from the cleaned training reviews, count each term per
# review, and convert the resulting sparse matrix into a dense NumPy array.
train_data_features = (
    vectorizer
    .fit_transform(train['clean_review'])
    .toarray()
)

In [27]:
# (number of reviews, number of vocabulary terms) of the training count matrix
train_data_features.shape

(25000, 5000)

In [28]:
# Vocabulary terms learned by the vectorizer, one per column of the count matrix
vocab = vectorizer.get_feature_names_out()
vocab

array(['abandon', 'abc', 'abilities', ..., 'zombies', 'zone', 'zoom'],
      dtype=object)

In [29]:
# Optional: list each of the 5000 vocabulary words with its total count across
# the training reviews. Left commented out; uncommenting prints one line per word.
#dist = np.sum(train_data_features, axis=0)
#for tag, count in zip(vocab, dist):
#    print(count, tag)

In [31]:
# Train a 100-tree Random Forest on the bag-of-words counts.
# random_state fixes the forest's internal randomness so that rerunning the
# notebook reproduces the same model and predictions; n_jobs=-1 builds the
# trees in parallel on all available CPU cores.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# fit() returns the estimator itself; assigning it keeps the cell output empty.
model = model.fit(train_data_features, train['sentiment'])

In [32]:
# Load the cleaned test reviews (tab-separated file)
test = pd.read_csv('clean_labeled_test.tsv', sep='\t')

In [33]:
# Preview the first few rows of the test data
test.head()

Unnamed: 0,id,review,clean_revew
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main theme mortality nostalgia ...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kid saw tonight child love one point kid...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark leave impression several different...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction small time mob life film ne...


In [34]:
# (rows, columns) of the test frame
test.shape

(25000, 3)

In [36]:
# Encode the test reviews with the vocabulary already fitted on the training
# set (transform only, no refit) and densify the result.
# NOTE: 'clean_revew' is spelled this way in the test file's column header.
test_data_features = (
    vectorizer
    .transform(test['clean_revew'])
    .toarray()
)

In [37]:
# (number of reviews, number of vocabulary terms) of the test count matrix
test_data_features.shape

(25000, 5000)

In [38]:
# Predict a sentiment label for every test review with the fitted Random Forest
ans = model.predict(test_data_features)

In [40]:
# Show the predicted labels (NumPy abbreviates the display of long arrays)
ans

array([1, 0, 1, ..., 0, 1, 1])

In [41]:
# Assemble the submission frame: each test review id paired with its
# predicted sentiment label.
submission = test[['id']].assign(sentiment=ans)
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [45]:
# Write the predictions to a timestamped CSV file under submissions/
# to_csv does not create missing directories, so make sure the target exists
# first; exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('submissions', exist_ok=True)
filename = 'submissions/submission_'+dt.datetime.now().strftime("%Y%m%d-%H%M%S")+'.csv'
submission.to_csv(filename, header=True, index=False)