In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re 
import nltk
import string
import pickle
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

### 1) Load the cleaned review data

In [27]:
# Read the comma-separated cleaned-reviews file into a DataFrame and
# preview its first rows.
reviews_path = r'../Data/cleaned_reviews.csv'
df = pd.read_csv(reviews_path, sep=',')

df.head()

Unnamed: 0,Sentiment,Class
0,crust good,0
1,tasty texture nasty,0
2,stopped late may bank holiday rick steve recom...,1
3,selection menu great prices,1
4,getting angry want damn pho,0


In [28]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# Vectorization

We use `CountVectorizer` to vectorize the sentences. It takes the words of each sentence and builds a vocabulary of all the unique words across the sentences. This vocabulary is then used to turn each sentence into a feature vector holding the count of each word:

In [29]:
# Features: the review text held in the 'Sentiment' column.
X = df['Sentiment']
# Target: the last column of the frame, as a plain NumPy array.
target_column = df.iloc[:, -1]
y = np.asarray(target_column)
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [30]:
# Split the dataset into training (80%) and test (20%) sets.
# A fixed random_state keeps the split -- and therefore everything derived
# from it (fitted vocabulary, pickled vectors and labels) -- reproducible
# across kernel restarts.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [31]:

## Vectorization of data
## Vectorize the data using Bag of words (BOW)

# Alternative considered but not used here: TfidfVectorizer(max_df=0.75,
# min_df=5, max_features=2000, ngram_range=(1, 2), use_idf=False,
# sublinear_tf=True, norm='l2', lowercase=False, encoding='latin-1').

# Tokenize on runs of word characters and drop NLTK's English stop words.
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
vectorizer = CountVectorizer(
    tokenizer=tokenizer.tokenize,
    token_pattern=None,  # unused when a custom tokenizer is supplied
    stop_words=stop_words,
)

# Learn the vocabulary from the training split only and vectorize it in one
# step; the test split is transformed with that same fitted vocabulary.
train_features = vectorizer.fit_transform(X_train)
test_features = vectorizer.transform(X_test)


print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of train_vectors:',train_features.shape)
print('Shape of test_vectors:',test_features.shape)

Shape of X_train: (2193,)
Shape of X_test: (549,)
Shape of train_vectors: (2193, 4584)
Shape of test_vectors: (549, 4584)




This vocabulary also serves as an index of the words: every unique token learned from the training sentences is assigned its own column. Each sentence is then transformed into a vector holding the number of occurrences of every vocabulary word in that sentence. In the run shown above the vocabulary contains 4584 entries, so each review is represented by a count vector of length 4584, and the training and test matrices share the same columns.

#### Save Vectors

In [32]:
# Persist the bag-of-words feature matrices (train and test) to disk.
# NOTE(review): only the transformed matrices are written here; the fitted
# `vectorizer` object itself is not saved by this cell.
feature_dumps = {
    '../vectors/train_vector.pkl': train_features,
    '../vectors/test_vector.pkl': test_features,
}
for pkl_path, matrix in feature_dumps.items():
    with open(pkl_path, 'wb') as handle:
        pickle.dump(matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [33]:
# Persist the label arrays of the train and test splits to disk.
label_paths = ('../vectors/y_train.pkl', '../vectors/y_test.pkl')
label_arrays = (y_train, y_test)
for out_path, labels in zip(label_paths, label_arrays):
    with open(out_path, 'wb') as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)