<a href="https://colab.research.google.com/github/ashkanb77/polarity-detection/blob/main/polarity_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
from nltk.corpus import stopwords
from nltk import word_tokenize, download
from string import punctuation

In [None]:
# Unpack the dataset archive into the current working directory.
# The context manager closes the archive's file handle once extraction is done.
with zipfile.ZipFile('data.zip') as archive:
  archive.extractall()

In [None]:
# Fetch the NLTK resources used below: the stop-word corpus and the Punkt
# tokenizer data that word_tokenize depends on. Recent NLTK releases load the
# tokenizer from 'punkt_tab' rather than 'punkt', so both are requested to keep
# the notebook runnable on a fresh kernel regardless of the installed version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
  download(resource)
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Read every negative review, tokenize it, and keep one cleaned,
# space-joined string per file. neg_max records the token count of the
# longest cleaned review.
neg_docs = []
neg_max = 0
trans = str.maketrans('', '', punctuation)  # loop-invariant: deletes ASCII punctuation
stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
# Sorted so the document order does not depend on the filesystem's listing order.
for file_name in sorted(os.listdir('data/neg')):
  with open('data/neg/' + file_name) as file:
    txt = file.read()
  # Drop stop words, strip punctuation, then filter again: tokens such as
  # "'s" or "'re" only match the stop-word list once the apostrophe is gone,
  # and punctuation-only tokens collapse to '' and must not be counted.
  stripped = (w.translate(trans) for w in word_tokenize(txt) if w not in stop_set)
  tokens = [w for w in stripped if w and w not in stop_set]
  neg_max = max(neg_max, len(tokens))
  neg_docs.append(' '.join(tokens))
len(neg_docs)

1000

In [None]:
# Read every positive review, tokenize it, and keep one cleaned,
# space-joined string per file. pos_max records the token count of the
# longest cleaned review.
pos_docs = []
pos_max = 0
trans = str.maketrans('', '', punctuation)  # loop-invariant: deletes ASCII punctuation
stop_set = set(stop_words)  # set lookup instead of scanning the list for every token
# Sorted so the document order does not depend on the filesystem's listing order.
for file_name in sorted(os.listdir('data/pos')):
  with open('data/pos/' + file_name) as file:
    txt = file.read()
  # Drop stop words, strip punctuation, then filter again: tokens such as
  # "'s" or "'re" only match the stop-word list once the apostrophe is gone,
  # and punctuation-only tokens collapse to '' and must not be counted.
  stripped = (w.translate(trans) for w in word_tokenize(txt) if w not in stop_set)
  tokens = [w for w in stripped if w and w not in stop_set]
  pos_max = max(pos_max, len(tokens))
  pos_docs.append(' '.join(tokens))
len(pos_docs)

1000

In [None]:
import random

# Seed the generator so that, for the same input order, the shuffle (and the
# train/test split sliced from these lists afterwards) is identical on every run.
random.seed(42)
random.shuffle(pos_docs)
random.shuffle(neg_docs)

In [None]:
# Training set: the first 800 reviews of each class, negatives first.
# Labels are 0 for negative and 1 for positive, and are sized from the actual
# slices so they always line up one-to-one with the documents.
neg_train, pos_train = neg_docs[:800], pos_docs[:800]
X_train = neg_train + pos_train
y_train = [0] * len(neg_train) + [1] * len(pos_train)

In [None]:
# Test set: everything after the first 800 reviews of each class, negatives first.
# Labels are 0 for negative and 1 for positive, and are sized from the actual
# slices rather than a fixed 200, so they stay aligned if the corpus size changes.
neg_test, pos_test = neg_docs[800:], pos_docs[800:]
X_test = neg_test + pos_test
y_test = [0] * len(neg_test) + [1] * len(pos_test)

In [None]:
# Longest cleaned review across both classes, measured in tokens.
max_len = neg_max if neg_max >= pos_max else pos_max

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPool1D, Input, Embedding, concatenate


In [None]:
# Build the word -> integer index from the training documents only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X_train)

In [None]:
# Preview the first 20 vocabulary entries instead of dumping the whole index,
# which holds tens of thousands of words and floods the cell output.
list(tokenizer.word_index.items())[:20]

{'s': 1,
 'film': 2,
 'nt': 3,
 'movie': 4,
 'one': 5,
 'like': 6,
 'even': 7,
 'good': 8,
 'time': 9,
 'would': 10,
 'story': 11,
 'much': 12,
 'character': 13,
 'also': 14,
 'get': 15,
 'characters': 16,
 'two': 17,
 'first': 18,
 'see': 19,
 'well': 20,
 'way': 21,
 'make': 22,
 'could': 23,
 'really': 24,
 'films': 25,
 'plot': 26,
 'life': 27,
 'little': 28,
 'people': 29,
 'scene': 30,
 'bad': 31,
 'never': 32,
 'man': 33,
 'best': 34,
 'many': 35,
 'scenes': 36,
 'new': 37,
 'know': 38,
 'movies': 39,
 'great': 40,
 'another': 41,
 'love': 42,
 'director': 43,
 'us': 44,
 'something': 45,
 'go': 46,
 'end': 47,
 'action': 48,
 'still': 49,
 'back': 50,
 'made': 51,
 'makes': 52,
 'world': 53,
 'work': 54,
 're': 55,
 'seems': 56,
 'however': 57,
 'big': 58,
 'though': 59,
 'every': 60,
 'better': 61,
 'enough': 62,
 'audience': 63,
 'seen': 64,
 'around': 65,
 'take': 66,
 'performance': 67,
 'gets': 68,
 'may': 69,
 'real': 70,
 'think': 71,
 'role': 72,
 'years': 73,
 'things'

In [None]:
# word_index numbering starts at 1, so one extra slot is added to cover index 0.
vocab_len = 1 + len(tokenizer.word_index)
vocab_len

40582

In [None]:
# Map each training document to its list of vocabulary indices.
encoded = tokenizer.texts_to_sequences(texts=X_train)

In [None]:
# Pad every encoded document at the end so all sequences have length max_len.
padded = pad_sequences(
  sequences=encoded,
  maxlen=max_len,
  padding='post',
)