In [24]:
import nltk
import glob
import os
import random
import time
import numpy as np 
import pickle
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.metrics import MAE, MSE
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [21]:
DATA_DIR = "/Users/AlexPowers/projects/thee-flying-chicken/ml_scraping/data/"
PROJ_DIR = "/Users/AlexPowers/projects/thee-flying-chicken/ml_scraping/"

In [3]:
negMovie = []
posMovie = []
for f_name in glob.glob(os.path.join(DATA_DIR,"movie_reviews","neg","*")):
    negMovie.append(open(f_name, 'r').read())

for f_name in glob.glob(os.path.join(DATA_DIR,"movie_reviews", "pos", "*")):
    posMovie.append(open(f_name,'r').read())

In [4]:
negShort = open(os.path.join(DATA_DIR, "pos_neg_data", "neg.txt"), 'r').read()
posShort = open(os.path.join(DATA_DIR, "pos_neg_data", "pos.txt"), 'r').read()


In [60]:
documents = []
all_words = []
#  j is adjective, r is adverb, and v is verb
lemmatizer = WordNetLemmatizer()

for r in negShort.split("\n"):
    documents.append( (r, 1) )
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        all_words.append(lemmatizer.lemmatize( w[0].lower()) )
for r in posShort.split("\n"):
    documents.append( (r, 0) )
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        all_words.append(lemmatizer.lemmatize( w[0].lower()) )


for r in negMovie:
    documents.append( (r, 1) )
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        all_words.append(lemmatizer.lemmatize( w[0].lower()) )

for r in posMovie:
    documents.append( (r, 0) )
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        all_words.append(lemmatizer.lemmatize( w[0].lower()) )

In [61]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:len(all_words.keys())//5]

In [62]:
def find_features(document):
    words = word_tokenize(document)
    lem_words = [lemmatizer.lemmatize(w) for w in words]
    features = []
    for w in word_features:
        features.append(int(w in lem_words))

    return features

In [63]:
len(documents)

12664

In [64]:
random.shuffle(documents)
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [65]:
x_train, x_test, y_train, y_test = train_test_split(np.array([i[0] for i in featuresets]), np.array([i[1] for i in featuresets]), test_size=0.2, random_state=42)

In [66]:
x_train.shape

(10131, 9423)

In [67]:
y_train.shape

(10131,)

In [68]:
x_train.min(), x_train.max()

(0, 1)

In [69]:
y_train.min(), y_train.max()

(0, 1)

In [70]:
model = tf.keras.Sequential()
IN_DIM=len(x_train[0])

In [85]:
model.add(keras.layers.Dense(units=1024, activation='relu', input_dim=IN_DIM))
model.add(keras.layers.Dense(units=521, activation='relu'))
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=256, activation='relu'))
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=1, activation='relu'))
model.compile(loss=tf.keras.losses.mse, optimizer='adam',metrics=[MAE, MSE, 'accuracy'])

In [86]:
curr_time = time.strftime("%Y-%m-%d--%H-%M")
MODEL_DIR = os.path.join(PROJ_DIR, "model", curr_time)
LOG_DIR = os.path.join(PROJ_DIR, "out", curr_time)
for d in [MODEL_DIR, LOG_DIR]:
    if not os.path.isdir(d):
        os.mkdir(d)

In [83]:
model_checkpoint = ModelCheckpoint("%s/weights.{epoch:02d}-{loss:.2f}.hdf5" % (MODEL_DIR))
tensor_board = TensorBoard(log_dir=LOG_DIR, write_graph=True)

In [87]:
model.fit(x_train, y_train, batch_size=100, epochs=25,validation_data=(x_test,y_test), callbacks=[model_checkpoint, tensor_board])

Train on 10131 samples, validate on 2533 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f1073c1eb70>

In [98]:
print(classification_report(y_test, np.round(model.predict(x_test))))

              precision    recall  f1-score   support

           0       0.71      0.84      0.77      1256
           1       0.81      0.65      0.72      1277

   micro avg       0.75      0.75      0.75      2533
   macro avg       0.76      0.75      0.75      2533
weighted avg       0.76      0.75      0.75      2533

