In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import pickle
from collections import Counter
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import gc


In [2]:
def create_lexicon (directory=os.getcwd(), max_quant=0.998, min_quant=0.972, visualize=False):
    
    all_words = []
    lemmatizer = WordNetLemmatizer()

    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(filename, 'r') as file:
                for line in file:
                    words_in_line = word_tokenize(line)
                    all_words += words_in_line
        
    all_words_lem = [lemmatizer.lemmatize(word) for word in all_words]
    all_words_cnt = Counter(all_words_lem)
    
    freq_list = [freq for _ , freq in all_words_cnt.items()]
    max_freq = np.quantile(freq_list, max_quant)
    min_freq = np.quantile(freq_list, min_quant)

    all_words_filt = {word : freq for word, freq in all_words_cnt.items() if  min_freq < freq < max_freq}
    freq_list_filt = [freq for _ , freq in all_words_filt.items()]
    words_list_filt = [word for word , _ in all_words_filt.items()]
    
    if visualize:
        print ('number of words in the filtered dictionary:', len(freq_list_filt))
        print ('max repetitions considered:', max_freq)
        print ('min repetitions considered:', min_freq)
        plt.hist(freq_list_filt, bins = len(freq_list_filt))
        plt.show()
    
    return words_list_filt

In [3]:
def create_dataframe (lexicon, sample, is_positive):
    
    lemmatizer = WordNetLemmatizer()
    dataset = []
    label = 1 if is_positive else 0
    
    with open(sample, 'r') as file:
        for line in file:
            line_tokenized = word_tokenize(line)
            line_lemmatized = [lemmatizer.lemmatize(line_tokenized) for line_tokenized in line_tokenized]
            feature = np.zeros(len(lexicon))
            
            for word in line_lemmatized:
                if word in lexicon:
                    feature[lexicon.index(word)] += 1
                    
            dataset.append([feature, label])
        
        dataframe = pd.DataFrame(dataset, columns=['text_encod', 'label']) # .reset_index(drop = True)

    return dataframe

In [4]:
def merge_n_split_df(pos_file, neg_file):
    
    lexicon = create_lexicon()

    pos_df = create_dataframe(lexicon, pos_file, is_positive = True)
    neg_df = create_dataframe(lexicon, neg_file, is_positive = False)

    full_dataframe = pd.concat([pos_df, neg_df]) #, ignore_index = True
    
    train_x, test_x, train_y, test_y  = train_test_split(full_dataframe['text_encod'], full_dataframe['label'], 
                                                        test_size=0.1, shuffle=True)
    

    return train_x, test_x, train_y, test_y

In [5]:
# if __name__ == '__main__':

train_x, test_x, train_y, test_y = merge_n_split_df(pos_file = 'pos.txt', neg_file = 'neg.txt')
merge_n_split_df(pos_file = 'pos.txt', neg_file = 'neg.txt')    

    # with open('sentiment_set.pickle','wb') as file:
        # pickle.dump([train_x,test_x,train_y,test_y],file)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 4965: character maps to <undefined>

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=2000, 
                                activation='relu', 
                                input_shape=(train_x.iloc[0].shape), # number of words in the lexicon (to reshape accordingly)
                                kernel_initializer='glorot_uniform')) #uniform distribution weights initialization
print(train_x[0].shape)
model.add(tf.keras.layers.Dense(4000))
# model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(8000))
# model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(16000))
model.add(tf.keras.layers.Dense(2, activation='softmax')) # 2 (pos or neg) output (to one-hot encode accordingly)
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', 
              optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001), 
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

early_stopping_monitor = EarlyStopping(monitor='accuracy', patience=3)

x_train_resh = np.vstack(train_x)
y_train_resh = to_categorical(train_y, num_classes=2)

history = model.fit(x_train_resh,
                    y_train_resh,
                    epochs=50,
                    batch_size = 100,
                    callbacks=[early_stopping_monitor])

In [None]:
y_pred_perc = model(np.vstack(test_x))
y_pred = tf.argmax(y_pred_perc, axis = 1)
y_test = [1 if y == [0, 1] else 0 for y in test_y]
test_acc = np.mean(y_pred==y_test)
test_acc

In [None]:
model1 = tf.keras.models.Sequential()
model1.add(tf.keras.layers.Dense(units=10000, 
                                activation='relu', 
                                input_shape=(train_x.iloc[0].shape), # number of words in the lexicon (to reshape accordingly)
                                kernel_initializer='glorot_uniform')) #uniform distribution weights initialization
print(train_x[0].shape)
model1.add(tf.keras.layers.Dense(10000))
model1.add(tf.keras.layers.Dense(10000))
model1.add(tf.keras.layers.Dense(2, activation='softmax')) # 2 (pos or neg) output (to one-hot encode accordingly)
model1.summary()

In [None]:
model1.compile(loss='categorical_crossentropy', 
              optimizer = tf.keras.optimizers.Adam(), 
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

early_stopping_monitor = EarlyStopping(monitor='accuracy', patience=3)

x_train_resh = np.vstack(train_x)
y_train_resh = to_categorical(train_y, num_classes=2)

history = model1.fit(x_train_resh,
                    y_train_resh,
                    epochs=50,
                    batch_size = 1000,
                    callbacks=[early_stopping_monitor])

In [None]:
y_pred_perc1 = model1(np.vstack(test_x))
y_pred1 = tf.argmax(y_pred_perc1, axis = 1)
y_test1 = [1 if y == [0, 1] else 0 for y in test_y]
test_acc1 = np.mean(y_pred1==y_test1)
test_acc1

In [None]:
model2 = tf.keras.models.Sequential()
model2.add(tf.keras.layers.Dense(units=1000, 
                                activation='relu', 
                                input_shape=(train_x.iloc[0].shape), # number of words in the lexicon (to reshape accordingly)
                                kernel_initializer='glorot_uniform')) #uniform distribution weights initialization
print(train_x[0].shape)
model2.add(tf.keras.layers.Dense(1000))
model2.add(tf.keras.layers.Dense(1000))
model2.add(tf.keras.layers.Dense(1000))
model2.add(tf.keras.layers.Dense(1000))
model2.add(tf.keras.layers.Dense(2, activation='softmax')) # 2 (pos or neg) output (to one-hot encode accordingly)
model2.summary()

In [None]:
model2.compile(loss='categorical_crossentropy', 
              optimizer = tf.keras.optimizers.Adam(), 
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

early_stopping_monitor = EarlyStopping(monitor='accuracy', patience=3)

x_train_resh = np.vstack(train_x)
y_train_resh = to_categorical(train_y, num_classes=2)

history = model2.fit(x_train_resh,
                    y_train_resh,
                    epochs=50,
                    # batch_size = 1000,
                    callbacks=[early_stopping_monitor])

In [None]:
y_pred_perc2 = model2(np.vstack(test_x))
y_pred2 = tf.argmax(y_pred_perc2, axis = 1)
y_test2 = [0 if y == [0, 1] else 1 for y in test_y]
test_acc2 = np.mean(y_pred2==y_test2)
test_acc2

In [None]:
gc.collect()