<a href="https://colab.research.google.com/github/bernardev254/sentiments_analysis/blob/main/ReviewsClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the pretrained word2vec model file and its two companion .npy arrays
# from Zenodo. `-O` sets the local file name and `-c` resumes a partially
# downloaded file instead of starting over. The local names all share the
# "wikiword2vec_model" prefix that the later Word2Vec.load("wikiword2vec_model")
# call refers to.
!wget -O wikiword2vec_model -c "https://zenodo.org/records/6542975/files/wiki_300_5_word2vec.model?download=1"
!wget -O wikiword2vec_model.wv.vectors.npy -c "https://zenodo.org/records/6542975/files/wiki_300_5_word2vec.model.wv.vectors.npy?download=1"
# NOTE(review): this URL says "wiki_300_50" while the two above say
# "wiki_300_5" — confirm it points at the array that belongs to the same model.
!wget -O wikiword2vec_model.syn1neg.npy -c "https://zenodo.org/records/6542975/files/wiki_300_50_word2vec.model.syn1neg.npy?download=1"

In [None]:
import pandas as pd
import tensorflow as tf

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim
import numpy as np
from keras.metrics import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, SimpleRNN

def build_model(max_len=100, embedding_dim=300):
    """Build and compile the feed-forward binary classifier for reviews.

    The network flattens a ``(max_len, embedding_dim)`` matrix of token
    embeddings and feeds it through two ReLU hidden layers (64 and 32 units)
    into a single sigmoid output unit.

    Args:
        max_len: Number of token positions per review. Defaults to 100.
        embedding_dim: Length of each token embedding. Defaults to 300.

    Returns:
        A compiled ``Sequential`` model using the RMSprop optimizer, binary
        cross-entropy loss, and accuracy plus F1 as metrics.
    """
    model = Sequential()
    model.add(Flatten(input_shape=(max_len, embedding_dim)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        # F1Score defaults to threshold=None, which takes an argmax over the
        # last axis; with a single output unit that marks every prediction as
        # positive. An explicit 0.5 threshold binarizes the sigmoid output.
        metrics=['acc', F1Score(threshold=0.5)],
    )
    return model

class ReviewsDataset(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of word2vec-embedded reviews.

    Each review string is split on whitespace, truncated to ``max_len``
    tokens, and every token is replaced by its word2vec vector (or a zero
    vector when the token is not in the vocabulary). Reviews shorter than
    ``max_len`` are zero-padded at the end.

    Args:
        data: Sequence of review strings.
        labels: Sliceable container of labels aligned with ``data``.
        word2vec_model: Model exposing ``.wv`` (token lookup and membership)
            and ``.vector_size``.
        max_len: Number of token positions kept per review.
        batch_size: Number of reviews per batch.
    """

    def __init__(self, data, labels, word2vec_model, max_len, batch_size):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.data = data
        self.labels = labels
        self.word2vec_model = word2vec_model
        self.max_len = max_len
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        # Ceiling division so the trailing samples are not silently dropped.
        return (len(self.data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return ``(embeddings, labels)`` for batch number ``idx``.

        ``embeddings`` is a float32 array of shape
        ``(reviews_in_batch, max_len, vector_size)``; ``labels`` is the
        matching slice of ``self.labels``.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_data = self.data[start:stop]
        batch_labels = self.labels[start:stop]

        vectors = self.word2vec_model.wv
        # Pre-allocate a zero-filled float batch: this keeps the real-valued
        # embeddings intact (integer padding would truncate them), provides
        # the post-padding for free, and keeps the shape fixed even when every
        # review in the batch is empty.
        embeddings = np.zeros(
            (len(batch_data), self.max_len, self.word2vec_model.vector_size),
            dtype=np.float32,
        )
        for row, review in enumerate(batch_data):
            for col, token in enumerate(review.split()[:self.max_len]):
                # Out-of-vocabulary tokens keep their zero vector.
                if token in vectors:
                    embeddings[row, col] = vectors[token]

        return embeddings, batch_labels

# Load the pretrained word2vec model saved under this file name.
word2vec_model = Word2Vec.load("wikiword2vec_model")

# Load and preprocess data.
# Every TSV is read with its first column as the index, so the review text
# (second column of the file) is positional column 0 of the DataFrame.

# Load training data.
# NOTE(review): this reads 'dev.tsv', the same file used for the dev split
# below — confirm whether a separate training file was intended.
train_data = pd.read_csv('dev.tsv', sep='\t', header=0, index_col=0)
train_reviews = train_data.iloc[:, 0].tolist()
train_labels = train_data['rating'].tolist()

# Load dev data; reviews and labels both come from dev_data itself.
dev_data = pd.read_csv('dev.tsv', sep='\t', header=0, index_col=0)
dev_reviews = dev_data.iloc[:, 0].tolist()
dev_labels = dev_data['rating'].tolist()

# Load the unlabeled test data; reviews come from test_data itself.
test_data = pd.read_csv('test_YourLastName_UID.tsv', sep='\t', header=0, index_col=0)
test_reviews = test_data.iloc[:, 0].tolist()

# The test file is unlabeled, so it may not have a 'rating' column. Fall back
# to zero placeholders so the dataset can still be built for prediction.
if 'rating' in test_data.columns:
    test_labels = test_data['rating']
else:
    test_labels = np.zeros(len(test_data), dtype=np.float32)

# Reshape the labels to (n, 1) to match the model's single output unit.
train_labels = np.expand_dims(train_labels, axis=1)
dev_labels = np.expand_dims(dev_labels, axis=1)


# Define hyperparameters.
embedding_dim = word2vec_model.vector_size
hidden_dim = 128
output_size = 1
num_layers = 1
max_len = 100
batch_size = 32
num_epochs = 10
learning_rate = 0.001

# Convert the labels to float32.
train_labels = tf.cast(train_labels, tf.float32)
dev_labels = tf.cast(dev_labels, tf.float32)

# Create datasets.
train_dataset = ReviewsDataset(train_reviews, train_labels, word2vec_model, max_len, batch_size)
val_dataset = ReviewsDataset(dev_reviews, dev_labels, word2vec_model, max_len, batch_size)
test_dataset = ReviewsDataset(test_reviews, test_labels, word2vec_model, max_len, batch_size)


# Build, inspect and train the classifier, validating on the dev split.
model = build_model()
model.summary()
history = model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

# Save the model.
model.save('my_model.keras')




