In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the datasets
titles = pd.read_csv('basics.tsv', sep='\t')
ratings = pd.read_csv('ratings.tsv', sep='\t')

# Merge the datasets
data = pd.merge(titles, ratings, on='tconst')

# Filter for movies only
movies = data[data['titleType'] == 'movie']

# Extract relevant columns
reviews = movies[['originalTitle', 'genres', 'primaryTitle', 'isAdult', 'averageRating', 'numVotes']]

# For simplicity, let's consider reviews with a significant number of votes
reviews = reviews[reviews['numVotes'] > 1000]

# Assign positive/negative sentiment based on average rating
reviews['sentiment'] = reviews['averageRating'].apply(lambda x: 'positive' if x >= 7 else 'negative')

# Split the data into training and testing sets
train_data, test_data = train_test_split(reviews, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['originalTitle'])

train_sequences = tokenizer.texts_to_sequences(train_data['originalTitle'])
test_sequences = tokenizer.texts_to_sequences(test_data['originalTitle'])

# Pad sequences to a fixed length
max_length = 50
train_sequences = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['sentiment'])
test_labels = label_encoder.transform(test_data['sentiment'])


In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, MultiHeadAttention, LayerNormalization, TimeDistributed

# Define a simple transformer block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([
            TimeDistributed(Dense(ff_dim, activation='relu')),
            TimeDistributed(Dense(embed_dim))
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)

        return self.layernorm2(out1 + ffn_output)

# Define your transformer model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length),
    TransformerBlock(embed_dim=128, num_heads=2, ff_dim=32),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [10]:
model.fit(train_sequences, train_labels, epochs=5, validation_data=(test_sequences, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x795e843573a0>

In [11]:
loss, accuracy = model.evaluate(test_sequences, test_labels)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 1.2822813987731934, Test Accuracy: 0.5623869895935059
