# Importing Data

In [1]:
import pandas as pd

# Load the training file. No header row is read from the file; the four
# column names are supplied explicitly.
train_df = pd.read_csv(
    r"C:\Users\ok\Desktop\archive\Genre Classification Dataset\train_data.txt",
    sep=" ::: ",  # multi-character separator, hence the Python parsing engine
    names=["id", "movie", "genre", "summary"],
    engine="python",
)

In [2]:
# Preview the first rows of the training data to check that the
# id / movie / genre / summary columns were parsed as expected
train_df.head()

Unnamed: 0,id,movie,genre,summary
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


# Data Manipulation

## Shuffling the data

In [3]:
# Shuffle every row (frac=1 returns the whole frame in random order).
# A fixed seed keeps the shuffled order identical across kernel restarts,
# so downstream splits and metrics are reproducible.
train_shuffled = train_df.sample(frac=1, random_state=42)

## Splitting the data

In [4]:
# Split the shuffled data into training and validation sets with
# scikit-learn's train_test_split, holding out 10% for validation.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_shuffled["summary"],
    train_shuffled["genre"],
    test_size=0.1,
    random_state=42,  # fixed seed so the same rows land in each split on every run
)
#X_test, y_test = test_df["summary"], test_df["genre"]

## One-Hot Encoding (Labels)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns dense arrays. The `sparse` keyword was renamed
# to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the new
# spelling first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Fit on the genre column of the full shuffled frame so the encoder's
# categories cover both the training and validation labels.
ohe.fit(train_shuffled["genre"].to_numpy().reshape(-1, 1))

# Encode the label Series of each split; reshape(-1, 1) gives the 2-D
# single-column input the encoder expects.
train_ohe = ohe.transform(y_train.to_numpy().reshape(-1, 1))
val_ohe = ohe.transform(y_val.to_numpy().reshape(-1, 1))
#test_ohe = ohe.transform(y_test.to_numpy().reshape(-1,1))



## List (Summary)

In [6]:
# Convert the summary Series of each split into plain Python lists
train_sentences = X_train.to_list()
val_sentences = X_val.to_list()
#test_sentences = X_test.tolist()

# Universal Sentence Encoder (Embedding Layer)

In [9]:
# Universal Sentence Encoder (USE): a model developed by Google that converts
# text (sentences or phrases) into fixed-size vectors, usually called
# embeddings. The embeddings capture the semantic meaning of the text and can
# feed NLP tasks such as text similarity, text classification and sentiment
# analysis.
import tensorflow_hub as hub

# Wrap the locally stored USE model as a Keras layer; trainable=False keeps
# its weights frozen during training.
embedding_layer = hub.KerasLayer(
    r"C:\Users\ok\Desktop\jupyter nb files\universal-sentence-encoder_4",
    name="universal_sentence_encoder",
    trainable=False,
)

# Creating Dataset

In [10]:
import tensorflow as tf
from tensorflow.data import Dataset as tfd

AUTOTUNE = tf.data.AUTOTUNE

# Pair each summary with its one-hot label, group into batches of 32 and
# prefetch with an automatically tuned buffer size.
train_dataset = (
    tfd.from_tensor_slices((X_train, train_ohe))
    .batch(32)
    .prefetch(AUTOTUNE)
)
val_dataset = (
    tfd.from_tensor_slices((X_val, val_ohe))
    .batch(32)
    .prefetch(AUTOTUNE)
)
#test_dataset = tfd.from_tensor_slices((X_test, test_ohe)).batch(32).prefetch(AUTOTUNE)

# Display both datasets to inspect their element specs
train_dataset, val_dataset, #test_dataset

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 27), dtype=tf.float64, name=None))>,
 <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 27), dtype=tf.float64, name=None))>)

# Model

In [11]:
# Number of distinct genres; used as the width of the model's output layer
classes = train_shuffled["genre"].nunique()

In [12]:
# Build the model
from tensorflow.keras import layers

inputs = layers.Input(shape=[], dtype="string")
x = embedding_layer(inputs)
x = layers.Dense(512, activation="relu")(x)
outputs = layers.Dense(classes, activation="softmax")(x)

model = tf.keras.Model(inputs, outputs)

In [13]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (labels are one-hot encoded) and accuracy as the reported metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [14]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 universal_sentence_encoder   (None, 512)              256797824 
 (KerasLayer)                                                    
                                                                 
 dense (Dense)               (None, 512)               262656    
                                                                 
 dense_1 (Dense)             (None, 27)                13851     
                                                                 
Total params: 257,074,331
Trainable params: 276,507
Non-trainable params: 256,797,824
_________________________________________________________________


In [15]:
# Train for 10 epochs. Each epoch runs on 10% of the training batches and
# validation uses 10% of the validation batches (presumably to keep each
# epoch short).
history = model.fit(
    x=train_dataset,
    epochs=10,
    steps_per_epoch=int(0.1*len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(0.1*len(val_dataset)),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
