In [1]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tensorflow
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [None]:
# Training set (54200 entries); fields on each line are separated by ":::"
df = pd.read_csv(
    '/content/drive/MyDrive/data/train_data.txt',
    sep=":::",
    names=["Title", "Genre", "Description"],
    engine="python",
)
# Preview the first 20 rows
df.head(20)

In [None]:
# Reduce the size of the training dataset.
# Each subset is an explicit copy, so assigning to one of its columns later
# modifies that subset only and does not raise pandas' SettingWithCopyWarning.
df_1 = df.iloc[0:50000, :].copy()
df_2 = df.iloc[0:30000, :].copy()
df_3 = df.iloc[0:10000, :].copy()

In [None]:
# Number of distinct genres present in each training subset
for subset in (df_1, df_2, df_3):
  print(len(subset.Genre.unique()))

27
27
27


In [None]:
# Pre-processing
# Symbols that get replaced by a space. The pattern is a raw string so the
# backslashes reach the regex engine without relying on invalid string escapes.
replace_symbols = re.compile(r'[/(){}\[\]\|@,\_\-;]')
# The following regex matches anything that is not a digit, a lowercase letter or a space
remove_symbols = re.compile('[^0-9a-z ]')
# English stopwords as a set for fast membership tests
stopword = set(stopwords.words('english'))

def clean_text(input):
  """Normalise a description string before tokenisation.

  The text is lowercased, every character matched by `replace_symbols` becomes a
  space, every character matched by `remove_symbols` is dropped, and words found
  in `stopword` are removed.

  Args:
    input: the text to clean. The name shadows the builtin `input`; it is kept
      unchanged so existing callers keep working.

  Returns:
    The cleaned text with words separated by single spaces, or '' when `input`
    is not a string (e.g. NaN from a missing field).
  """
  # Non-string values have no .lower(); treat them as empty text instead of crashing
  if not isinstance(input, str):
    return ''

  # lowercase text
  text = input.lower()
  # replace certain symbols by space in text
  text = replace_symbols.sub(' ', text)
  # remove everything that is not a digit, a lowercase letter or a space
  text = remove_symbols.sub('', text)
  # remove stopwords (split/join also collapses repeated whitespace)
  return ' '.join(word for word in text.split() if word not in stopword)


def _clean_descriptions(frame):
  """Return a new frame whose 'Description' column is cleaned and stripped of digits.

  `frame` itself is left unmodified; `DataFrame.assign` builds the new frame.
  """
  cleaned = frame['Description'].apply(clean_text)
  # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
  # string by default, which would leave every digit in place.
  cleaned = cleaned.str.replace(r'\d+', '', regex=True)
  return frame.assign(Description=cleaned)


df_1 = _clean_descriptions(df_1)
df_2 = _clean_descriptions(df_2)
df_3 = _clean_descriptions(df_3)


In [None]:
# NOTE(review): this cell only evaluates the constant csv.QUOTE_NONE (displayed as 3);
# no other visible cell uses `csv` — looks like a leftover check, confirm before removing.
import csv
csv.QUOTE_NONE

3

In [2]:
from tensorflow import keras
import numpy as np

In [3]:
# Hyper-parameters shared by the tokenizer and the model
max_length = 500       # `maxlen` used when padding the token sequences
embedding_dim = 128    # output dimension of the Embedding layer
max_words = 50_000     # tokenizer `num_words` and Embedding input dimension

In [4]:
# reading the test data (the cell defining clean_text must have been run first)
df_test = pd.read_csv('/content/drive/MyDrive/data/test_data.txt',sep=":::", names = ["Title", "Genre", "Description"],engine="python")
# Clean the descriptions, then strip digits. regex=True is explicit because
# pandas >= 2.0 would otherwise treat the pattern as a literal string.
df_test['Description'] = (
    df_test['Description'].apply(clean_text).str.replace(r'\d+', '', regex=True)
)

# extracting the ground values (genre labels with all spaces removed)
ground_values = [genre.replace(" ", "") for genre in df_test['Genre'].values.tolist()]

NameError: ignored

In [5]:
# Tokenize the description
def preprocessing(df_test, df_train):
  """Convert descriptions to padded integer sequences and one-hot encode the genres.

  The tokenizer is fitted on the training descriptions only and then reused to
  encode the test descriptions.

  Args:
    df_test: frame with a 'Description' column.
    df_train: frame with 'Description' and 'Genre' columns.

  Returns:
    A tuple (X, Y, X_test): the padded training sequences, the one-hot encoded
    training genres (float32, one column per genre found in df_train) and the
    padded test sequences. Sequences are padded/truncated to `max_length`.
  """
  # Same filter characters as before; the backslash is spelled '\\' so the
  # literal no longer relies on the invalid escape sequence '\]'.
  tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
  train_texts = df_train['Description'].values
  tokenizer.fit_on_texts(train_texts)
  X = tokenizer.texts_to_sequences(train_texts)
  X = keras.utils.pad_sequences(X, maxlen=max_length)

  # `columns=` only applies to DataFrame input, so it is dropped for this Series.
  # An explicit float32 dtype keeps the targets numeric regardless of the
  # default dummy dtype of the installed pandas version.
  Y = pd.get_dummies(df_train['Genre'], dtype='float32')

  X_test = tokenizer.texts_to_sequences(df_test['Description'].values)
  X_test = keras.utils.pad_sequences(X_test, maxlen=max_length)

  return X, Y, X_test

In [None]:
!pip install keras
!pip install scikit-metrics

from keras import models,layers
from keras.models import Sequential
from keras.layers import LSTM, Embedding, SpatialDropout1D
from keras.layers.core import Dense, Activation, Dropout

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [13]:
def classify(X,Y,X_test,epochs,ground_values):
  """Train an LSTM genre classifier and score it on the test set.

  Args:
    X: padded training sequences; X.shape[1] is used as the sequence length.
    Y: one-hot encoded training targets, one column per genre. When Y has string
      column names, they (with spaces removed) are used as the class labels.
    X_test: padded test sequences.
    epochs: number of training epochs.
    ground_values: true genre label of every test sample.

  Returns:
    The macro-averaged F1 score of the predictions against `ground_values`.

  Raises:
    ValueError: if the number of class labels differs from the number of target
      columns in Y.
  """
  # Fallback label order, used when Y carries no string column names
  default_labels = ['action', 'adult', 'adventure', 'animation', 'biography',
        'comedy', 'crime', 'documentary', 'drama', 'family',
        'fantasy', 'game-show', 'history', 'horror', 'music',
        'musical', 'mystery', 'news', 'reality-tv', 'romance',
        'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war',
        'western']

  # Prefer the labels that actually index the target columns, so the mapping
  # stays correct even when a training subset does not contain every genre.
  columns = getattr(Y, 'columns', None)
  if columns is not None and all(isinstance(column, str) for column in columns):
    labels = [column.replace(" ", "") for column in columns]
  else:
    labels = default_labels

  num_classes = Y.shape[1]
  if len(labels) != num_classes:
    raise ValueError(
        "Y has %d target columns but %d class labels are known" % (num_classes, len(labels)))

  model = Sequential()
  model.add(Embedding(max_words, embedding_dim, input_length=X.shape[1]))
  model.add(SpatialDropout1D(0.2))
  model.add(LSTM(64, dropout=0.2, recurrent_dropout=0))
  # One output unit per target column instead of a hard-coded 27
  model.add(Dense(num_classes, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  # summary() prints the table itself and returns None, so it is not wrapped in print()
  model.summary()
  model.fit(X, Y, epochs=epochs, validation_split=0.1)

  # Each prediction row holds class probabilities; argmax picks the class index
  predictions = model.predict(X_test)
  results = [labels[index] for index in np.argmax(predictions, axis=1)]

  return f1_score(ground_values, results, average='macro')


In [8]:
def output(df_train, df_test,epochs,ground_values):
  """Run the full pipeline for one training frame and return the F1 score.

  The frames are passed to `preprocessing` (test frame first), and the resulting
  training sequences, targets and test sequences are handed to `classify`
  together with `epochs` and `ground_values`.
  """
  X, Y, X_test = preprocessing(df_test, df_train)
  return classify(X, Y, X_test, epochs, ground_values)

In [None]:
# F1 scores (with pre-processing) when dataset size is set to 50 000
# Train and evaluate once per epoch budget, printing each F1 score as it is produced
for n_epochs in (1, 5, 10):
  print(output(df_1, df_test, n_epochs, ground_values))

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 500, 128)          6400000   
                                                                 
 spatial_dropout1d_11 (Spati  (None, 500, 128)         0         
 alDropout1D)                                                    
                                                                 
 lstm_11 (LSTM)              (None, 64)                49408     
                                                                 
 dense_11 (Dense)            (None, 27)                1755      
                                                                 
Total params: 6,451,163
Trainable params: 6,451,163
Non-trainable params: 0
_________________________________________________________________
None
exited loop
0.11453011628359272
Model: "sequential_12"
_________________________________________________

In [None]:
# F1 scores (with pre-processing) when dataset size is set to 30 000
# Train and evaluate once per epoch budget, printing each F1 score as it is produced
for n_epochs in (1, 5, 10):
  print(output(df_2, df_test, n_epochs, ground_values))

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 500, 128)          6400000   
                                                                 
 spatial_dropout1d_14 (Spati  (None, 500, 128)         0         
 alDropout1D)                                                    
                                                                 
 lstm_14 (LSTM)              (None, 64)                49408     
                                                                 
 dense_14 (Dense)            (None, 27)                1755      
                                                                 
Total params: 6,451,163
Trainable params: 6,451,163
Non-trainable params: 0
_________________________________________________________________
None
exited loop
0.07417895934032129
Model: "sequential_15"
_________________________________________________

In [None]:
# F1 scores (with pre-processing) when dataset size is set to 10 000
# Train and evaluate once per epoch budget, printing each F1 score as it is produced
for n_epochs in (1, 5, 10):
  print(output(df_3, df_test, n_epochs, ground_values))

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 500, 128)          6400000   
                                                                 
 spatial_dropout1d_17 (Spati  (None, 500, 128)         0         
 alDropout1D)                                                    
                                                                 
 lstm_17 (LSTM)              (None, 64)                49408     
                                                                 
 dense_17 (Dense)            (None, 27)                1755      
                                                                 
Total params: 6,451,163
Trainable params: 6,451,163
Non-trainable params: 0
_________________________________________________________________
None
exited loop
0.041195502818745365
Model: "sequential_18"
________________________________________________

In [9]:
# No pre-processing is done here.
# Reload the training data and rebuild the three subsets from the raw file
df = pd.read_csv(
    '/content/drive/MyDrive/data/train_data.txt',
    sep=":::",
    names=["Title", "Genre", "Description"],
    engine="python",
)
df_1 = df.iloc[:50000]
df_2 = df.iloc[:30000]
df_3 = df.iloc[:10000]

# Reload the test data
df_test = pd.read_csv(
    '/content/drive/MyDrive/data/test_data.txt',
    sep=":::",
    names=["Title", "Genre", "Description"],
    engine="python",
)
# extracting the ground values (genre labels with all spaces removed)
ground_values = [genre.replace(" ", "") for genre in df_test['Genre'].values.tolist()]

In [10]:
# F1 scores (without pre-processing) when dataset size is set to 50 000
# Train and evaluate once per epoch budget, printing each F1 score as it is produced
for n_epochs in (1, 5, 10):
  print(output(df_1, df_test, n_epochs, ground_values))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 128)          6400000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 500, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 27)                1755      
                                                                 
Total params: 6,451,163
Trainable params: 6,451,163
Non-trainable params: 0
_________________________________________________________________
None
exited loop
0.059482701017214014
Model: "sequential_1"
____________________________________________________

In [11]:
# F1 scores (without pre-processing) when dataset size is set to 30 000
# Train and evaluate once per epoch budget, printing each F1 score as it is produced
for n_epochs in (1, 5, 10):
  print(output(df_2, df_test, n_epochs, ground_values))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 128)          6400000   
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 500, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 27)                1755      
                                                                 
Total params: 6,451,163
Trainable params: 6,451,163
Non-trainable params: 0
_________________________________________________________________
None
exited loop
0.06012759266446902
Model: "sequential_4"
___________________________________________________

In [15]:
# F1 scores (without pre-processing) when dataset size is set to 10 000
# Train and evaluate once per epoch budget, printing each F1 score as it is produced
for n_epochs in (1, 5, 10):
  print(output(df_3, df_test, n_epochs, ground_values))

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 500, 128)          6400000   
                                                                 
 spatial_dropout1d_12 (Spati  (None, 500, 128)         0         
 alDropout1D)                                                    
                                                                 
 lstm_12 (LSTM)              (None, 64)                49408     
                                                                 
 dense_12 (Dense)            (None, 27)                1755      
                                                                 
Total params: 6,451,163
Trainable params: 6,451,163
Non-trainable params: 0
_________________________________________________________________
None
exited loop
0.03731472389580585
Model: "sequential_13"
_________________________________________________