In [7]:
import pandas as pd
import joblib
import random
import json
import sklearn

In [8]:
# defining function to read files and store as nested lists

def read_bio(filepath):
    """Parse a tab-separated BIO file into parallel nested lists.

    Every non-blank line is expected to hold ``label<TAB>token``; a blank
    line marks the end of a sentence.

    Compared with the previous implementation this version:

    * keeps the very first line of the file (it used to be skipped because
      the segment list was seeded with index 0 and every segment started at
      ``break + 1``);
    * keeps the final sentence even when the file does not end with a blank
      line;
    * strips only real line terminators instead of chopping the last
      character of every token;
    * closes the file even if parsing raises;
    * never emits empty sentences for leading or consecutive blank lines.

    Parameters
    ----------
    filepath : str or path-like
        Location of the BIO file to read.

    Returns
    -------
    tuple
        ``(tokens, labels)`` where both are lists with one inner list per
        sentence; ``tokens[i][j]`` and ``labels[i][j]`` describe the same word.

    Raises
    ------
    IndexError
        If a non-blank line contains no tab separator.
    """
    all_tokens = []
    all_labels = []
    sentence_tokens = []
    sentence_labels = []

    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.rstrip("\r\n")

            if not line.strip():
                # Sentence boundary: flush what has been collected so far.
                if sentence_tokens:
                    all_tokens.append(sentence_tokens)
                    all_labels.append(sentence_labels)
                    sentence_tokens = []
                    sentence_labels = []
                continue

            fields = line.split("\t")
            sentence_labels.append(fields[0])
            sentence_tokens.append(fields[1])

    # The last sentence may not be followed by a blank line.
    if sentence_tokens:
        all_tokens.append(sentence_tokens)
        all_labels.append(sentence_labels)

    return (all_tokens, all_labels)

In [9]:
# Raw BIO files: the standard corpus (engtrain/engtest) and the trivia corpus.
engtrain_path = "../datasets/movies/MIT_movie_NER/original_files/engtrain.bio"
engtest_path = "../datasets/movies/MIT_movie_NER/original_files/engtest.bio"
trivia_train_path = "../datasets/movies/MIT_movie_NER/original_files/trivia10k13train.bio"
trivia_test_path = "../datasets/movies/MIT_movie_NER/original_files/trivia10k13test.bio"

# read_bio returns (tokens, labels) as nested lists, one inner list per sentence.
X, y = read_bio(engtrain_path)
X_test, y_test = read_bio(engtest_path)
trivia_X, trivia_y = read_bio(trivia_train_path)
trivia_X_test, trivia_y_test = read_bio(trivia_test_path)

In [10]:
from sklearn.utils import shuffle

# Shuffle every split with a fixed seed. Sentences and tags are passed to the
# same call so both lists receive the same permutation and stay aligned.
# NOTE: each name is rebound to its shuffled version, so re-running this cell
# without re-running the loading cell permutes already-shuffled data.
X, y = shuffle(X, y, random_state=0)
X_test, y_test = shuffle(X_test, y_test, random_state=0)
trivia_X, trivia_y = shuffle(trivia_X, trivia_y, random_state=0)
trivia_X_test, trivia_y_test = shuffle(trivia_X_test, trivia_y_test, random_state=0)

# Standard corpus: the first 20% of the shuffled training data becomes the
# validation set, the remainder stays as training data.
val_cutoff = round(0.2 * len(X))
X_val, X_train = X[:val_cutoff], X[val_cutoff:]
y_val, y_train = y[:val_cutoff], y[val_cutoff:]

# Trivia corpus: same 20% / 80% split.
trivia_val_cutoff = round(0.2 * len(trivia_X))
trivia_X_val, trivia_X_train = trivia_X[:trivia_val_cutoff], trivia_X[trivia_val_cutoff:]
trivia_y_val, trivia_y_train = trivia_y[:trivia_val_cutoff], trivia_y[trivia_val_cutoff:]

In [13]:
# Each split paired with the file it is persisted to (nested-list format).
nested_list_targets = [
    (X_train, "../datasets/movies/MIT_movie_NER/nested_lists/X_train"),
    (y_train, "../datasets/movies/MIT_movie_NER/nested_lists/y_train"),
    (X_test, "../datasets/movies/MIT_movie_NER/nested_lists/X_test"),
    (y_test, "../datasets/movies/MIT_movie_NER/nested_lists/y_test"),
    (X_val, "../datasets/movies/MIT_movie_NER/nested_lists/X_val"),
    (y_val, "../datasets/movies/MIT_movie_NER/nested_lists/y_val"),
    (trivia_X_train, "../datasets/movies/MIT_movie_NER/nested_lists/trivia_X_train"),
    (trivia_y_train, "../datasets/movies/MIT_movie_NER/nested_lists/trivia_y_train"),
    (trivia_X_test, "../datasets/movies/MIT_movie_NER/nested_lists/trivia_X_test"),
    (trivia_y_test, "../datasets/movies/MIT_movie_NER/nested_lists/trivia_y_test"),
    (trivia_X_val, "../datasets/movies/MIT_movie_NER/nested_lists/trivia_X_val"),
    (trivia_y_val, "../datasets/movies/MIT_movie_NER/nested_lists/trivia_y_val"),
]

# Write the files in the order listed above, keeping what each dump returns.
nested_list_written = [
    joblib.dump(split, target) for split, target in nested_list_targets
]

# Echo the result of the final dump as the cell output.
nested_list_written[-1]

['../datasets/movies/MIT_movie_NER/nested_lists/trivia_y_val']

In [14]:
# Bundle every split into a dict holding its sentences and the matching tags.
trivia_training = {'sentences': trivia_X_train, 'tags': trivia_y_train}
trivia_validation = {'sentences': trivia_X_val, 'tags': trivia_y_val}
trivia_testing = {'sentences': trivia_X_test, 'tags': trivia_y_test}

training = {'sentences': X_train, 'tags': y_train}
validation = {'sentences': X_val, 'tags': y_val}
testing = {'sentences': X_test, 'tags': y_test}

# Persist the dict-structured splits: trivia corpus first, then the standard one.
joblib.dump(trivia_training, "../datasets/movies/MIT_movie_NER/dict_structure/trivia_training.dict")
joblib.dump(trivia_validation, "../datasets/movies/MIT_movie_NER/dict_structure/trivia_validation.dict")
joblib.dump(trivia_testing, "../datasets/movies/MIT_movie_NER/dict_structure/trivia_testing.dict")
joblib.dump(training, "../datasets/movies/MIT_movie_NER/dict_structure/training.dict")
joblib.dump(validation, "../datasets/movies/MIT_movie_NER/dict_structure/validation.dict")
joblib.dump(testing, "../datasets/movies/MIT_movie_NER/dict_structure/testing.dict")

['../datasets/movies/MIT_movie_NER/dict_structure/testing.dict']

## Checking the different possible BIO tags we have

In [17]:
# Standard NER corpus: every distinct tag seen in the training labels, sorted.
train_tag_scheme = sorted(
    {tag for sentence_tags in y_train for tag in sentence_tags}
)

In [18]:
# Display the sorted list of tags collected from y_train.
train_tag_scheme

['B-ACTOR',
 'B-CHARACTER',
 'B-DIRECTOR',
 'B-GENRE',
 'B-PLOT',
 'B-RATING',
 'B-RATINGS_AVERAGE',
 'B-REVIEW',
 'B-SONG',
 'B-TITLE',
 'B-TRAILER',
 'B-YEAR',
 'I-ACTOR',
 'I-CHARACTER',
 'I-DIRECTOR',
 'I-GENRE',
 'I-PLOT',
 'I-RATING',
 'I-RATINGS_AVERAGE',
 'I-REVIEW',
 'I-SONG',
 'I-TITLE',
 'I-TRAILER',
 'I-YEAR',
 'O']

In [19]:
# Trivia NER corpus: every distinct tag seen in the training labels, sorted.
trivia_tag_scheme = sorted(
    {tag for sentence_tags in trivia_y_train for tag in sentence_tags}
)

In [20]:
# Display the sorted list of tags collected from trivia_y_train.
trivia_tag_scheme

['B-Actor',
 'B-Award',
 'B-Character_Name',
 'B-Director',
 'B-Genre',
 'B-Opinion',
 'B-Origin',
 'B-Plot',
 'B-Quote',
 'B-Relationship',
 'B-Soundtrack',
 'B-Year',
 'I-Actor',
 'I-Award',
 'I-Character_Name',
 'I-Director',
 'I-Genre',
 'I-Opinion',
 'I-Origin',
 'I-Plot',
 'I-Quote',
 'I-Relationship',
 'I-Soundtrack',
 'I-Year',
 'O']