In [1]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# download necessary NLTK packages
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:

# load the dataset
df1 = pd.read_csv("wiki_movie_plots_deduped.csv")
df1

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [3]:
df = pd.DataFrame()
df["text"] = df1["Plot"]

In [4]:
df

Unnamed: 0,text
0,"A bartender is working at a saloon, serving dr..."
1,"The moon, painted with a smiling face hangs ov..."
2,"The film, just over a minute long, is composed..."
3,Lasting just 61 seconds and consisting of two ...
4,The earliest known adaptation of the classic f...
...,...
34881,"The film begins in 1919, just after World War ..."
34882,"Two musicians, Salih and Gürkan, described the..."
34883,"Zafer, a sailor living with his mother Döndü i..."
34884,The film centres around a young woman named Am...


In [5]:
# function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(word):
    """
    Map NLTK POS tag to WordNet POS tag
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [6]:

# remove any leading or trailing whitespaces
df["text"] = df["text"].str.strip()

# remove any URL links
df["text"] = df["text"].apply(lambda x: re.sub(r'http\S+', '', x))

# remove any mentions or tags
df["text"] = df["text"].apply(lambda x: re.sub(r'@\S+', '', x))

# remove any hashtags
df["text"] = df["text"].apply(lambda x: re.sub(r'#\S+', '', x))

# remove any non-alphanumeric characters except spaces
df["text"] = df["text"].apply(lambda x: re.sub(r'[^\w\s]+', '', x))

# convert text to lowercase
df["text"] = df["text"].str.lower()

# remove any extra whitespaces
df["text"] = df["text"].apply(lambda x: re.sub(r'\s+', ' ', x))

# tokenize the text
df["tokens"] = df["text"].apply(word_tokenize)

# lemmatize the tokens
lemmatizer = WordNetLemmatizer()
df["lemmatized_tokens"] = df["tokens"].apply(lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x])

# stem the lemmatized tokens
stemmer = SnowballStemmer("english")
df["stemmed_tokens"] = df["lemmatized_tokens"].apply(lambda x: [stemmer.stem(word) for word in x])

# join the stemmed tokens back into text
df["preprocessed_text"] = df["stemmed_tokens"].apply(lambda x: ' '.join(x))

# save the preprocessed dataset
df.to_csv("preprocessed_text.csv", index=False)





In [13]:
df

0        a bartend be work at a saloon serv drink to cu...
1        the moon paint with a smile face hang over a p...
2        the film just over a minut long be compos of t...
3        last just 61 second and consist of two shot th...
4        the earli know adapt of the classic fairytal t...
                               ...                        
34881    the film begin in 1919 just after world war i ...
34882    two musician salih and gürkan describ the adve...
34883    zafer a sailor live with his mother döndü in a...
34884    the film centr around a young woman name ami t...
34885    the writer orhan şahin return to i̇stanbul aft...
Name: preprocessed_text, Length: 34886, dtype: object

In [15]:
df[0]

'a bartend be work at a saloon serv drink to custom after he fill a stereotyp irish man bucket with beer carri nation and her follow burst insid they assault the irish man pull his hat over his eye and then dump the beer over his head the group then begin wreck the bar smash the fixtur mirror and break the cash regist the bartend then spray seltzer water in nation face befor a group of policeman appear and order everybodi to leave1'

In [13]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [14]:
# load the preprocessed dataset
df = pd.read_csv("Preprocessed.csv")
df

Unnamed: 0,text,tokens,lemmatized_tokens,stemmed_tokens,preprocessed_text
0,after world war ii exserviceman joe barrett hu...,"['after', 'world', 'war', 'ii', 'exserviceman'...","['after', 'world', 'war', 'ii', 'exserviceman'...","['after', 'world', 'war', 'ii', 'exserviceman'...",after world war ii exserviceman joe barrett hu...
1,jane and alan palmer scott and kennedy are dri...,"['jane', 'and', 'alan', 'palmer', 'scott', 'an...","['jane', 'and', 'alan', 'palmer', 'scott', 'an...","['jane', 'and', 'alan', 'palmer', 'scott', 'an...",jane and alan palmer scott and kennedi be driv...
2,the film tells the story of the us treasury de...,"['the', 'film', 'tells', 'the', 'story', 'of',...","['the', 'film', 'tell', 'the', 'story', 'of', ...","['the', 'film', 'tell', 'the', 'stori', 'of', ...",the film tell the stori of the u treasuri depa...
3,the plot revolved around the tulsa oklahoma oi...,"['the', 'plot', 'revolved', 'around', 'the', '...","['the', 'plot', 'revolve', 'around', 'the', 't...","['the', 'plot', 'revolv', 'around', 'the', 'tu...",the plot revolv around the tulsa oklahoma oil ...
4,in 1949 former us army air forces officer harv...,"['in', '1949', 'former', 'us', 'army', 'air', ...","['in', '1949', 'former', 'u', 'army', 'air', '...","['in', '1949', 'former', 'u', 'armi', 'air', '...",in 1949 former u armi air forc offic harvey st...
5,frank warren is a treasury agent assigned to p...,"['frank', 'warren', 'is', 'a', 'treasury', 'ag...","['frank', 'warren', 'be', 'a', 'treasury', 'ag...","['frank', 'warren', 'be', 'a', 'treasuri', 'ag...",frank warren be a treasuri agent assign to put...
6,tony reagan scott brady a former chicago mobst...,"['tony', 'reagan', 'scott', 'brady', 'a', 'for...","['tony', 'reagan', 'scott', 'brady', 'a', 'for...","['toni', 'reagan', 'scott', 'bradi', 'a', 'for...",toni reagan scott bradi a former chicago mobst...
7,one day in contemporary mexicali a poker game ...,"['one', 'day', 'in', 'contemporary', 'mexicali...","['one', 'day', 'in', 'contemporary', 'mexicali...","['one', 'day', 'in', 'contemporari', 'mexicali...",one day in contemporari mexicali a poker game ...
8,the story draws on events that occurred as par...,"['the', 'story', 'draws', 'on', 'events', 'tha...","['the', 'story', 'draw', 'on', 'event', 'that'...","['the', 'stori', 'draw', 'on', 'event', 'that'...",the stori draw on event that occur a part of t...
9,bugs bunny is exploring dark africa a short wi...,"['bugs', 'bunny', 'is', 'exploring', 'dark', '...","['bug', 'bunny', 'be', 'explore', 'dark', 'afr...","['bug', 'bunni', 'be', 'explor', 'dark', 'afri...",bug bunni be explor dark africa a short witch ...


In [15]:

# load the BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:

# define device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# move the model to the device
model = model.to(device)

# set the model to evaluation mode
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:

# define function to encode text with BERT tokenizer and convert to tensor
# define function to encode text with BERT tokenizer and convert to tensor
def tokenize_and_convert_to_tensor(text):
    # split text sequence into smaller sub-sequences with maximum length of 512 tokens
    sub_texts = [text[i:i+512] for i in range(0, len(text), 512)]
    
    # encode each sub-sequence separately
    encoded_tensors = []
    for sub_text in sub_texts:
        encoded = tokenizer.encode_plus(sub_text, add_special_tokens=True, max_length=512, padding='max_length', return_attention_mask=True, return_tensors='pt')
        encoded_tensors.append(encoded)
    
    # concatenate the encoded tensors along the batch dimension
    input_ids = torch.cat([t['input_ids'] for t in encoded_tensors], dim=0).to(device)
    attention_mask = torch.cat([t['attention_mask'] for t in encoded_tensors], dim=0).to(device)
    
    return input_ids, attention_mask


# define function to make predictions with BERT model
def predict_with_model(text):
    input_ids, attention_mask = tokenize_and_convert_to_tensor(text)
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    return output[0][0][1].item()


In [18]:

# create new column in dataset with anomaly scores
df['anomaly_score'] = df['preprocessed_text'].apply(predict_with_model)


In [19]:

# define function to scale anomaly scores to range [0, 1]
def scale_anomaly_score(score):
    return (score - df['anomaly_score'].min()) / (df['anomaly_score'].max() - df['anomaly_score'].min())

# create new column in dataset with scaled anomaly scores
df['scaled_anomaly_score'] = df['anomaly_score'].apply(scale_anomaly_score)

# sort dataset by scaled anomaly scores in descending order
df = df.sort_values('scaled_anomaly_score', ascending=False)

# print the top 10 anomalies
print(df.head(10))


                                                 text  \
32  the mine owner of the el coronado mime is ambu...   
14  in a story told in a series of flashbacks sing...   
0   after world war ii exserviceman joe barrett hu...   
5   frank warren is a treasury agent assigned to p...   
33  childhood friends rocky barnes stevens and dan...   
13  a housewife day is confronted during her daily...   
31  the notorious attempt by swindler james reavis...   
28  in 17thcentury colonial argentina while trying...   
15  after the war granger college has a lot of stu...   
10  ann sutton gene tierney the wife of a successf...   

                                               tokens  \
32  ['the', 'mine', 'owner', 'of', 'the', 'el', 'c...   
14  ['in', 'a', 'story', 'told', 'in', 'a', 'serie...   
0   ['after', 'world', 'war', 'ii', 'exserviceman'...   
5   ['frank', 'warren', 'is', 'a', 'treasury', 'ag...   
33  ['childhood', 'friends', 'rocky', 'barnes', 's...   
13  ['a', 'housewife', 'day', 