In [1]:
import re
import time
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from scipy.spatial.distance import cdist

In [2]:
movies_df = pd.read_json("trek.json")

In [3]:
movies_df.head()

Unnamed: 0,DS9,TOS,TAS,TNG,VOY,ENT
episode 0,\n\n\n\n\n\nThe Deep Space Nine Transcripts - ...,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...,\n\n\n\n\n\nThe Animated Star Trek Transcripts...,\n\n\n\n\n\nThe Next Generation Transcripts - ...,\n\n\n\n\nThe Voyager Transcripts - Caretaker\...,\n\n\n\n\n\nThe Enterprise Transcripts - Broke...
episode 1,\n\n\n\n\n\nThe Deep Space Nine Transcripts - ...,\n\n\n\n\n\nThe Star Trek Transcripts - The Ma...,\n\n\n\n\n\nThe Animated Star Trek Transcripts...,\n\n\n\n\n\nThe Next Generation Transcripts - ...,\n\n\n\n\n\nThe Voyager Transcripts - Parallax...,\n\n\n\n\n\nThe Enterprise Transcripts - Fight...
episode 2,\n\n\n\n\n\nThe Deep Space Nine Transcripts - ...,\n\n\n\n\n\nThe Star Trek Transcripts - Charli...,\n\n\n\n\n\nThe Animated Star Trek Transcripts...,\n\n\n\n\n\nThe Next Generation Transcripts - ...,\n\n\n\n\n\nThe Voyager Transcripts - Time and...,\n\n\n\n\n\nThe Enterprise Transcripts - Stran...
episode 3,\n\n\n\n\n\nThe Deep Space Nine Transcripts - ...,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...,\n\n\n\n\n\nThe Animated Star Trek Transcripts...,\n\n\n\n\n\nThe Next Generation Transcripts - ...,\n\n\n\n\n\n\nThe Voyager Transcripts - Phage\...,\n\n\n\n\n\nThe Enterprise Transcripts - Unexp...
episode 4,\n\n\n\n\n\nThe Deep Space Nine Transcripts - ...,\n\n\n\n\n\nThe Star Trek Transcripts - The Na...,\n\n\n\n\n\nThe Animated Star Trek Transcripts...,\n\n\n\n\n\nThe Next Generation Transcripts - ...,\n\n\n\n\n\nThe Voyager Transcripts - The Clou...,\n\n\n\n\n\nThe Enterprise Transcripts - Terra...


In [4]:
# tdf = movies_df.apply(lambda text: "obrien" if text.values=="O'Brien" else text)
for index, row in movies_df.iterrows():
    for record in row.index:
        if not pd.isna(row[record]):
            if "O'BRIEN" in row[record]:
                row[record] = row[record].replace("O'BRIEN", "obrien")
            elif "O'Brien" in row[record]:    
                row[record] = row[record].replace("O'Brien", "obrien")
            elif "T'Pol" in row[record]:
                row[record] = row[record].replace("T'Pol", "tpol")
            elif  "T'POL" in row[record]:
                row[record] = row[record].replace("T'POL", "tpol")

In [7]:
movies_df["DS9"].str.contains("O'Brien")
# tdf["DS9"].str.contains("OBrien")

episode 0      True
episode 1      True
episode 2      True
episode 3      True
episode 4      True
               ... 
episode 171    True
episode 172    True
episode 173     NaN
episode 174     NaN
episode 175     NaN
Name: DS9, Length: 176, dtype: object

In [8]:
characters_df = pd.read_csv("characters.csv")

In [9]:
characters_df.head()

Unnamed: 0,Character,Series,Roles
0,archer,ENT,Captains
1,kirk,TOS,Captains
2,picard,TNG,Captains
3,sisko,DS9,Captains
4,janeway,VOY,Captains


Task 2

Proposed preprocessing:

1. Keep alphabets, as other characters are not generating topic but might invoke the model to generate if abundant

2. Remove stop words as our task is mainly: finding relationships, so, even though numbers might have impact on characters but the stopwords in my judgement have to be removed to avoid irrevant relationship building

3. To lowercase/uppercase as Word2Vec are not required here to be kept as mixed case

4. Lemmatization: the core form of words are needed rather than the participle forms to find the relationships in this exercise

In [10]:
def clean_text(sentence: str) -> list:
    """cleaning text by extracting alphabets and
    then splitting into word list for each sentence

    Args:
        sentence (str): sentence string

    Returns:
        list: list of words.
    """
    pattern = re.compile(r"[A-Za-z]+")
    return re.findall(pattern=pattern, string=sentence)

def to_lowercase(word_list: list) -> list:
    """case changing of all contents in each list
    of words

    Args:
        word_list (list): list of words with alphabets only texts

    Returns:
        word_list (list): list of words with lowercase transformation
    """
    return [word.lower() for word in word_list]

def remove_stopwords(list_of_words: list) -> list:
    """removing stop words from list of words by matching
    English stop words and extracting those out 
    
    Args:
        list_of_words (list): list of words with stop words

    Returns:
        list: stop word free list of words
    """
    stopword_list = stopwords.words("english")

    return [word for word in list_of_words if word not in stopword_list]

def extract_word_lemma(word_list: list) -> list:
    """lemmatization of words list from word of list
    using NLTK WordNetLemmatizer class

    Args:
        word_list (list): list of string

    Returns:
        list: lemmatized list of strings
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, tag[0].lower())
        if tag[0].lower() in ["n", "v", "a", "r", "s"]
        else word
        for word, tag in pos_tag(word_list)
    ]

In [11]:
for col in movies_df.columns:
    #cleaning text
    # checks if not nan then cleans the text otherwise empty string
    movies_df[f"{col}_cleaned"] = movies_df[col].apply(lambda text: clean_text(text) if not pd.isna(text) else "")
    #removing stopwords
    movies_df[f"{col}_cleaned"] = movies_df[f"{col}_cleaned"].apply(lambda word_list: remove_stopwords(word_list))
    #to lowercase
    movies_df[f"{col}_cleaned"] = movies_df[f"{col}_cleaned"].apply(lambda word_list: to_lowercase(word_list))
    #lemmatize words
    movies_df[f"{col}_cleaned"] = movies_df[f"{col}_cleaned"].apply(lambda word_list: extract_word_lemma(word_list))

In [12]:
print(movies_df["DS9"][0][:1000])
print("========================")
print(movies_df["DS9_cleaned"][0][:1000])
# print("========================")
# print(movies_df["TAS"][0][:1000])







The Deep Space Nine Transcripts - Emissary


Emissary
Stardate:
46379.1
Original Airdate: 3 Jan, 1993






  
On Stardate 43997, Captain Jean-Luc Picard of the Federation Starship
Enterprise was kidnapped for six days by an invading force known as the
Borg. Surgically altered, he was forced to lead an assault on Starfleet
at Wolf 359.

[Saratoga - Bridge]

LOCUTUS [on viewscreen]: Resistance is futile. You
will disarm your weapons and escort us to sector zero zero one. If you
attempt to intervene, we will destroy you. 
CAPTAIN: (a Vulcan) Red alert. Load all torpedo bays. Ready phasers.
Move us to position alpha, Ensign. 
(The space battle begins) 
OPS OFFICER: (woman) They've locked on. 
SISKO: Reroute auxiliary power. 
OPS OFFICER: Our shields are being drained. Sixty four percent. Forty
two. 
CAPTAIN: Recalibrate shield nutation. 
TACTICAL: (Bolian) Modulation is having no effect. 
OPS OFFICER: Shields have failed. 
SISKO: Full reverse. 
CAPTAIN: Maintain all Argh! 
(Everythi

In [13]:
movies_df.columns

Index(['DS9', 'TOS', 'TAS', 'TNG', 'VOY', 'ENT', 'DS9_cleaned', 'TOS_cleaned',
       'TAS_cleaned', 'TNG_cleaned', 'VOY_cleaned', 'ENT_cleaned'],
      dtype='object')

In [14]:
corpus = []

In [15]:
for i, item in enumerate(movies_df.items()):
    print(item[0])
    if i>5:
        for val in movies_df[item[0]].values.tolist():
            if len(val)!=0:
                corpus.append(val)

DS9
TOS
TAS
TNG
VOY
ENT
DS9_cleaned
TOS_cleaned
TAS_cleaned
TNG_cleaned
VOY_cleaned
ENT_cleaned


In [16]:
len(corpus)

708

In [17]:
len(corpus[2])

3392

In [18]:
len(movies_df["DS9_cleaned"].values[2])

3392

Task 3

In [19]:
w2v_model_window2 = Word2Vec(
    sentences=corpus,
    window=2,
    vector_size=300,
    workers=6
)

In [20]:
w2v_model_window10 = Word2Vec(
    sentences=corpus,
    window=10,
    vector_size=300,
    workers=6
)

Task 4

Note: I have done for only TOS and TNG

In [21]:
len(characters_df)

20

In [22]:
characters_df

Unnamed: 0,Character,Series,Roles
0,archer,ENT,Captains
1,kirk,TOS,Captains
2,picard,TNG,Captains
3,sisko,DS9,Captains
4,janeway,VOY,Captains
5,tucker,ENT,Engineers
6,scott,TOS,Engineers
7,laforge,TNG,Engineers
8,obrien,DS9,Engineers
9,torres,VOY,Engineers


In [23]:
tos_chars = characters_df[characters_df["Series"]=="TOS"]

In [24]:
tos_chars.head()

Unnamed: 0,Character,Series,Roles
1,kirk,TOS,Captains
6,scott,TOS,Engineers
11,spock,TOS,First Officers
16,scotty,TOS,Nicknames


In [25]:
print(tos_chars["Character"])

1       kirk
6      scott
11     spock
16    scotty
Name: Character, dtype: object


In [26]:
print("[Info:] For Word2Vec model with window size 2")
print("--------------------------------------------------")
print(f"Cosine similarity between {tos_chars['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(tos_chars["Character"].iloc[0], tos_chars["Character"].iloc[1]))
print(w2v_model_window2.wv.similarity(tos_chars["Character"].iloc[0], tos_chars["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(tos_chars["Character"].iloc[0], tos_chars["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {tos_chars['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(tos_chars["Character"].iloc[1], tos_chars["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(tos_chars["Character"].iloc[1], tos_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tos_chars['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(tos_chars["Character"].iloc[2], tos_chars["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 2
--------------------------------------------------
Cosine similarity between kirk vs rest of that movie characters
0.50938386
0.6782039
0.3837251
Cosine similarity between scott vs rest of that movie characters
0.5736383
0.6400283
Cosine similarity between spock vs rest of that movie characters
0.28869802


In [27]:
print("[Info:] For Word2Vec model with window size 10")
print("--------------------------------------------------")
print(f"Cosine similarity between {tos_chars['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(tos_chars["Character"].iloc[0], tos_chars["Character"].iloc[1]))
print(w2v_model_window10.wv.similarity(tos_chars["Character"].iloc[0], tos_chars["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(tos_chars["Character"].iloc[0], tos_chars["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {tos_chars['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(tos_chars["Character"].iloc[1], tos_chars["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(tos_chars["Character"].iloc[1], tos_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tos_chars['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(tos_chars["Character"].iloc[2], tos_chars["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 10
--------------------------------------------------
Cosine similarity between kirk vs rest of that movie characters
0.41160354
0.7633231
0.14840163
Cosine similarity between scott vs rest of that movie characters
0.46575943
0.6496841
Cosine similarity between spock vs rest of that movie characters
0.050443817


In [28]:
tng_chars = characters_df[characters_df["Series"]=="TNG"]

In [29]:
tng_chars

Unnamed: 0,Character,Series,Roles
2,picard,TNG,Captains
7,laforge,TNG,Engineers
12,riker,TNG,First Officers
17,beverly,TNG,Nicknames


In [30]:
print("[Info:] For Word2Vec model with window size 2")
print("--------------------------------------------------")
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tng_chars["Character"].iloc[1]))
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tng_chars["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tng_chars["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[1], tng_chars["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[1], tng_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tng_chars['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[2], tng_chars["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 2
--------------------------------------------------
Cosine similarity between picard vs rest of that movie characters
0.4303182
0.6405068
0.26957816
Cosine similarity between laforge vs rest of that movie characters
0.69371164
0.039259173
Cosine similarity between riker vs rest of that movie characters
0.19341454


In [31]:
print("[Info:] For Word2Vec model with window size 10")
print("--------------------------------------------------")
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tng_chars["Character"].iloc[1]))
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tng_chars["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tng_chars["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[1], tng_chars["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[1], tng_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tng_chars['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[2], tng_chars["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 10
--------------------------------------------------
Cosine similarity between picard vs rest of that movie characters
0.23799735
0.561458
0.10209384
Cosine similarity between laforge vs rest of that movie characters
0.5212987
-0.11195532
Cosine similarity between riker vs rest of that movie characters
0.057595998


In [32]:
print("[Info:] For Word2Vec model with window size 2")
print("--------------------------------------------------")
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[0]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[0]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[1]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[1]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[2]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[2]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[3]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs {tos_chars['Character'].iloc[1]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[1], tos_chars["Character"].iloc[1]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs {tos_chars['Character'].iloc[2]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[1], tos_chars["Character"].iloc[2]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs {tos_chars['Character'].iloc[3]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[1], tos_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tng_chars['Character'].iloc[2]} vs {tos_chars['Character'].iloc[2]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[2], tos_chars["Character"].iloc[2]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[2]} vs {tos_chars['Character'].iloc[3]}")
print(w2v_model_window2.wv.similarity(tng_chars["Character"].iloc[2], tos_chars["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 2
--------------------------------------------------
Cosine similarity between picard vs kirk
0.65013564
Cosine similarity between picard vs scott
0.49607232
Cosine similarity between picard vs spock
0.5578343
Cosine similarity between picard vs scotty
0.36224183
Cosine similarity between laforge vs scott
0.5876373
Cosine similarity between laforge vs spock
0.250213
Cosine similarity between laforge vs scotty
0.40245247
Cosine similarity between riker vs spock
0.37435502
Cosine similarity between riker vs scotty
0.44006148


In [33]:
print("[Info:] For Word2Vec model with window size 10")
print("--------------------------------------------------")
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[0]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[0]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[1]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[1]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[2]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[2]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[0]} vs {tos_chars['Character'].iloc[3]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[0], tos_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs {tos_chars['Character'].iloc[1]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[1], tos_chars["Character"].iloc[1]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs {tos_chars['Character'].iloc[2]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[1], tos_chars["Character"].iloc[2]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[1]} vs {tos_chars['Character'].iloc[3]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[1], tos_chars["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {tng_chars['Character'].iloc[2]} vs {tos_chars['Character'].iloc[2]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[2], tos_chars["Character"].iloc[2]))
print(f"Cosine similarity between {tng_chars['Character'].iloc[2]} vs {tos_chars['Character'].iloc[3]}")
print(w2v_model_window10.wv.similarity(tng_chars["Character"].iloc[2], tos_chars["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 10
--------------------------------------------------
Cosine similarity between picard vs kirk
0.13474989
Cosine similarity between picard vs scott
0.16353893
Cosine similarity between picard vs spock
0.19993867
Cosine similarity between picard vs scotty
-0.04615826
Cosine similarity between laforge vs scott
0.4760766
Cosine similarity between laforge vs spock
0.067430146
Cosine similarity between laforge vs scotty
0.3182582
Cosine similarity between riker vs spock
0.0054658335
Cosine similarity between riker vs scotty
0.11135006


Noticing the model with window 10 can differentiates better between the characters of different shows.

Task 5

Note: I have done for only TNG and TOS

In [34]:
characters_df

Unnamed: 0,Character,Series,Roles
0,archer,ENT,Captains
1,kirk,TOS,Captains
2,picard,TNG,Captains
3,sisko,DS9,Captains
4,janeway,VOY,Captains
5,tucker,ENT,Engineers
6,scott,TOS,Engineers
7,laforge,TNG,Engineers
8,obrien,DS9,Engineers
9,torres,VOY,Engineers


In [35]:
captain_roles = characters_df[characters_df["Roles"]=="Captains"]
engineers_roles = characters_df[characters_df["Roles"]=="Engineers"]
first_officers_roles = characters_df[characters_df["Roles"]=="First Officers"]
nickname_roles = characters_df[characters_df["Roles"]=="Nicknames"]

In [38]:
engineers_roles

Unnamed: 0,Character,Series,Roles
5,tucker,ENT,Engineers
6,scott,TOS,Engineers
7,laforge,TNG,Engineers
8,obrien,DS9,Engineers
9,torres,VOY,Engineers


In [39]:
print("[Info:] For Word2Vec model with window size 2")
print("--------------------------------------------------")
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], engineers_roles["Character"].iloc[1]))
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], engineers_roles["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], engineers_roles["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[1], engineers_roles["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[1], engineers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {engineers_roles['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[2], engineers_roles["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 2
--------------------------------------------------
Cosine similarity between tucker vs rest of that movie characters
0.7095746
0.7186027
0.68705827
Cosine similarity between scott vs rest of that movie characters
0.5876373
0.6107189
Cosine similarity between laforge vs rest of that movie characters
0.72461724


In [40]:
print("[Info:] For Word2Vec model with window size 10")
print("--------------------------------------------------")
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], engineers_roles["Character"].iloc[1]))
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], engineers_roles["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], engineers_roles["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[1], engineers_roles["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[1], engineers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {engineers_roles['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[2], engineers_roles["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 10
--------------------------------------------------
Cosine similarity between tucker vs rest of that movie characters
0.5461764
0.5399655
0.40896097
Cosine similarity between scott vs rest of that movie characters
0.4760766
0.39526504
Cosine similarity between laforge vs rest of that movie characters
0.4667212


In [42]:
first_officers_roles

Unnamed: 0,Character,Series,Roles
10,tpol,ENT,First Officers
11,spock,TOS,First Officers
12,riker,TNG,First Officers
13,kira,DS9,First Officers
14,chakotay,VOY,First Officers


In [43]:
print("[Info:] For Word2Vec model with window size 2")
print("--------------------------------------------------")
print(f"Cosine similarity between {first_officers_roles['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(first_officers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[1]))
print(w2v_model_window2.wv.similarity(first_officers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(first_officers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {first_officers_roles['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(first_officers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[2]))
print(w2v_model_window2.wv.similarity(first_officers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {first_officers_roles['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window2.wv.similarity(first_officers_roles["Character"].iloc[2], first_officers_roles["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 2
--------------------------------------------------
Cosine similarity between tpol vs rest of that movie characters
0.41013595
0.50796086
0.35607538
Cosine similarity between spock vs rest of that movie characters
0.37435502
0.19151393
Cosine similarity between riker vs rest of that movie characters
0.6203386


In [44]:
print("[Info:] For Word2Vec model with window size 10")
print("--------------------------------------------------")
print(f"Cosine similarity between {first_officers_roles['Character'].iloc[0]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(first_officers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[1]))
print(w2v_model_window10.wv.similarity(first_officers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(first_officers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[3]))
print("==================================================")


print(f"Cosine similarity between {first_officers_roles['Character'].iloc[1]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(first_officers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[2]))
print(w2v_model_window10.wv.similarity(first_officers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {first_officers_roles['Character'].iloc[2]} vs rest of that movie characters")
print(w2v_model_window10.wv.similarity(first_officers_roles["Character"].iloc[2], first_officers_roles["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 10
--------------------------------------------------
Cosine similarity between tpol vs rest of that movie characters
0.31427726
0.29237792
0.13786831
Cosine similarity between spock vs rest of that movie characters
0.0054658335
-0.080437295
Cosine similarity between riker vs rest of that movie characters
0.38715225


In [45]:
print("[Info:] For Word2Vec model with window size 2")
print("--------------------------------------------------")
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[0]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[0]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[1]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[1]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[2]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[2]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[3]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs {first_officers_roles['Character'].iloc[1]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[1]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs {first_officers_roles['Character'].iloc[2]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[2]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs {first_officers_roles['Character'].iloc[3]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {engineers_roles['Character'].iloc[2]} vs {first_officers_roles['Character'].iloc[2]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[2], first_officers_roles["Character"].iloc[2]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[2]} vs {first_officers_roles['Character'].iloc[3]}")
print(w2v_model_window2.wv.similarity(engineers_roles["Character"].iloc[2], first_officers_roles["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 2
--------------------------------------------------
Cosine similarity between tucker vs tpol
0.42979425
Cosine similarity between tucker vs spock
0.39110786
Cosine similarity between tucker vs riker
0.7043391
Cosine similarity between tucker vs kira
0.5137018
Cosine similarity between scott vs spock
0.5736383
Cosine similarity between scott vs riker
0.587744
Cosine similarity between scott vs kira
0.36684015
Cosine similarity between laforge vs riker
0.69371164
Cosine similarity between laforge vs kira
0.51419044


In [46]:
print("[Info:] For Word2Vec model with window size 10")
print("--------------------------------------------------")
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[0]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[0]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[1]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[1]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[2]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[2]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[0]} vs {first_officers_roles['Character'].iloc[3]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[0], first_officers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs {first_officers_roles['Character'].iloc[1]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[1]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs {first_officers_roles['Character'].iloc[2]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[2]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[1]} vs {first_officers_roles['Character'].iloc[3]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[1], first_officers_roles["Character"].iloc[3]))
print("==================================================")

print(f"Cosine similarity between {engineers_roles['Character'].iloc[2]} vs {first_officers_roles['Character'].iloc[2]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[2], first_officers_roles["Character"].iloc[2]))
print(f"Cosine similarity between {engineers_roles['Character'].iloc[2]} vs {first_officers_roles['Character'].iloc[3]}")
print(w2v_model_window10.wv.similarity(engineers_roles["Character"].iloc[2], first_officers_roles["Character"].iloc[3]))
print("==================================================")

[Info:] For Word2Vec model with window size 10
--------------------------------------------------
Cosine similarity between tucker vs tpol
0.5403468
Cosine similarity between tucker vs spock
0.18120287
Cosine similarity between tucker vs riker
0.34825748
Cosine similarity between tucker vs kira
0.11186403
Cosine similarity between scott vs spock
0.46575943
Cosine similarity between scott vs riker
0.25181574
Cosine similarity between scott vs kira
0.021478366
Cosine similarity between laforge vs riker
0.5212987
Cosine similarity between laforge vs kira
0.24677311


Noticing the model with window 10 can differentiates better between the characters of different roles.