## Import Packages

In [695]:
import gensim
from gensim.models import Word2Vec, FastText, TfidfModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
import pandas as pd

## Dataset

In [481]:
# Toy corpus: four 33-character uppercase strings.
# NOTE(review): these look like one-letter amino-acid codes - confirm with the data source.
sequence = [
    "SNSTRLPEASQAHVRVSPGQAAPAMARPRRSRH",
    "HSTRQKSPEIHRRIDISPSTLRKHTRLAGEERV",
    "GRSPSLQPTRTSSESIYSRPGSSIPGSPGHTIY",
    "NSSSPQSSAGGKPAMSYASALRAPPKPRPPPEQ",
]

## Word2Vec Vectorization Technique

In [600]:
def word2vec(data, win=1, count=1):
    """Train a gensim ``Word2Vec`` model on ``data`` and return it.

    Parameters
    ----------
    data
        Corpus handed unchanged to the ``Word2Vec`` constructor (the notebook
        passes a list of token lists).
    win : int
        Forwarded as ``window``.
    count : int
        Forwarded as ``min_count``.

    Returns
    -------
    The model object produced by the constructor call. Training always uses
    ``workers=4``; every other setting is left at the library default.
    """
    settings = {"window": win, "min_count": count, "workers": 4}
    return Word2Vec(data, **settings)

## FastText Vectorization Technique

In [601]:
def fasttext(data, vector=4, win=1, count=1):
    """Train a gensim ``FastText`` model on ``data`` for 10 epochs and return it.

    Parameters
    ----------
    data
        Corpus forwarded as ``sentences`` (the notebook passes a list of token
        lists).
    vector : int
        NOTE(review): accepted but never forwarded to ``FastText``, so the
        embedding size is whatever the library defaults to. Wiring it to
        ``vector_size`` would change the embeddings for every existing caller;
        decide deliberately before doing so.
    win : int
        Forwarded as ``window``.
    count : int
        Forwarded as ``min_count``.
    """
    settings = dict(sentences=data, window=win, min_count=count, epochs=10)
    return FastText(**settings)

## Doc2Vec Vectorization Technique

In [602]:
def doc2vec(data, vector=4, win=1, count=1, workers=1):
    """Train a gensim ``Doc2Vec`` model on ``data`` and return it.

    Each element of ``data`` becomes a ``TaggedDocument`` tagged with its
    position in ``data``.

    Parameters
    ----------
    data
        Iterable of token lists, one per document.
    vector : int
        NOTE(review): accepted but never forwarded to ``Doc2Vec``, so the
        embedding size is whatever the library defaults to. Wiring it to
        ``vector_size`` would change the embeddings for every existing caller;
        decide deliberately before doing so.
    win : int
        Forwarded as ``window``.
    count : int
        Forwarded as ``min_count``. Previously this value was passed as
        ``workers`` while ``min_count`` was hard-coded to 1, so the frequency
        cut-off was silently ignored.
    workers : int
        Number of training threads. Defaults to 1, which is what the old code
        effectively used for the default ``count=1``.
    """
    documents = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(data)]
    return Doc2Vec(documents, window=win, min_count=count, workers=workers)

## Vectorizing the Model Data

In [547]:
def vectorization(model, column_list=None):
    """Build a token-by-token similarity table from a trained model.

    For every token in ``column_list`` the result of
    ``model.wv.most_similar(token)`` becomes one row: each returned neighbour
    fills the column of the same name with its score, and the token's own
    column is set to 1. Pairs that ``most_similar`` does not return stay NaN.

    Parameters
    ----------
    model
        Any object exposing ``wv.most_similar(token)`` that returns
        ``(neighbour, score)`` pairs.
    column_list : iterable of str, optional
        Tokens to tabulate. Duplicates are ignored. ``None`` or an empty
        iterable yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows are the tokens in sorted order. Columns follow the order of
        ``column_list``, followed by any neighbour that is not itself listed.
    """
    # Use None instead of a mutable default; drop duplicates but keep caller order.
    tokens = list(dict.fromkeys(column_list if column_list is not None else []))
    if not tokens:
        return pd.DataFrame(columns=tokens)

    row_labels = sorted(tokens)
    rows = []
    for token in row_labels:
        # Sorting by neighbour name keeps the order of unlisted columns stable.
        scores = dict(sorted(model.wv.most_similar(token)))
        scores[token] = 1.0  # a token is perfectly similar to itself
        rows.append(scores)

    # Build the frame once rather than concatenating row by row, which is
    # quadratic and leaves the columns with object dtype.
    table = pd.DataFrame(rows, index=row_labels)
    listed = set(tokens)
    extras = [name for name in table.columns if name not in listed]
    return table.reindex(columns=tokens + extras)

## n-Letter Combinations

In [548]:
# words.append([char for word in sequence for char in word])

def combination_sequence(n=1, sequences=None):
    """Split each sequence into its overlapping length-``n`` substrings.

    Parameters
    ----------
    n : int
        Substring length; must be at least 1.
    sequences : iterable of str, optional
        Strings to split. When omitted, the notebook-level ``sequence`` list is
        used, which keeps existing ``combination_sequence(n)`` calls working.

    Returns
    -------
    (words, columns)
        ``words`` holds one list per input string with its substrings in
        left-to-right order (empty when the string is shorter than ``n``).
        ``columns`` is the sorted list of distinct substrings over all inputs.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. The previous code silently produced lists
        of empty strings in that case.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    if sequences is None:
        sequences = sequence  # fall back to the notebook-level dataset

    # A window starting at `start` is complete only while start + n <= len(text).
    words = [
        [text[start:start + n] for start in range(len(text) - n + 1)]
        for text in sequences
    ]
    # Derive the vocabulary from the windows already computed instead of slicing twice.
    columns = sorted({gram for grams in words for gram in grams})
    return words, columns

# Main Function

In [555]:
# Split every sequence into single letters (n=1); `column_list` receives the
# sorted distinct tokens returned alongside the per-sequence token lists.
words, column_list = combination_sequence(n=1)

### Word2Vec Implementation

In [603]:
# Train Word2Vec on the tokenised sequences (window of 3, min_count of 1),
# then tabulate each token's nearest-neighbour similarities.
model = word2vec(data=words, win=3, count=1)
df = vectorization(model=model, column_list=column_list)
df.head()

Unnamed: 0,A,D,E,G,H,I,K,L,M,N,P,Q,R,S,T,V,Y
A,1.0,,0.052147,,0.082688,,0.050407,0.017615,0.17931,,0.076533,,-0.004085,,0.140252,-0.005019,0.009754
D,,1.0,0.170207,0.11165,,0.0114697,,-0.006987,0.050125,0.0656836,0.029257,0.1751,,0.222894,0.052951,,
E,0.0521469,0.170207,1.0,0.0428613,0.022216,0.0140188,,0.060559,,,,,0.160274,0.0302015,0.031749,,0.018562
G,,0.11165,0.042861,1.0,,,0.001165,,,0.0162443,0.012662,,0.177875,-0.0179841,0.145437,0.153015,0.255601
H,0.0826877,,0.022216,,1.0,0.054625,0.048631,0.03764,,-0.0142012,0.046084,,0.211573,0.0391953,0.072455,,


In [604]:
# Show the neighbours the model ranks closest to 'S', then the single
# similarity score for the pair ('S', 'Y').
print(model.wv.most_similar('S'))
print(
    "\n\nSimilarities between {} and {}: {}".format(
        'S', 'Y', model.wv.similarity(w1='S', w2='Y')
    )
)

[('I', 0.22430738806724548), ('D', 0.2228943109512329), ('V', 0.09849028289318085), ('Q', 0.09352164715528488), ('K', 0.08985263854265213), ('L', 0.07079657912254333), ('N', 0.05472324788570404), ('H', 0.039195336401462555), ('E', 0.030201522633433342), ('P', 0.003795277327299118)]


Similarities between S and Y: -0.03352360427379608


### FastText Implementation

In [605]:
# Train FastText on the tokenised sequences (window of 3, min_count of 1),
# then tabulate each token's nearest-neighbour similarities.
model = fasttext(data=words, win=3, count=1)
df = vectorization(model=model, column_list=column_list)
df.head()

Unnamed: 0,A,D,E,G,H,I,K,L,M,N,P,Q,R,S,T,V,Y
A,1.0,,0.027781,,0.089372,0.050571,0.017507,0.024187,-0.011526,,0.255207,0.080184,0.027909,,0.136553,,
D,,1.0,0.071162,0.250804,0.080937,,0.052465,,,0.0882165,,0.172613,0.06918,0.0305518,0.047187,0.0335818,
E,0.0277813,0.0711623,1.0,0.129586,0.054905,-0.019113,,,0.022034,,,0.125048,-0.000396,,-0.000761,,0.00213117
G,,0.250804,0.129586,1.0,0.021499,0.149231,0.025469,,,,0.013182,,0.158943,,0.223954,0.092773,0.116858
H,0.0893716,0.0809367,0.054905,,1.0,0.028436,0.148876,0.246249,0.038532,,,0.083931,0.159812,,0.101453,,


In [606]:
# Show the neighbours the model ranks closest to 'S', then the single
# similarity score for the pair ('S', 'Y').
print(model.wv.most_similar('S'))
print(
    "\n\nSimilarities between {} and {}: {}".format(
        'S', 'Y', model.wv.similarity(w1='S', w2='Y')
    )
)

[('I', 0.14272816479206085), ('L', 0.12876690924167633), ('V', 0.11846521496772766), ('T', 0.04452924802899361), ('Y', 0.042162515223026276), ('D', 0.03055178001523018), ('K', 0.018864385783672333), ('G', 0.0036773981992155313), ('N', -0.0011194890830665827), ('M', -0.006847509648650885)]


Similarities between S and Y: 0.042162518948316574


### Doc2Vec Implementation

In [607]:
# Train Doc2Vec on the tokenised sequences with a context window of 3,
# then tabulate each token's nearest-neighbour similarities.
model = doc2vec(data=words, win=3, count=1)
df = vectorization(model=model, column_list=column_list)
df.head()

Unnamed: 0,A,D,E,G,H,I,K,L,M,N,P,Q,R,S,T,V,Y
A,1.0,,0.053782,,0.091795,,0.055824,0.020446,0.178463,,0.089823,0.009096,0.01619,,0.146735,,0.023635
D,,1.0,0.163616,0.108655,,0.00707399,,-0.012058,0.047962,0.0664287,0.024621,0.175141,,0.21766,0.050345,,
E,0.0537824,0.163616,1.0,0.0376254,0.023407,,,0.054555,,,,0.020108,0.164695,0.0351521,0.02937,,0.021597
G,,0.108655,0.037625,1.0,,,-0.002768,,,0.0189631,0.00715,,0.170589,-0.0209639,0.141026,0.152621,0.254778
H,0.0917952,,0.023407,,1.0,0.0553903,0.051316,0.032318,,-0.00423818,0.053891,,0.222756,0.0569123,0.074497,,


In [608]:
# Show the neighbours the model ranks closest to 'S', then the single
# similarity score for the pair ('S', 'Y').
print(model.wv.most_similar('S'))
print(
    "\n\nSimilarities between {} and {}: {}".format(
        'S', 'Y', model.wv.similarity(w1='S', w2='Y')
    )
)

[('I', 0.2247517704963684), ('D', 0.21765977144241333), ('Q', 0.12032710760831833), ('V', 0.11342903226613998), ('K', 0.09980012476444244), ('N', 0.06943774968385696), ('L', 0.06745608150959015), ('H', 0.05691228806972504), ('E', 0.03515208885073662), ('P', 0.020192505791783333)]


Similarities between S and Y: -0.018541643396019936


### Bag of Words Implementation

In [699]:
# Start from an empty vocabulary and let doc2bow grow it (allow_update=True)
# while turning each tokenised sequence into (token id, count) pairs.
dictionary = Dictionary()
BoW_corpus = [dictionary.doc2bow(tokens, allow_update=True) for tokens in words]

# Map the integer ids back to their tokens so the counts are readable.
id_words = [
    [(dictionary[token_id], freq) for token_id, freq in bow]
    for bow in BoW_corpus
]
print(id_words)

[[('A', 6), ('E', 1), ('G', 1), ('H', 2), ('L', 1), ('M', 1), ('N', 1), ('P', 4), ('Q', 2), ('R', 6), ('S', 5), ('T', 1), ('V', 2)], [('A', 1), ('E', 3), ('G', 1), ('H', 3), ('L', 2), ('P', 2), ('Q', 1), ('R', 6), ('S', 4), ('T', 3), ('V', 1), ('D', 1), ('I', 3), ('K', 2)], [('E', 1), ('G', 4), ('H', 1), ('L', 1), ('P', 5), ('Q', 1), ('R', 3), ('S', 9), ('T', 3), ('I', 3), ('Y', 2)], [('A', 5), ('E', 1), ('G', 2), ('L', 1), ('M', 1), ('N', 1), ('P', 8), ('Q', 2), ('R', 2), ('S', 7), ('K', 2), ('Y', 1)]]


### TFIDF

In [700]:
model = TfidfModel(BoW_corpus)  # fit the TF-IDF model on the bag-of-words corpus
vector = model[BoW_corpus]  # apply the model to the whole corpus (every document, not just the first)

# Map ids back to tokens; the second element of each pair is now a TF-IDF
# weight rather than a raw count.
id_words = [
    [(dictionary[token_id], weight) for token_id, weight in doc_weights]
    for doc_weights in vector
]
print(id_words)

[[('A', 0.6890098285715666), ('H', 0.22966994285718884), ('M', 0.27668577328103633), ('N', 0.27668577328103633), ('T', 0.11483497142859442), ('V', 0.5533715465620727)], [('A', 0.08998587240483427), ('H', 0.2699576172145028), ('T', 0.2699576172145028), ('V', 0.21681383624658238), ('D', 0.43362767249316475), ('I', 0.6504415087397472), ('K', 0.43362767249316475)], [('H', 0.10816721615936337), ('T', 0.3245016484780901), ('I', 0.7818610343449305), ('Y', 0.5212406895632871)], [('A', 0.6171562174941082), ('M', 0.2973978103503706), ('N', 0.2973978103503706), ('K', 0.5947956207007412), ('Y', 0.2973978103503706)]]
