In [10]:
import os
import sys
import time
import pandas as pd
from contextlib import contextmanager
from gensim.models import word2vec, KeyedVectors, FastText
from keras.preprocessing.text import text_to_word_sequence

sys.path.append("../input/toxic-src")
from logger import setup_logger, LOGGER


# ---------------------------------------------------------------
# Constants: base directories and the file paths derived from them
# ---------------------------------------------------------------
SAVE_DIR = "./"
DATA_DIR = "../input/jigsaw-unintended-bias-in-toxicity-classification"

# The log file is placed under SAVE_DIR.
LOGGER_PATH = os.path.join(SAVE_DIR, "log.txt")

# The three CSV paths are all resolved against DATA_DIR.
TRAIN_PATH, TEST_PATH, SUB_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)


# ---------------------------------------------------------------
# Settings
# ---------------------------------------------------------------
# NOTE(review): `w2v_params` and `save_path` are not referenced by any cell
# visible here -- presumably left over from the training notebook; confirm
# before relying on them.
w2v_params = dict(
    size=300,
    iter=5,
    seed=0,
    min_count=1,
    workers=1,
)
save_path = "exp1_w2v_selftrain_nopreprocess.model"

# Configure the shared logger to write to LOGGER_PATH.
setup_logger(out_file=LOGGER_PATH)


@contextmanager
def timer(name):
    """Log the wall-clock duration of the enclosed ``with`` block.

    Parameters
    ----------
    name : str
        Label placed between square brackets in the log message.

    Yields
    ------
    None

    Notes
    -----
    The message is emitted at INFO level through the module-level ``LOGGER``.
    The log call sits in a ``finally`` clause so the elapsed time is still
    reported when the block raises; the exception then propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Runs on both normal exit and exceptions, so a failing step is timed too.
        LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s')

    
# Load the saved Word2Vec model that every query cell below runs against.
# Alternative (disabled): FastText.load("exp7_w2v_finetune_preprocess.model")
model = word2vec.Word2Vec.load(
    "exp8_w2v_finetune_nopreprocess.model"
)

2019-06-11 13:49:54,225 - INFO - logger set up
2019-06-11 13:49:54,227 - INFO - loading Word2Vec object from exp8_w2v_finetune_nopreprocess.model
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-06-11 13:49:55,153 - INFO - loading wv recursively from exp8_w2v_finetune_nopreprocess.model.wv.* with mmap=None
2019-06-11 13:49:55,155 - INFO - loading vectors from exp8_w2v_finetune_nopreprocess.model.wv.vectors.npy with mmap=None
2019-06-11 13:49:55,333 - INFO - setting ignored attribute vectors_norm to None
2019-06-11 13:49:55,334 - INFO - loading vocabulary recursively from exp8_w2v_finetune_nopreprocess.model.vocabulary.* with mmap=None
2019-06-11 13:49:55,335 - INFO - loading trainables recursively from exp8_w2v_finetune_nopreprocess.model.trainables.* with mmap=None
2019-06-11 13:49:55,336 - INFO - loading syn1neg from exp8_w2v_finetune_nopreprocess.model.trainables.syn1neg.npy with mmap=None
2019-06-11 13:49:55,518 - INFO - setting ignored attribute cum_table to

In [11]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("trump")

  """Entry point for launching an IPython kernel.
2019-06-11 13:49:56,263 - INFO - precomputing L2-norms of word weight vectors


[('drumpf', 0.7466673851013184),
 ('djt', 0.6872212290763855),
 ("trump's", 0.6832065582275391),
 ('hillary', 0.6595946550369263),
 ('obama', 0.6556358933448792),
 ('hrc', 0.6482384204864502),
 ('trumpster', 0.6405925750732422),
 ('trumps', 0.6315802335739136),
 ('putin', 0.6263394951820374),
 ('trump’s', 0.5996044278144836)]

In [12]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("cheetolini")

  """Entry point for launching an IPython kernel.


[('drumpf', 0.5882811546325684),
 ('putrumpski', 0.5520138144493103),
 ('trump', 0.5487068295478821),
 ('trumpy', 0.5007345676422119),
 ('trumpster', 0.49820977449417114),
 ('donnie', 0.4793684780597687),
 ('djt', 0.47891244292259216),
 ('donny', 0.4629574716091156),
 ('hitlery', 0.45693138241767883),
 ('rump', 0.4471845328807831)]

In [13]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("washingtontimes")

  """Entry point for launching an IPython kernel.


[('sep', 0.45279839634895325),
 ('torontosun', 0.44154879450798035),
 ('jun', 0.4349125921726227),
 ('apr', 0.42696312069892883),
 ('feb', 0.42309853434562683),
 ('trf', 0.415429025888443),
 ('oct', 0.4141014516353607),
 ("16's", 0.4094734787940979),
 ('jul', 0.40675458312034607),
 ('juli', 0.404865026473999)]

In [14]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("obama")

  """Entry point for launching an IPython kernel.


[("obama's", 0.7226218581199646),
 ('obamas', 0.6658400893211365),
 ('obama’s', 0.6594345569610596),
 ('trump', 0.6556358933448792),
 ('obummer', 0.6504493951797485),
 ('bush', 0.6445512175559998),
 ('reagan', 0.6410301923751831),
 ('clinton', 0.6241167783737183),
 ('hillary', 0.6221505403518677),
 ('nixon', 0.6079533696174622)]

In [15]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("lgbt")

  """Entry point for launching an IPython kernel.


[('lgbtq', 0.8726180791854858),
 ('gay', 0.7596004009246826),
 ('lbgt', 0.7144193053245544),
 ('glbt', 0.6499179005622864),
 ('lgtb', 0.6181888580322266),
 ('gays', 0.597923219203949),
 ('transgender', 0.5967017412185669),
 ('lbgtq', 0.5892442464828491),
 ('homosexual', 0.5839424133300781),
 ('transgendered', 0.5393625497817993)]

In [16]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("brexit")

  """Entry point for launching an IPython kernel.


[('eu', 0.6005613803863525),
 ('referendum', 0.5485637784004211),
 ('ukip', 0.5389471054077148),
 ('corbyn', 0.5186786651611328),
 ('macron', 0.5114759206771851),
 ('snp', 0.48355725407600403),
 ('merkel', 0.4719584584236145),
 ('ceta', 0.4699776768684387),
 ('uk', 0.4677874445915222),
 ('nafta', 0.46644729375839233)]

In [17]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("coinbase")

  """Entry point for launching an IPython kernel.


[('cryptocurrencies', 0.6650749444961548),
 ('segwit', 0.6462502479553223),
 ('coinmarketcap', 0.6357772350311279),
 ('1btc', 0.6234657764434814),
 ('mtgox', 0.6228510141372681),
 ('btsx', 0.5971863865852356),
 ('lbry', 0.5885393619537354),
 ('steemit', 0.5440430045127869),
 ('bcoin', 0.5330603718757629),
 ('jnug', 0.5238490104675293)]

In [18]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("tensorflow")

  """Entry point for launching an IPython kernel.


[('twitterbot', 0.6120052337646484),
 ('repo1', 0.5949381589889526),
 ('apachesolr', 0.5929769277572632),
 ('quandl', 0.5493236184120178),
 ('win10', 0.5470220446586609),
 ('linuz', 0.5452548265457153),
 ('ipqs', 0.5449423789978027),
 ('undebateable', 0.542043924331665),
 ('ujseful', 0.541039764881134),
 ('neuroeducation', 0.5360795259475708)]

In [19]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("gdpr")

  """Entry point for launching an IPython kernel.


[('cybersecure', 0.5592607855796814),
 ('brexit’s', 0.5505238175392151),
 ("brexiteers'", 0.5497649908065796),
 ("brexiter's", 0.5493022203445435),
 ('manzama', 0.5478142499923706),
 ('indyref2', 0.5404115915298462),
 ("brexit's", 0.537091076374054),
 ('‘brexit’', 0.5340575575828552),
 ('ab60', 0.5289008617401123),
 ("'brexit", 0.5273776054382324)]

In [20]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("0bama")

  """Entry point for launching an IPython kernel.


[("0bama's", 0.8263639807701111),
 ('obozo', 0.7092679142951965),
 ('odumbo', 0.6500077247619629),
 ('obumbler', 0.6499822735786438),
 ('obummer', 0.6473488807678223),
 ('obomber', 0.6458436250686646),
 ('oblahblah', 0.6401388049125671),
 ('obamao', 0.62488853931427),
 ("obozo's", 0.6242970824241638),
 ('oblamer', 0.618243932723999)]

In [21]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("germnay")

  """Entry point for launching an IPython kernel.


[('gemany', 0.5356997847557068),
 ('germay', 0.5336135625839233),
 ('switserland', 0.48412731289863586),
 ('wurzburg', 0.4474010765552521),
 ('beligum', 0.44530755281448364),
 ('wuerzburg', 0.4449155032634735),
 ('swizerland', 0.4407042860984802),
 ('signapore', 0.43891048431396484),
 ('czechoslavakia', 0.4327426552772522),
 ('rhineland', 0.4297659695148468)]

In [22]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("compresses")

  """Entry point for launching an IPython kernel.


[("'compressing", 0.7806158661842346),
 ("compression's", 0.7714491486549377),
 ('compress', 0.7572919130325317),
 ('compressing', 0.6964548826217651),
 ('compressed', 0.5620607733726501),
 ('compressible', 0.5066367983818054),
 ("decompress'", 0.5059976577758789),
 ('condenses', 0.5032535791397095),
 ('optimizes', 0.49175384640693665),
 ('encrypts', 0.48074761033058167)]

In [23]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
# NOTE(review): the recorded neighbours all score ~0.993 and look unrelated --
# presumably "germeny" is a very rare token whose vector was barely trained;
# verify its corpus count before reading anything into this result.
model.wv.most_similar("germeny")

  """Entry point for launching an IPython kernel.


[('mtf1943', 0.9934194087982178),
 ('hillarophobia', 0.9933812022209167),
 ('50921721165d', 0.9933504462242126),
 ('“that”s', 0.9932843446731567),
 ('mpur', 0.9932708740234375),
 ('2016🌟🇺🇸', 0.993253767490387),
 ('winningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinningwinning',
  0.9932524561882019),
 ('8…', 0.9932365417480469),
 ('incocet', 0.9932276606559753),
 ('0ahukewin3n', 0.9932162761688232)]

In [24]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("qur'an")

  """Entry point for launching an IPython kernel.


[('quran', 0.7845902442932129),
 ('koran', 0.7614692449569702),
 ('hadith', 0.7100853323936462),
 ("qu'ran", 0.6738560199737549),
 ('verses', 0.6487593650817871),
 ('qur’an', 0.637866199016571),
 ('bible', 0.6348029375076294),
 ('torah', 0.6241525411605835),
 ('scriptures', 0.6109548807144165),
 ('hadiths', 0.6108025312423706)]

In [25]:
# Query the KeyedVectors (`model.wv`); `most_similar` on the model itself is deprecated.
model.wv.most_similar("deplorables")

  """Entry point for launching an IPython kernel.


[('deplorable', 0.529308557510376),
 ('supporters', 0.5126259326934814),
 ('racists', 0.5060428380966187),
 ('irredeemable', 0.5056288242340088),
 ('trumpsters', 0.4883350431919098),
 ('bigots', 0.47753816843032837),
 ('rubes', 0.4668005704879761),
 ('chumps', 0.4466981589794159),
 ('libtards', 0.4453332722187042),
 ('magaphants', 0.4394519329071045)]