In [1]:
import numpy as np
import pandas as pd
import matplotlib as plot
from gensim.models import KeyedVectors


# Directory prefix (relative to the working directory) that is prepended to
# every model file name below.
MODEL_PATH = "models/"

# Load all five embedding models up front so that later cells can query any
# of them; a progress line is printed before each load starts.
# NOTE(review): the file names are spelled "golve" (not "glove"). The saved
# outputs later in the notebook use these models, so the spelling presumably
# matches the files on disk -- confirm before "correcting" it.
print("loading glove42")
glove42 = KeyedVectors.load('%sgolve.42B.300d.model'%MODEL_PATH)
print("loading twitter25")
twitter25 = KeyedVectors.load('%sgolve.twitter.27B.25d.model'%MODEL_PATH)
print("loading twitter50")
twitter50 = KeyedVectors.load('%sgolve.twitter.27B.50d.model'%MODEL_PATH)
print("loading twitter100")
twitter100 = KeyedVectors.load('%sgolve.twitter.27B.100d.model'%MODEL_PATH)
print("loading twitter200")
twitter200 = KeyedVectors.load('%sgolve.twitter.27B.200d.model'%MODEL_PATH)

loading glove42
loading twitter25
loading twitter50
loading twitter100
loading twitter200


In [2]:
def print_closest_words(vectors, targets, topn=10):
    """Print the nearest neighbours of each target word with their scores.

    For every word in ``targets`` a header line ``'<word>':`` is printed,
    followed by one tab-indented ``<neighbour> : <score>`` line per neighbour
    (score shown to three decimal places). If the lookup raises ``KeyError``
    the single line ``Not in vocabulary`` is printed instead and processing
    continues with the next word.

    Args:
        vectors: Object exposing ``most_similar(word, topn=...)`` that returns
            a sequence of ``(word, score)`` pairs (e.g. a gensim
            ``KeyedVectors``).
        targets: Iterable of query words.
        topn: Number of neighbours to request for each word. Defaults to 10.

    Returns:
        None. All output is written to stdout.
    """
    for target in targets:
        print("'%s':" % target)
        try:
            neighbours = vectors.most_similar(target, topn=topn)
        except KeyError:
            # Out-of-vocabulary word: report it and carry on with the rest.
            print("Not in vocabulary")
            continue
        for neighbour in neighbours:
            print("\t%s : %.3f" % (neighbour[0], neighbour[1]))

# Default query words for the nearest-neighbour cells below (the name is
# reassigned in a later cell).
# NOTE(review): the saved outputs further down list 'frick' where this list
# has "friggin", so they appear to come from an earlier version of this list.
targets = ["fuck", "damn", "god", "friggin", "darn", "gosh"]

In [3]:
def print_closest_words_latex(vectors, targets, topn=10):
    """Print a two-column LaTeX table of each target word and its neighbours.

    The table has one row per target: the word in bold, then a
    comma-separated list of its nearest neighbours (scores are not shown).
    Words for which the lookup raises ``KeyError`` get an italic
    ``Not in vocabulary`` cell, so the emitted table is always complete.
    Characters that LaTeX treats as markup are escaped in every word.

    Args:
        vectors: Object exposing ``most_similar(word, topn=...)`` that returns
            a sequence of ``(word, score)`` pairs (e.g. a gensim
            ``KeyedVectors``).
        targets: Iterable of query words.
        topn: Number of neighbours to request for each word. Defaults to 10.

    Returns:
        None. The LaTeX source is written to stdout.
    """
    # Vocabulary tokens are raw text; left unescaped, a token containing '&'
    # would add a column and one containing '#', '%' or '_' would not compile.
    specials = str.maketrans({
        "\\": "\\textbackslash{}",
        "&": "\\&",
        "%": "\\%",
        "$": "\\$",
        "#": "\\#",
        "_": "\\_",
        "{": "\\{",
        "}": "\\}",
        "~": "\\textasciitilde{}",
        "^": "\\textasciicircum{}",
    })

    def tex(text):
        return str(text).translate(specials)

    print("\\begin{center}")
    print("\\begin{tabular}{ l l } ")
    print("\\hline")
    print("Word & Neighbors  \\\\")
    print("\\hline")
    for target in targets:
        try:
            neighbours = vectors.most_similar(target, topn=topn)
        except KeyError:
            # Keep the table well-formed instead of aborting half-way through.
            cell = "\\textit{Not in vocabulary}"
        else:
            cell = ", ".join(tex(neighbour[0]) for neighbour in neighbours)
        print("\\textbf{%s} & %s \\\\" % (tex(target), cell))

    print("\\hline")
    print("\\end{tabular}")
    print("\\end{center}")

In [4]:
def print_closest_words_score_latex(vectors, targets, topn=10, max_columns=5):
    """Print LaTeX tables with one column per target word.

    Each column is headed by the target word in bold and lists its nearest
    neighbours as ``<neighbour> : <score>`` (score to two decimal places).
    Targets are split into consecutive groups of at most ``max_columns`` and
    one table is printed per group. Nothing is printed for empty ``targets``.

    Words for which the lookup raises ``KeyError`` get a column whose first
    cell reads ``Not in vocabulary`` in italics. Columns shorter than the
    longest one in their table are padded with empty cells so that every row
    has the same number of cells. Characters that LaTeX treats as markup are
    escaped in every word.

    Args:
        vectors: Object exposing ``most_similar(word, topn=...)`` that returns
            a sequence of ``(word, score)`` pairs (e.g. a gensim
            ``KeyedVectors``).
        targets: Iterable of query words.
        topn: Number of neighbours to request for each word. Defaults to 10.
        max_columns: Maximum number of target columns per table. Defaults to
            5, the width the original author found to fit on a page.

    Returns:
        None. The LaTeX source is written to stdout.

    Raises:
        ValueError: If ``max_columns`` is smaller than 1.
    """
    if max_columns < 1:
        raise ValueError("max_columns must be at least 1")

    # Vocabulary tokens are raw text; left unescaped, a token containing '&'
    # would add a column and one containing '#', '%' or '_' would not compile.
    specials = str.maketrans({
        "\\": "\\textbackslash{}",
        "&": "\\&",
        "%": "\\%",
        "$": "\\$",
        "#": "\\#",
        "_": "\\_",
        "{": "\\{",
        "}": "\\}",
        "~": "\\textasciitilde{}",
        "^": "\\textasciicircum{}",
    })

    def tex(text):
        return str(text).translate(specials)

    targets = list(targets)
    for start in range(0, len(targets), max_columns):
        group = targets[start:start + max_columns]

        # Collect every column before printing so that a failed lookup can
        # never leave a half-written table on stdout.
        columns = []
        for target in group:
            try:
                neighbours = vectors.most_similar(target, topn=topn)
            except KeyError:
                columns.append(["\\textit{Not in vocabulary}"])
                continue
            columns.append(
                ["%s : %.2f" % (tex(word), score) for word, score in neighbours]
            )
        row_count = max(len(column) for column in columns)

        print("\\begin{center}")
        print("\\begin{tabular}{ |%s }" % (" c |" * len(group)))
        print("\\hline")
        print("%s  \\\\" % " & ".join("\\textbf{%s}" % tex(t) for t in group))
        print("\\hline")
        for row in range(row_count):
            # Pad short columns so cells stay under their own header.
            cells = [col[row] if row < len(col) else "" for col in columns]
            print("%s  \\\\" % " & ".join(cells))

        print("\\hline")
        print("\\end{tabular}")
        print("\\end{center}")

In [7]:
# Nearest neighbours (with similarity scores) of every word in `targets`
# according to the glove42 model.
print_closest_words(glove42, targets)

'fuck':
	fucking : 0.890085
	ass : 0.801807
	fucked : 0.786498
	suck : 0.774119
	bitch : 0.760884
	pussy : 0.754305
	shit : 0.747851
	fucks : 0.747322
	slut : 0.715903
	horny : 0.711789
'damn':
	freaking : 0.808849
	thats : 0.804998
	darn : 0.804346
	shit : 0.795101
	yeah : 0.795060
	stupid : 0.783023
	cuz : 0.771624
	freakin : 0.769442
	lol : 0.767663
	fuckin : 0.764639
'god':
	christ : 0.806945
	jesus : 0.797688
	lord : 0.756466
	heaven : 0.736455
	gods : 0.725088
	almighty : 0.721813
	faith : 0.719083
	holy : 0.708875
	pray : 0.704659
	divine : 0.703510
'frick':
	brauer : 0.446545
	frack : 0.429506
	guggenheim : 0.373678
	heinz : 0.366111
	mellon : 0.363646
	kemper : 0.360802
	carnegie : 0.356722
	coan : 0.350960
	museum : 0.350805
	farnsworth : 0.341526
'darn':
	darned : 0.842314
	damn : 0.804346
	freaking : 0.729577
	dang : 0.713294
	soooo : 0.701573
	freakin : 0.696519
	friggin : 0.693939
	pretty : 0.683317
	awfully : 0.679518
	sooo : 0.671035
'gosh':
	geez : 0.719487
	jeez : 0.6

In [9]:
# Same query words as above, now against the twitter25 model.
print_closest_words(twitter25, targets)

'fuck':
	shit : 0.975124
	hell : 0.960364
	damn : 0.955875
	bitch : 0.955406
	dude : 0.951955
	smh : 0.942909
	lmao : 0.941495
	thats : 0.941091
	cause : 0.939942
	bad : 0.936998
'damn':
	shit : 0.969590
	hell : 0.967255
	dude : 0.958074
	thats : 0.956937
	fuck : 0.955875
	fuckin : 0.952355
	mad : 0.951902
	bad : 0.946971
	fucking : 0.943955
	like : 0.941989
'god':
	lord : 0.945264
	life : 0.922650
	true : 0.921773
	rest : 0.915424
	forget : 0.914680
	means : 0.907882
	it : 0.906304
	never : 0.905784
	remember : 0.905605
	what : 0.904669
'frick':
	lordy : 0.889460
	pahaha : 0.880130
	jheez : 0.875569
	pahahaha : 0.874996
	frig : 0.873169
	looooooooool : 0.870447
	dayumm : 0.870237
	dammn : 0.869075
	betcha : 0.867702
	daaamn : 0.866274
'darn':
	friggin : 0.945871
	effing : 0.932937
	fricken : 0.915874
	frickin : 0.913544
	freakin : 0.894171
	fricking : 0.881680
	effin : 0.880853
	jeez : 0.871655
	frigging : 0.870890
	goddamn : 0.866052
'gosh':
	woah : 0.951295
	yep : 0.936377
	lolol : 

In [11]:
# Same query words again, now against the twitter200 model.
print_closest_words(twitter200, targets)

'fuck':
	shit : 0.884610
	hell : 0.840813
	bitch : 0.831233
	fucking : 0.830012
	damn : 0.826227
	wtf : 0.819603
	fuckin : 0.791408
	n't : 0.790107
	that : 0.788328
	nigga : 0.784458
'damn':
	shit : 0.855683
	fuckin : 0.832268
	fuck : 0.826227
	smh : 0.817285
	hell : 0.815009
	fucking : 0.811158
	lmao : 0.806080
	that : 0.800900
	why : 0.800019
	thats : 0.797936
'god':
	lord : 0.851451
	jesus : 0.793370
	bless : 0.783336
	christ : 0.748248
	thank : 0.728486
	gods : 0.712267
	life : 0.707067
	praise : 0.692849
	holy : 0.691865
	you : 0.689168
'frick':
	frack : 0.661508
	heck : 0.631302
	fuuuuck : 0.615577
	frig : 0.612536
	fuuuuuck : 0.610375
	fucj : 0.599739
	fuq : 0.599668
	fuuuuuuck : 0.599290
	fuuck : 0.590608
	fuuuck : 0.587006
'darn':
	freakin : 0.743486
	friggin : 0.722513
	freaking : 0.713743
	gosh : 0.705851
	effing : 0.700326
	goddamn : 0.690690
	fricken : 0.682505
	effin : 0.678194
	damn : 0.665144
	dammit : 0.663934
'gosh':
	omg : 0.846523
	omfg : 0.842340
	ugh : 0.816337
	o

In [12]:
# Compare the neighbours of "hell" and "heck" in the glove42 model.
print_closest_words(glove42, ["hell", "heck"])

'hell':
	shit : 0.740012
	damn : 0.717443
	heck : 0.717442
	yeah : 0.700502
	crap : 0.690306
	gonna : 0.688813
	thing : 0.679486
	heaven : 0.674648
	thats : 0.665746
	nothing : 0.664930
'heck':
	yeah : 0.749803
	hell : 0.717442
	damn : 0.717155
	maybe : 0.715949
	anyway : 0.710780
	guess : 0.709965
	thing : 0.700194
	think : 0.699303
	thats : 0.699044
	wondering : 0.691883


In [2]:
# Compare the neighbours of "hell" and "heck" in the twitter100 model.
# NOTE(review): the saved output is a NameError and the cell's execution
# count is 2, so it was last run before the cell that defines
# print_closest_words -- re-run it after the definitions.
print_closest_words(twitter100, ["hell", "heck"])

NameError: name 'print_closest_words' is not defined

In [10]:
# Second list of query words, reused by the LaTeX table cells below.
chris_targets = ['speech', 'movie', 'video', 'mccain', 'america', 'tax', 'sex', 'fucking']
# Plain-text neighbours and scores for that list in the glove42 model.
print_closest_words(glove42, chris_targets)

'speech':
	speeches : 0.648
	speaking : 0.645
	speach : 0.597
	language : 0.596
	voice : 0.589
	words : 0.586
	speak : 0.585
	spoken : 0.582
	remarks : 0.582
	speaks : 0.573
'movie':
	movies : 0.833
	film : 0.763
	films : 0.715
	starring : 0.655
	dvd : 0.652
	flick : 0.640
	soundtrack : 0.640
	trailer : 0.636
	cinema : 0.627
	picture : 0.624
'video':
	videos : 0.811
	youtube : 0.737
	clip : 0.694
	clips : 0.694
	hd : 0.677
	audio : 0.669
	footage : 0.658
	pictures : 0.642
	picture : 0.641
	streaming : 0.631
'mccain':
	romney : 0.854
	obama : 0.829
	hillary : 0.817
	barack : 0.785
	palin : 0.771
	biden : 0.766
	clinton : 0.754
	republicans : 0.741
	gop : 0.739
	huckabee : 0.729
'america':
	american : 0.745
	nation : 0.707
	europe : 0.707
	states : 0.688
	country : 0.687
	americas : 0.685
	world : 0.685
	americans : 0.669
	united : 0.664
	usa : 0.659
'tax':
	taxes : 0.872
	taxation : 0.777
	income : 0.743
	taxpayers : 0.665
	revenue : 0.664
	pay : 0.645
	taxpayer : 0.643
	payroll : 0.642

In [22]:
# LaTeX table (word -> comma-separated neighbours) for `chris_targets`
# in the glove42 model.
print_closest_words_latex(glove42, chris_targets)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{speech} & speeches, speaking, speach, language, voice, words, speak, spoken, remarks, speaks \\
\textbf{movie} & movies, film, films, starring, dvd, flick, soundtrack, trailer, cinema, picture \\
\textbf{video} & videos, youtube, clip, clips, hd, audio, footage, pictures, picture, streaming \\
\textbf{mccain} & romney, obama, hillary, barack, palin, biden, clinton, republicans, gop, huckabee \\
\textbf{america} & american, nation, europe, states, country, americas, world, americans, united, usa \\
\textbf{tax} & taxes, taxation, income, taxpayers, revenue, pay, taxpayer, payroll, federal, irs \\
\textbf{sex} & porn, lesbian, xxx, teen, sexual, gay, nude, porno, anal, sexy \\
\textbf{fucking} & fuck, ass, fucked, suck, fuckin, pussy, shit, fucks, damn, asshole \\
\hline
\end{tabular}
\end{center}


In [43]:
# Column-per-word LaTeX tables with scores for `chris_targets` in glove42.
# The list has eight words, so the function emits two tables (five columns,
# then three); the saved output below appears to be cut off after the first.
print_closest_words_score_latex(glove42, chris_targets)

\begin{center}
\begin{tabular}{ | c | c | c | c | c | }
\hline
\textbf{speech} & \textbf{movie} & \textbf{video} & \textbf{mccain} & \textbf{america}  \\
\hline
speeches : 0.65 & movies : 0.83 & videos : 0.81 & romney : 0.85 & american : 0.74  \\
speaking : 0.64 & film : 0.76 & youtube : 0.74 & obama : 0.83 & nation : 0.71  \\
speach : 0.60 & films : 0.72 & clip : 0.69 & hillary : 0.82 & europe : 0.71  \\
language : 0.60 & starring : 0.65 & clips : 0.69 & barack : 0.78 & states : 0.69  \\
voice : 0.59 & dvd : 0.65 & hd : 0.68 & palin : 0.77 & country : 0.69  \\
words : 0.59 & flick : 0.64 & audio : 0.67 & biden : 0.77 & americas : 0.69  \\
speak : 0.59 & soundtrack : 0.64 & footage : 0.66 & clinton : 0.75 & world : 0.68  \\
spoken : 0.58 & trailer : 0.64 & pictures : 0.64 & republicans : 0.74 & americans : 0.67  \\
remarks : 0.58 & cinema : 0.63 & picture : 0.64 & gop : 0.74 & united : 0.66  \\
speaks : 0.57 & picture : 0.62 & streaming : 0.63 & huckabee : 0.73 & usa : 0.66  \\
\hline


In [44]:
# Same LaTeX neighbour table for `chris_targets`, now using the twitter100
# model.
print_closest_words_latex(twitter100, chris_targets)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{speech} & speeches, speaking, letter, interview, during, president, presentation, conference, response, obama \\
\textbf{movie} & movies, episode, story, trailer, watching, watch, scene, twilight, watched, horror \\
\textbf{video} & videos, youtube, cover, vídeo, playlist, clip, audio, official, post, trailer \\
\textbf{mccain} & senator, boehner, congressman, palin, biden, mcconnell, republicans, kerry, christie, cheney \\
\textbf{america} & american, world, country, mexico, africa, canada, states, us, uk, believe \\
\textbf{tax} & taxes, debt, budget, pension, insurance, costs, cost, pay, welfare, government \\
\textbf{sex} & porn, pussy, anal, lesbian, naked, threesome, sexual, horny, boobs, hardcore \\
\textbf{fucking} & fuckin, freaking, fuck, damn, stupid, fucken, shit, freakin, hell, seriously \\
\hline
\end{tabular}
\end{center}


In [69]:
# Rebinds `targets` to a new five-word list; cells run after this one see
# this list rather than the one defined near the top of the notebook.
targets = ["fuck", "damn", "bitch", "hell", "shit"]
# LaTeX neighbour table for the new list in the glove42 model.
print_closest_words_latex(glove42, targets)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{fuck} & fucking, ass, fucked, suck, bitch, pussy, shit, fucks, slut, horny \\
\textbf{damn} & freaking, thats, darn, shit, yeah, stupid, cuz, freakin, lol, fuckin \\
\textbf{bitch} & bitches, fuck, ass, whore, slut, fucking, shit, fucker, suck, asshole \\
\textbf{hell} & shit, damn, heck, yeah, crap, gonna, thing, heaven, thats, nothing \\
\textbf{shit} & crap, fuckin, damn, cuz, thats, lol, dude, fuck, stupid, sht \\
\hline
\end{tabular}
\end{center}


In [71]:
# Query list kept under its own name so it can be reused against several
# models ("damn" and "shit" also appear in the other target lists).
targets_mild = ["dang", "darn", "heck", "gosh", "jeez", "hella", "freaking", "damn", "shit"]
# LaTeX neighbour table for `targets_mild` in the glove42 model.
print_closest_words_latex(glove42, targets_mild)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{dang} & darn, damn, gosh, freaking, soo, freakin, friggin, darned, yah, goddamn \\
\textbf{darn} & darned, damn, freaking, dang, soooo, freakin, friggin, pretty, awfully, sooo \\
\textbf{heck} & yeah, hell, damn, maybe, anyway, guess, thing, think, thats, wondering \\
\textbf{gosh} & geez, jeez, gawd, omg, ohh, ohhh, ahhh, ahh, darn, ugh \\
\textbf{jeez} & geez, geeze, jeeze, sheesh, damnit, gosh, ummm, dammit, c'mon, woah \\
\textbf{hella} & soooo, sooo, sooooo, fuckin, freakin, soooooo, gettin, damn, kinda, soo \\
\textbf{freaking} & freakin, friggin, damn, effing, goddamn, fcking, fuckin, darn, frickin, fking \\
\textbf{damn} & freaking, thats, darn, shit, yeah, stupid, cuz, freakin, lol, fuckin \\
\textbf{shit} & crap, fuckin, damn, cuz, thats, lol, dude, fuck, stupid, sht \\
\hline
\end{tabular}
\end{center}


In [72]:
# Same `targets_mild` table, now using the twitter100 model.
print_closest_words_latex(twitter100, targets_mild)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{dang} & damn, dam, nahh, soo, nah, huh, idk, gah, noh, lol \\
\textbf{darn} & friggin, freakin, effing, freaking, fricken, gosh, goddamn, effin, frickin, fucken \\
\textbf{heck} & hell, wth, jeez, gosh, dammit, woah, damn, kidding, crap, yeah \\
\textbf{gosh} & omfg, omg, ohmygod, ugh, srsly, jeez, omgg, freaking, dammit, seriously \\
\textbf{jeez} & geez, sheesh, gosh, geeze, jeeez, geesh, damnit, heck, dammit, woah \\
\textbf{hella} & mad, shits, kinda, gettin, lowkey, fuckin, damn, dumb, slick, alot \\
\textbf{freaking} & freakin, fricken, effing, fucking, fucken, friggin, frickin, seriously, gosh, fricking \\
\textbf{damn} & shit, hell, fuck, fuckin, thats, stupid, like, fucking, smh, dude \\
\textbf{shit} & fuck, damn, hell, bitch, like, nigga, that, dude, thats, smh \\
\hline
\end{tabular}
\end{center}


In [10]:
# Second named query list, reused against two models below.
targets_intense = ["damn","shit","fuck", "bitch", "cunt", "dick"]
# LaTeX neighbour table for `targets_intense` in the glove42 model.
print_closest_words_latex(glove42, targets_intense)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{damn} & freaking, thats, darn, shit, yeah, stupid, cuz, freakin, lol, fuckin \\
\textbf{shit} & crap, fuckin, damn, cuz, thats, lol, dude, fuck, stupid, sht \\
\textbf{fuck} & fucking, ass, fucked, suck, bitch, pussy, shit, fucks, slut, horny \\
\textbf{bitch} & bitches, fuck, ass, whore, slut, fucking, shit, fucker, suck, asshole \\
\textbf{cunt} & pussy, twat, asshole, cunts, cock, ass, fucking, clit, slut, fuck \\
\textbf{dick} & cock, dicks, sucking, suck, ass, pussy, fucking, fuck, cocks, cunt \\
\hline
\end{tabular}
\end{center}


In [11]:
# Same `targets_intense` table, now using the twitter200 model.
print_closest_words_latex(twitter200, targets_intense)

\begin{center}
\begin{tabular}{ l l } 
\hline
Word & Neighbors  \\
\hline
\textbf{damn} & shit, fuckin, fuck, smh, hell, fucking, lmao, that, why, thats \\
\textbf{shit} & fuck, damn, that, nigga, hell, thats, bitch, like, smh, lmao \\
\textbf{fuck} & shit, hell, bitch, fucking, damn, wtf, fuckin, n't, that, nigga \\
\textbf{bitch} & nigga, fuck, ass, bitches, hoe, shit, girl, fuckin, damn, cuz \\
\textbf{cunt} & twat, prick, bastard, cunts, wanker, dickhead, faggot, fucker, asshole, bellend \\
\textbf{dick} & suck, ass, cock, pussy, bitch, dicks, nigga, penis, asshole, fuck \\
\hline
\end{tabular}
\end{center}


In [55]:
# Word list from http://www.bannedwordlist.com
# Only the first line of the file is read; this assumes the CSV keeps all
# words on a single comma-separated line -- TODO confirm against the data.
# The `with` block closes the handle, which the bare open() call never did.
with open("data/swearWords.csv", "r") as file:
    lines = file.readline()
# readline() keeps the trailing newline, which would otherwise stay glued to
# the last word and make its vocabulary lookup fail; strip each entry and
# drop empty ones (e.g. from a trailing comma or an empty file).
all_swears = [word.strip() for word in lines.split(",") if word.strip()]

In [61]:
# Neighbours and scores for every word loaded into `all_swears`, using the
# glove42 model. Words missing from the model are reported as
# "Not in vocabulary" by print_closest_words rather than raising.
print_closest_words(glove42, all_swears)

'anal':
	threesome : 0.740
	pussy : 0.732
	blowjob : 0.726
	porn : 0.724
	hardcore : 0.706
	fucked : 0.700
	dildo : 0.699
	interracial : 0.699
	sex : 0.692
	gangbang : 0.691
'anus':
	rectum : 0.747
	vagina : 0.697
	vulva : 0.631
	genitals : 0.619
	anal : 0.618
	scrotum : 0.614
	clitoris : 0.591
	pussy : 0.586
	asshole : 0.575
	cunt : 0.572
'arse':
	ass : 0.727
	asses : 0.672
	asshole : 0.661
	cunt : 0.630
	arses : 0.628
	butt : 0.627
	bum : 0.612
	twat : 0.600
	@ss : 0.598
	prick : 0.577
'ass':
	pussy : 0.805
	fucking : 0.803
	fuck : 0.802
	butt : 0.775
	asses : 0.769
	fucked : 0.766
	asshole : 0.760
	bitch : 0.744
	booty : 0.737
	cunt : 0.730
'ballsack':
	nutsack : 0.585
	chode : 0.410
	vajayjay : 0.394
	dingleberries : 0.374
	dck : 0.373
	ballsacks : 0.371
	asssss : 0.365
	d!ck : 0.364
	douchbag : 0.360
	schnoz : 0.357
'balls':
	ball : 0.793
	sticks : 0.609
	throw : 0.576
	stick : 0.573
	throwing : 0.560
	hitting : 0.551
	bowling : 0.550
	toss : 0.536
	holes : 0.534
	thrown : 0.519
'

	fart : 0.599
	feces : 0.598
	shit : 0.575
	piss : 0.564
	poops : 0.554
'prick':
	asshole : 0.681
	cunt : 0.665
	pricks : 0.654
	cock : 0.605
	twat : 0.601
	throbbing : 0.580
	arse : 0.577
	dick : 0.552
	ass : 0.536
	pussy : 0.532
'pube':
	pubes : 0.408
	pubic : 0.357
	shaven : 0.354
	freckle : 0.344
	uni-brow : 0.342
	salt-and-pepper : 0.338
	hair-do : 0.334
	peach-fuzz : 0.333
	cut/color : 0.331
	cowlick : 0.330
'pussy':
	cunt : 0.852
	cock : 0.810
	ass : 0.805
	tits : 0.788
	fucked : 0.784
	fuck : 0.754
	slut : 0.754
	sucking : 0.743
	licking : 0.742
	hairy : 0.740
'queer':
	transgender : 0.656
	lgbt : 0.640
	feminist : 0.639
	lgbtq : 0.612
	gay : 0.593
	glbt : 0.580
	feminism : 0.567
	homosexual : 0.560
	sexuality : 0.548
	transgendered : 0.538
'scrotum':
	testicles : 0.745
	testicle : 0.658
	anus : 0.614
	rectum : 0.611
	perineum : 0.591
	vulva : 0.589
	scrotal : 0.572
	testes : 0.569
	urethra : 0.566
	abdomen : 0.554
'sex':
	porn : 0.816
	lesbian : 0.742
	xxx : 0.740
	teen : 0.72

In [73]:
# Look up "heck" on its own in the glove42 model (it is also queried
# together with "hell" in an earlier cell).
print_closest_words(glove42, ["heck"])

'heck':
	yeah : 0.750
	hell : 0.717
	damn : 0.717
	maybe : 0.716
	anyway : 0.711
	guess : 0.710
	thing : 0.700
	think : 0.699
	thats : 0.699
	wondering : 0.692
