In [1]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models import word2vec

In [3]:
path = '../tweets/en/all.csv'

stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer('\w+')

In [4]:
def preprocess(text, tokenizer=tokenizer, stemmer=stemmer):
    '''Preprocesses a line and returns a list of preprocessed tokens'''
    
    # lower case and remove leading/trailing spaces
    text = text.lower().strip()
    
    tokens = []
    
    for token in text.split():
        # skip links and user handles
        if token.startswith('https://') \
        or token.startswith('@') \
        or token in stopwords.words('english'):
            continue
            
        tokens.append(token)
    
    # tokenize the text
    tokens = tokenizer.tokenize(' '.join(tokens))
    
    # apply stemmer to each token
#     tokens = list(map(stemmer.stem, tokens))
    
    return tokens

In [5]:
tweets = []

with open(path, 'r', newline='\r\n') as f:
    for i, line in enumerate(f.readlines()):
        n, tweet = line.split(',', maxsplit=1)
        tokens = preprocess(tweet)
        
        tweets.append(tokens)

In [6]:
params = [
    (50, 5, 3, 0),
    (150, 3, 3, 0),
    (50, 2, 0, 0),
    (50, 5, 3, 1),
    (150, 3, 3, 1),
    (50, 2, 0, 1),
]

models = [word2vec.Word2Vec(sentences=tweets, size=size, window=window, min_count=min_count, sg=sg) 
         for size, window, min_count, sg in params]


In [7]:
for model in models:
    for x in model.wv.most_similar(positive=['putin']):
        print(x)
    print('-----------------')

('americanism', 0.5397476553916931)
('message', 0.5394257307052612)
('crater', 0.5337719917297363)
('basher', 0.5284423232078552)
('1nrsmith', 0.5280888676643372)
('almond', 0.5227072834968567)
('whatever', 0.5168656706809998)
('spamming', 0.5158582329750061)
('slum', 0.5113565325737)
('he', 0.5091564059257507)
-----------------
('almond', 0.6010277271270752)
('thegoodfight', 0.5565204620361328)
('monster', 0.5561071634292603)
('confidants', 0.5517393946647644)
('unconcerned', 0.5514921545982361)
('russians', 0.551446259021759)
('1nrsmith', 0.5481245517730713)
('milne', 0.5474159121513367)
('clans', 0.5381943583488464)
('he', 0.5362292528152466)
-----------------
('admini', 0.7923679351806641)
('lisambrauer', 0.7569103240966797)
('spamming', 0.7110344767570496)
('someone', 0.7107893824577332)
('loverboy', 0.7075369954109192)
('gutfeld', 0.7050368785858154)
('glowing', 0.7023209929466248)
('rusdia', 0.7011188864707947)
('going2', 0.6998761892318726)
-----------------
('russia', 0.713751