In [1]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from gensim.models import word2vec

In [2]:
# Location of the raw tweet dump, relative to the notebook's working directory.
path = '../tweets/en/all.csv'

# Shared NLP helpers; both are bound as default arguments of preprocess().
stemmer = SnowballStemmer('english')
# Raw string: '\w' is not a valid str escape, so the non-raw form triggers an
# invalid-escape warning on modern Python. The pattern itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
def preprocess(text, tokenizer=tokenizer, stemmer=stemmer, stem=False):
    '''Preprocesses a line and returns a list of preprocessed tokens.

    The text is lower-cased and stripped, whitespace-separated chunks that
    look like links or user handles are dropped, and the remainder is passed
    to ``tokenizer.tokenize``.

    Args:
        text: raw line of text (str).
        tokenizer: object exposing ``tokenize(str)``; defaults to the
            notebook-level tokenizer.
        stemmer: object exposing ``stem(str)``; only used when ``stem`` is
            true. Defaults to the notebook-level stemmer.
        stem: when true, every token is passed through ``stemmer.stem``.
            Defaults to False, which matches the previous behaviour where
            stemming was disabled.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize`` (stemmed if
        requested).
    '''

    # lower case and remove leading/trailing spaces
    text = text.lower().strip()

    # skip links (both http and https) and user handles;
    # str.startswith accepts a tuple of alternative prefixes
    kept = [
        token for token in text.split()
        if not token.startswith(('http://', 'https://', '@'))
    ]

    # tokenize the text
    tokens = tokenizer.tokenize(' '.join(kept))

    # optionally apply stemmer to each token
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

In [None]:
# Tokenised tweets, one list of tokens per input record.
tweets = []

# newline='\r\n' makes only CRLF terminate a record, so a bare '\n' inside a
# tweet stays part of that tweet instead of splitting it.
# NOTE(review): no explicit encoding is given, so the platform default is
# used - confirm the file's encoding and pin it if it is known.
with open(path, 'r', newline='\r\n') as f:
    # Iterate lazily instead of loading every line up front with readlines().
    for line in f:
        # Only the first comma separates the leading field from the tweet
        # text; commas inside the tweet itself are preserved.
        _, sep, tweet = line.partition(',')
        if not sep:
            # Blank or malformed record without a comma: nothing to tokenise.
            continue

        tokens = preprocess(tweet)

        tweets.append(tokens)

In [None]:
# Hyper-parameter grid as (size, window, min_count, sg) tuples: the same three
# base settings are listed first with sg=0 and then again with sg=1.
# (Per the gensim docs, sg=1 selects skip-gram and any other value CBOW.)
params = [
    (size, window, min_count, sg)
    for sg in (0, 1)
    for size, window, min_count in ((50, 5, 3), (150, 3, 3), (50, 2, 0))
]

# Train one model per grid entry, in grid order.
# NOTE(review): `size=` is the keyword this notebook was written against;
# confirm it matches the installed gensim version.
models = [
    word2vec.Word2Vec(
        sentences=tweets,
        size=dim,
        window=context,
        min_count=min_freq,
        sg=algorithm,
    )
    for dim, context, min_freq, algorithm in params
]


In [None]:
# Print each model's nearest neighbours of the query word, one result per
# line, with a separator line after every model.
for model in models:
    neighbours = model.wv.most_similar(positive=['war'])
    for x in neighbours:
        print(x)
    print('-----------------')