In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from tqdm import tqdm
import spacy
from gensim import corpora, models
import pickle
import pandas as pd
from pprint import pprint
import csv 
import classifier

import sys
sys.path.insert(0,'.')
sys.path.insert(0,'/data_big/mlp/custom_lda2vec/lda2vec-pytorch/utils')
# from utils import preprocess, get_windows
# from utils.preprocess import preprocess
from preprocess_mod import *
from get_windows_mod import *

# import nltk
# nltk.download('stopwords')
    
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['via'])
import re


paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [3]:
!ls ..

20newsgroups  README.md			      utils	   y_test.pkl
LICENSE       semeval2016-task6-domaincorpus  X_test.pkl   y_train.pkl
loss.png      stance			      X_train.pkl


In [4]:
# Vocabulary pruning thresholds handed to preprocess():
# words with count < MIN_COUNTS or count > MAX_COUNTS will be removed.
MIN_COUNTS = 10
MAX_COUNTS = 1800

# Minimum document length (number of words) after preprocessing.
MIN_LENGTH = 3

# Half the size of the context around a word.
HALF_WINDOW_SIZE = 1

# It must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH; fail here, in the config
# cell, rather than somewhere inside the windowing step.
assert 2 * HALF_WINDOW_SIZE < MIN_LENGTH, (
    "HALF_WINDOW_SIZE is too large: need 2 * HALF_WINDOW_SIZE < MIN_LENGTH"
)

# Load NLP model

In [5]:
# Load the spaCy English pipeline; it is passed to preprocess() further down.
# NOTE(review): 'en' is a shortcut name — confirm it resolves in the installed spaCy version.
nlp = spacy.load('en')

# Load dataset

In [6]:
# Build the slang lookup table from slangs.csv: column 0 is the slang token,
# column 1 is the text it is replaced with (later rows win on duplicate tokens).
# newline='' is the mode the csv module documents for files given to csv.reader.
with open('slangs.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # Blank or single-column rows are skipped instead of raising IndexError.
    slang_words = {row[0]: row[1] for row in reader if len(row) >= 2}

In [7]:
def clean_tweets(sent, slang=None):
    """Normalise a raw tweet into lower-case text without links, quotes or '#'.

    Steps, in order: remove http(s) links, collapse whitespace runs into a
    single space, replace slang tokens, delete single quotes, double quotes
    and '#' characters, lower-case the result.

    Args:
        sent: The tweet; it is converted with ``str()`` first.
        slang: Optional mapping from slang token to replacement text. When
            omitted, the module-level ``slang_words`` table is used.

    Returns:
        The cleaned, lower-cased string (leading/trailing spaces are kept).
    """
    if slang is None:
        slang = slang_words
    sent = str(sent)

    # Remove http:// and https:// links. Only the link itself is removed; a
    # greedy '.*' would also throw away all the text that follows the link.
    sent = re.sub(r'https?://\S+', '', sent)

    # Collapse new lines, tabs and repeated spaces into one space
    sent = re.sub(r'\s+', ' ', sent)

    # Substitute slang token by token. A plain dictionary lookup keeps slang
    # such as ':)' from being interpreted as a regular expression and keeps
    # short slang such as 'u' from rewriting the inside of other words.
    sent = " ".join(slang.get(word, word) for word in sent.split(" "))

    # Remove distracting single quotes, double quotes and hashtag markers
    sent = re.sub("['\"#]", "", sent)

    return sent.lower()

In [8]:
# Annotated task-B test data.
# NOTE(review): this path is not read in any visible cell — confirm it is still needed.
PATH_LABELLED_DATA_TRUMP = "../semeval2016-task6-domaincorpus/data-all-annotations/testdata-taskB-all-annotations.txt"
# Downloaded unlabelled tweets; read below with pd.read_csv as tab-separated (ID, Tweet) rows.
PATH_UNLABELLED_DATA_TRUMP = "./../semeval2016-task6-domaincorpus/downloaded_Donald_Trump.txt"

In [None]:
# Read the pickled train/test tweets and their stance labels.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('X_train.pkl', 'X_test.pkl', 'y_train.pkl', 'y_test.pkl')
)

# Put each split's tweets and labels side by side in a two-column frame.
data_labelled_train = pd.concat([X_train, y_train], axis=1, ignore_index=True)
data_labelled_train.columns = ['Tweet', 'Stance']

data_labelled_test = pd.concat([X_test, y_test], axis=1, ignore_index=True)
data_labelled_test.columns = ['Tweet', 'Stance']

In [None]:
# Read the unlabelled tweet dump as tab-separated rows with columns (ID, Tweet).
data_unlabelled = pd.read_csv(PATH_UNLABELLED_DATA_TRUMP, sep='\t', lineterminator='\n', encoding ='latin1', names = ["ID", "Tweet"])
# Blank out (NaN in every column) the rows whose text is the 'Not Available' placeholder ...
data_unlabelled = data_unlabelled.where(data_unlabelled.Tweet != 'Not Available')
# ... then drop them, together with any row that has a missing value.
# The index is not reset, so it may have gaps after this line.
data_unlabelled.dropna(how='any', inplace=True)
# Drop the first character of every tweet.
# NOTE(review): presumably a stray leading character in the dump — confirm against the raw file.
data_unlabelled['Tweet'] = data_unlabelled['Tweet'].apply(lambda x: x[1:])
# Normalise the text (slang, links, quotes, '#', case) with clean_tweets.
data_unlabelled['Tweet'] = data_unlabelled['Tweet'].apply(clean_tweets)

In [None]:
# One flat list of (position within its split, lower-cased text, split name, stance).
# Positions restart at 0 for every split, so they are only unique together with
# the split name.
#
# Tweets and stances are paired with zip(), i.e. strictly by row position.
# Looking the stance up as frame['Stance'][i] is label-based and fails (KeyError
# or the wrong row) as soon as a frame's index is not exactly 0..n-1.
docs = [
    (i, doc.lower(), "test", stance)
    for i, (doc, stance) in enumerate(
        zip(data_labelled_test['Tweet'], data_labelled_test['Stance'])
    )
]
docs += [
    (i, doc.lower(), "train", stance)
    for i, (doc, stance) in enumerate(
        zip(data_labelled_train['Tweet'], data_labelled_train['Stance'])
    )
]
# Unlabelled tweets carry the placeholder stance "UNK".
docs += [(i, doc.lower(), "unlabelled", "UNK") for i, doc in enumerate(data_unlabelled['Tweet'])]

In [None]:
len(docs)

54513

In [None]:
pprint(docs[:5])

[(0,
  'we need a man that will do the tough negotiations, say what needs to be '
  'said and forget the words politically correct. thanks semst',
  'test',
  'AGAINST'),
 (1,
  'we love mexicans,we respect your work ethic,your love of family,your '
  'loyalty,your food &your love of god! lets all get jobs,vote semst',
  'test',
  'AGAINST'),
 (2,
  'extremistprogressives are so focused on their agenda that they believe a '
  'inanimate flagkills but multitimedeporteesdont semst',
  'test',
  'NONE'),
 (3,
  '@braveconwarrior stories like this make think patriots are at the end of '
  'the string. all kinds of shit is going to hit fan semst',
  'test',
  'AGAINST'),
 (4,
  'gop candidate predictions? america decides to play the trump card and go '
  'with the man with the plan. semst',
  'test',
  'FAVOR')]


# Preprocess dataset and create windows

In [None]:
# Run the project's preprocess() over every document. The three results are used
# below as: encoded documents, id -> word lookup, and per-word counts.
encoded_docs, decoder, word_counts = preprocess(docs, nlp, MIN_LENGTH, MIN_COUNTS, MAX_COUNTS)

 21%|██        | 11347/54513 [01:45<06:46, 106.08it/s]

In [None]:
encoded_docs[:5]

In [None]:
len(word_counts)

In [None]:
len(encoded_docs)

In [None]:
# new ids will be created for the documents.
# create a way of restoring initial ids:
# new id (position in encoded_docs) -> initial id
doc_decoder = {
    new_id: old_id
    for new_id, (old_id, _tokens, _split, _stance) in enumerate(encoded_docs)
}
# initial id -> new id, filled in the same order, so a repeated initial id keeps
# the last new id it was seen with.
# NOTE(review): the ids in docs restart at 0 for each split; if preprocess keeps
# them unchanged, entries from different splits overwrite each other here — confirm.
doc_decoder_reverse = {old_id: new_id for new_id, old_id in doc_decoder.items()}
# doc_decoder_all = {i: docs[doc_id] for i, (doc_id, doc, type, stance) in enumerate(encoded_docs)}

In [None]:
data = []       # windows from "train" and "unlabelled" documents
test_data = []  # windows from "test" documents
all_data = []   # windows from every document
# new ids are created here: a document's position in encoded_docs becomes its id.
# The split is unpacked as doc_type so the builtin type() is not shadowed for
# the rest of the notebook.
for index, (_doc_id, doc, doc_type, stance) in tqdm(enumerate(encoded_docs), total=len(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index represents id of a document,
    # windows is a list of (word, window around this word),
    # where word is in the document.
    # Build the rows once per document and reuse them for every target list.
    rows = [[index, w[0]] + w[1] for w in windows]
    if doc_type == "train" or doc_type == "unlabelled":
        data += rows
    if doc_type == "test":
        test_data += rows
    all_data += rows

# all_data stays a plain list; it is converted when it is saved.
data = np.array(data, dtype='int64')
test_data = np.array(test_data, dtype='int64')

In [None]:
test_data.shape[1]

In [None]:
# a row in 'data' contains:
# id of a document, id of a word in this document, a window around this word
# NOTE(review): the "1 + 1 + 10" below looks stale — with HALF_WINDOW_SIZE = 1 the
# window presumably holds 2 * HALF_WINDOW_SIZE = 2 ids, i.e. 1 + 1 + 2 columns; confirm.
# 1 + 1 + 10
data.shape[1]

In [None]:
test_data.shape[0]

In [None]:
# number of windows in 'data', which only holds "train" and "unlabelled" documents
# (presumably equal to the total number of tokens in those documents — depends on get_windows)
data.shape[0]

# Get unigram distribution

In [None]:
# Relative frequency of every vocabulary entry (counts divided by their total).
word_counts = np.array(word_counts)
# ndarray.sum() is vectorised; the builtin sum() walks the array element by
# element in Python.
unigram_distribution = word_counts / word_counts.sum()

In [None]:
word_counts.shape[0]

# Prepare word vectors

In [None]:
# print(decoder)

In [None]:
# print(encoded_docs)

In [None]:
# %%time
# Shape of the embedding table: one row per decoder entry, embedding_dim columns.
embedding_dim = 100
vocab_size = len(decoder)

# Input for the skip-gram word2vec model: every encoded document as a list of
# its word ids rendered as strings.
texts = [
    list(map(str, token_ids))
    for _doc_id, token_ids, _split, _stance in encoded_docs
]

In [None]:
# Train word2vec on the stringified id sequences (sg=1 selects skip-gram,
# negative=15 sets the number of negative samples).
model = models.Word2Vec(
    texts,
    size=embedding_dim,
    window=5,
    workers=4,
    sg=1,
    negative=15,
    iter=70,
)
# replace=True: keep only the vectors produced by init_sims.
model.init_sims(replace=True)

In [None]:
# One embedding row per vocabulary id, allocated directly as float32.
# Rows stay all-zero for ids that word2vec has no vector for.
word_vectors = np.zeros((vocab_size, embedding_dim), dtype='float32')
for i in decoder:
    key = str(i)
    # Skip every id that is missing from the trained vocabulary, instead of
    # special-casing the one id ('3469') that was missing in a particular run.
    if key not in model.wv:
        continue
    word_vectors[i] = model.wv[key]

In [None]:
# number of unique words
print(len(word_vectors))

# Prepare initialization for document weights

In [None]:
# Decode every document back into words (this rebinds `texts`, which held the
# stringified ids used for word2vec).
texts = [
    [decoder[word_id] for word_id in token_ids]
    for _doc_id, token_ids, _split, _stance in encoded_docs
]
# gensim dictionary built from the decoded documents.
dictionary = corpora.Dictionary(texts)
# Bag-of-words form of every document, in the same order as encoded_docs.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# corpus

In [None]:
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# !unzip mallet-2.0.8.zip

In [None]:
# Number of LDA topics; also the number of columns of doc_weights_init below.
n_topics = 20
mallet_path = './mallet-2.0.8/bin/mallet' # update this path
# Fit LDA on the bag-of-words corpus through gensim's MALLET wrapper
# (needs the MALLET binary at mallet_path).
ldamallet = models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=n_topics, id2word=dictionary)
# Convert the wrapper into a gensim LDA model; the name is rebound, so the
# wrapper object itself is no longer reachable afterwards.
ldamallet = models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
# Topic representation of every document in corpus under the converted model.
corpus_lda_ldamallet = ldamallet[corpus]

In [None]:
# %%time

# Alternative kept for reference: gensim's own LDA instead of MALLET.
# lda = models.LdaModel(corpus, alpha=0.9, id2word=dictionary, num_topics=n_topics)
# corpus_lda = lda[corpus]
# The cells below only use `lda` / `corpus_lda`; point them at the MALLET results.
lda = ldamallet
corpus_lda = corpus_lda_ldamallet

In [None]:
# Print one line per topic listing its words (weights are dropped).
for topic_id, topic_terms in lda.show_topics(n_topics, formatted=False):
    topic_words = [term for term, _weight in topic_terms]
    print('topic', topic_id, ':', ' '.join(topic_words))

In [None]:
# Dense (documents x topics) matrix of LDA topic weights; topics that are not
# reported for a document keep weight 0.
n_lda_docs = len(corpus_lda)
doc_weights_init = np.zeros((n_lda_docs, n_topics))
for doc_idx in tqdm(range(n_lda_docs)):
    for topic_idx, weight in corpus_lda[doc_idx]:
        doc_weights_init[doc_idx, topic_idx] = weight

# Save data

In [None]:
# mallet_feats = []

# for i in range(len(corpus_lda)):
#     representation = lda.get_document_topics(corpus_lda[i])
#     count = 0
#     feat = []
#     for i in range(n_topics):
#         if i in list(map((lambda x: x[0]), representation)):
#             feat.append(representation[count][1])
#             count = count + 1
#         else:
#             feat.append(0)
#     mallet_feats.append(feat)

In [None]:
# mallet_feats

In [None]:
# doc_weights_init = np.array(mallet_feats)

In [None]:
doc_weights_init.shape[0]

In [None]:
def transform_labels(label):
    """Map a stance label to its integer class id.

    'AGAINST' -> 0, 'FAVOR' -> 1, 'NONE' -> 2; any other value yields None.
    """
    for class_id, stance_name in enumerate(('AGAINST', 'FAVOR', 'NONE')):
        if label == stance_name:
            return class_id
    return None

# Classifier inputs: one LDA topic-weight row per labelled document, plus its
# integer stance label. y_train / y_test are rebound here to plain lists
# (they held the pickled label frames earlier in the notebook).
x_train = []
x_test = []
y_train = []
y_test = []

# new id (position in encoded_docs) -> [initial id, split name, stance]
encoded_docs_dict = {}
# The split is unpacked as doc_type so the builtin type() is not shadowed for
# the rest of the notebook.
for i, (j, doc, doc_type, stance) in enumerate(encoded_docs):
    encoded_docs_dict[i] = [j, doc_type, stance]
    if doc_type == "train":
        y_train.append(transform_labels(stance))
        x_train.append(doc_weights_init[i].tolist())
    elif doc_type == "test":
        y_test.append(transform_labels(stance))
        x_test.append(doc_weights_init[i].tolist())
    # "unlabelled" documents are recorded in encoded_docs_dict only.


In [None]:
print(len(x_train), len(x_train[0]))
print(len(y_train))
print(len(x_test), len(y_test))

In [None]:
# y_pred, mallet_report = classifier.RandomForest(x_train, x_test, y_train, y_test)
# pprint(mallet_report)

In [None]:
# Persist every artefact with np.save, in this order (file name, object).
# Note the naming: 'data.npy' holds all_data (windows of every document), while
# 'data_ids.npy' holds the document-id column of `data` (train + unlabelled only).
artifacts = [
    ('docs.npy', docs),
    ('encoded_docs.npy', encoded_docs_dict),
    ('data.npy', all_data),
    ('data_ids.npy', data[:,0]),
    ('test_data.npy', test_data),
    ('word_vectors.npy', word_vectors),
    ('unigram_distribution.npy', unigram_distribution),
    ('decoder.npy', decoder),
    ('doc_decoder.npy', doc_decoder),
    ('doc_decoder_reverse.npy', doc_decoder_reverse),
    ('doc_weights_init.npy', doc_weights_init),
]
for file_name, payload in artifacts:
    np.save(file_name, payload)

In [None]:
print(docs[269])