In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  4 05:38:46 2019/modified on Wed May 22 2019

@author: chriscochrane/michaelwcwong
"""

import re

import pandas as pd
import nltk
import os
import numpy as np
import sys
from nltk.corpus import stopwords
import time
import random

import gensim

from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models import Phrases
import logging

In [None]:
# English stop-word list from the NLTK corpus reader; sentence_to_wordlist
# filters tokens against this module-level name.
stopwords_ = stopwords.words('english')

# Read the tab-separated, UTF-8 encoded speech extract; the first row of the
# file supplies the column names.
hansardSpeeches = pd.read_csv(
    'hansardExtractedSpeechesFull.csv',
    sep="\t",
    encoding="utf-8",
    header=0,
)

# Quick sanity check: show the 'mentionedEntityName' value stored under index 1.
print(hansardSpeeches['mentionedEntityName'][1])

In [None]:
def sentence_to_wordlist(sentence, remove_stopwords=False):
    """Turn one sentence into a list of lower-cased tokens without stop words.

    Every character that is neither a word character nor whitespace is
    deleted, the remaining text is lower-cased and split on whitespace, and
    each token found in the module-level ``stopwords_`` list is dropped.

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    remove_stopwords : bool, optional
        Accepted only so existing calls keep working. The flag is not
        consulted: stop words are always removed, exactly as before.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    sentence_text = re.sub(r'[^\w\s]', '', sentence)

    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    stop_set = set(stopwords_)

    # Build a new list rather than calling words.remove() inside a loop over
    # the same list: removing while iterating shifts the remaining items left,
    # so the token following each removed one was skipped and consecutive or
    # repeated stop words were left in the output.
    return [word for word in sentence_text.lower().split() if word not in stop_set]

def hansard_to_sentences(hansard, tokenizer, remove_stopwords=False):
    """Split one speech into sentences and tokenize each sentence into words.

    Parameters
    ----------
    hansard : str
        Raw text of a single speech.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)``, which must return an
        iterable of sentence strings.
    remove_stopwords : bool, optional
        Not used by this function; kept so existing calls remain valid.

    Returns
    -------
    list of list of str
        One word list (from ``sentence_to_wordlist``) per non-empty sentence.
        If the speech cannot be processed (for example ``hansard`` is not a
        string), ``'nope'`` is printed and an empty list is returned so the
        caller always receives a list.
    """
    try:
        # 1. Split the stripped text into raw sentence strings.
        raw_sentences = tokenizer.tokenize(hansard.strip())
        # 2. Tokenize every non-empty sentence; the result is a list of lists.
        return [
            sentence_to_wordlist(raw_sentence)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0
        ]
    except Exception:
        # `except Exception` (not a bare `except`) keeps the best-effort
        # behaviour for bad rows while still letting KeyboardInterrupt and
        # SystemExit stop a long-running tokenization pass.
        print('nope')
        return []

In [None]:
# Load the English Punkt sentence-tokenizer resource through NLTK's data
# loader. speech_tokenizer reads this module-level `tokenizer` and hands it to
# hansard_to_sentences, which calls its tokenize() method.
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

def speech_tokenizer(hansard):
    """Tokenize one speech into a list of sentences, each a list of words.

    Intended for element-wise use on the ``speech`` column. Sentence splitting
    is delegated to ``hansard_to_sentences`` with the module-level
    ``tokenizer``.

    Parameters
    ----------
    hansard : str
        Raw speech text.

    Returns
    -------
    list
        The sentences produced by ``hansard_to_sentences``. If the input
        cannot be processed (for example a non-string value that has no
        ``replace`` method), ``"no!"`` is printed and an empty list is
        returned.
    """
    try:
        # Deleting "/." turns "./." into "." so that sentences parse correctly.
        cleaned = hansard.replace("/.", "")
        return list(hansard_to_sentences(cleaned, tokenizer))
    except Exception:
        # Deliberately best-effort for bad rows, but no longer a bare
        # `except`: KeyboardInterrupt / SystemExit now propagate, so the
        # corpus-wide apply() can actually be interrupted.
        print("no!")
        return []

# Run speech_tokenizer over every entry of the "speech" column and store the
# per-speech result in a new "sentences_tokenized" column.
print("Tokenizing ...")
speech_texts = hansardSpeeches["speech"]
hansardSpeeches["sentences_tokenized"] = speech_texts.apply(speech_tokenizer)
print("Tokenization Complete")

# NOTE: everything between the triple quotes below is disabled legacy code
# kept as a bare string literal -- none of it is executed. It loops over a
# `questions` variable that is not defined in the visible part of this
# notebook, and its two-argument day_removal differs from the active
# three-argument day_removal defined in a later cell.
'''
def day_removal(df,days_removed):
    df = df.drop(df[df["date"]].sample(n=df["date"].nunique()-days_removed).index)
    return df

hansardSpeeches["sentences"]= pd.Series.tolist(hansardSpeeches["speech"])

sentences = []
for i in range(0,len(questions)):

    start_time = time.time()

    try:
        # Need to first change "./." to "." so that sentences parse correctly
        hansard = questions[i].replace("/.", '')
        # Now apply functions
        sentences += hansard_to_sentences(hansard, tokenizer)
    except:
        print('no!')


print("There are " + str(len(sentences)) + " sentences in our corpus of questions.")
'''

In [None]:
"""
## Get total length of sentences in corpus
def get_total_sentence_length(sentence):
    n = len(sentence)
    return n

hansardSpeeches["sentences_count"] = hansardSpeeches["sentences_tokenized"].apply(get_total_sentence_length)

print("There are",hansardSpeeches["sentences_count"].sum(),"sentences in our corpus of questions.")
#print(hansardSpeeches["sentences_tokenized"].head())
#print(hansardSpeeches["sentences_tokenized"][1])

#print([sentence for sublist in hansardSpeeches["sentences_tokenized"].tolist() for sentence in sublist][2])
"""

In [None]:
def day_removal(hansardSpeeches,days_removed,Seed):
    """Return the speeches that remain after dropping a random set of dates.

    The ``random`` module is re-seeded with ``Seed``, ``days_removed`` distinct
    values are sampled from the frame's ``date`` column, and every row whose
    date is among them is filtered out. The number of distinct dates left is
    printed before returning.

    Parameters
    ----------
    hansardSpeeches : pandas.DataFrame
        Frame with a ``date`` column.
    days_removed : int
        How many distinct dates to discard.
    Seed : int
        Seed passed to ``random.seed`` so the selection is repeatable.

    Returns
    -------
    pandas.DataFrame
        The rows whose date was not selected for removal.
    """
    # Seed first so the sample below is reproducible for a given input.
    random.seed(Seed)

    all_days = hansardSpeeches["date"].unique().tolist()
    dropped_days = random.sample(all_days, days_removed)

    # Boolean mask of rows that fall on one of the discarded dates.
    on_dropped_day = hansardSpeeches["date"].isin(dropped_days)
    remaining = hansardSpeeches[~on_dropped_day]
    print("Number of days in corpus:", remaining["date"].nunique())

    return remaining


# ---------------------------------------------------------------------------
# Word2Vec hyper-parameters
# ---------------------------------------------------------------------------
num_features = 300    # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 6           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

fraction_removed = 0.05 # (Wong) Adjust as needed - Fraction of days that are removed at each training instance
removal_seed = 42       # Seed handed to day_removal on every iteration

total_days = hansardSpeeches["date"].nunique()
print("There are",total_days,"days in corpus")
days_removed = int((total_days)*(fraction_removed))
print("Amount of days to be discarded at each model iteration: ",days_removed)
print("")

# Configure logging once, up front, instead of re-invoking basicConfig on
# every pass through the loop.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)


def _flatten_sentences(speeches):
    """Flatten the per-speech sentence lists into one list of sentences for word2vec."""
    return [sentence for sublist in speeches["sentences_tokenized"].tolist() for sentence in sublist]


def _train_and_save_model(sentences, model_name):
    """Train a Word2Vec model with the settings above, save it, and return the reloaded model."""
    # NOTE(review): `size=`, `init_sims` and `wv.vocab` (used below) look like
    # the gensim 3.x API -- confirm the installed gensim version before upgrading.
    trained = word2vec.Word2Vec(sentences, workers=num_workers,
                size=num_features, min_count=min_word_count,
                window=context, sample=downsampling)
    trained.init_sims(replace=True)
    trained.save(model_name)
    return gensim.models.Word2Vec.load(model_name)


## Iterate until there are insufficient days left.
## Iteration 0 trains on the full corpus; each later iteration first discards
## another `days_removed` dates. `hansardSpeeches` is reassigned inside the
## loop, so re-running this cell without reloading the data continues from the
## already-reduced frame.
i = 0
while fraction_removed*i < 1:
    if i > 0:
        hansardSpeeches = day_removal(hansardSpeeches, days_removed, removal_seed)

    ## sentences is now a list of sentences formatted correctly for word2vec
    sentences = _flatten_sentences(hansardSpeeches)
    print("Current population size =",len(sentences))

    if i == 0:
        print("currently processing: training model")
    else:
        print("currently processing: training model, removing",
              "{0:.2f}".format(fraction_removed*i),
              "of samples")

    # Kept as a module-level timestamp of when this iteration's training began.
    start_time = time.time()

    # One expression covers both the baseline ("0.00") and the reduced models.
    model_name = 'hansardQuestions_removed_{0:.2f}.model'.format(fraction_removed*i)
    model = _train_and_save_model(sentences, model_name)

    vocab = list(model.wv.vocab.keys())

    print("Process complete--the first 25 words in the vocabulary are:")

    print(vocab[:25])
    print("")

    i += 1