The objective of this notebook is to apply all the preprocessing techniques in the same order as they are cited in the paper, except for negation handling and the spelling checker, since we could find neither the word-frequency file nor the antonym file.

In [None]:
from nltk.corpus import stopwords
import nltk
import re
from gensim.models import Word2Vec
import pandas as pd 
import numpy as np
from gensim.models import Word2Vec, KeyedVectors   
import gc
from joblib import Parallel, delayed
import multiprocessing
# Download the NLTK data packages this notebook relies on; judging by their
# names they back the tokenizer, the stop-word list and the POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Read the articles CSV into a DataFrame and preview its first rows.
# NOTE(review): `path` is reassigned later for the embeddings output folder.
path = "../data/articles1.csv"
News = pd.read_csv(path)
News.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


# Punctuation

In [None]:
def unknown_punct(embed, punct):
    """List the symbols of ``punct`` that are absent from ``embed``.

    Args:
        embed: any container supporting ``in`` (e.g. an embedding vocabulary).
        punct: iterable of punctuation strings to look up.

    Returns:
        A single string in which every missing symbol is followed by one space
        (empty when nothing is missing).
    """
    missing = [symbol for symbol in punct if symbol not in embed]
    return ''.join(symbol + ' ' for symbol in missing)
# Replacement table applied before punctuation is padded with spaces: exotic
# quotes, dashes and math/currency symbols are mapped to plain-text stand-ins.
# The original literal listed the key '“' twice (same value); the duplicate,
# which silently shadowed the first entry, has been dropped.
# NOTE(review): "£" and "₹" both map to "e" like "€" does - confirm that is intended.
punct_mapping = {
    # quotes and accents
    "‘": "'", "’": "'", "´": "'", "`": "'", '“': '"', '”': '"',
    # dashes and underscore
    "—": "-", "–": "-", "−": "-", "_": "-",
    # currency and trademark
    "₹": "e", "€": "e", "£": "e", "™": "tm",
    # math symbols and superscripts
    "√": " sqrt ", "×": "x", "÷": "/", "²": "2", "³": "3", "∞": "infinity", "∅": "",
    # greek letters
    "θ": "theta", "α": "alpha", "β": "beta", "π": "pi",
    # miscellaneous
    "°": "", "•": ".", "à": "a",
}

In [None]:
# Every character that clean_special_chars surrounds with spaces.
# The backslash in front of '×' is now written as an explicit '\\': the original
# '\×' was an invalid escape sequence, which Python only tolerates with a warning.
# The resulting string is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
def clean_special_chars(text, punct, mapping):
    """Normalise special characters in ``text`` and isolate punctuation.

    Three passes are applied in order:
      1. every key of ``mapping`` is replaced by its value;
      2. every character of ``punct`` is surrounded by single spaces;
      3. a fixed set of leftover sequences (zero-width space, ellipsis,
         byte-order mark and two Devanagari words) is replaced or removed.

    Args:
        text: the string to clean.
        punct: iterable of punctuation characters to pad with spaces.
        mapping: dict of ``{sequence: replacement}`` applied first.

    Returns:
        The cleaned string.
    """
    for source, target in mapping.items():
        text = text.replace(source, target)

    for symbol in punct:
        text = text.replace(symbol, ' ' + symbol + ' ')

    # Handled last so the ellipsis expansion is not re-padded by the punct pass.
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for source, target in leftovers:
        text = text.replace(source, target)

    return text

# Clean every article body in place, passing the punctuation set and the
# replacement table through to clean_special_chars as extra positional arguments.
News['content'] = News['content'].apply(clean_special_chars, args=(punct, punct_mapping))

In [None]:
# Pull the article bodies out of the frame as a plain Python list.
text = News['content']
X = text.tolist()

# Basic

In [None]:


def majid(X):
    """Normalise a batch of raw documents and tokenize them into one word list.

    Each document is converted to ``str``, stripped of digits, lower-cased,
    whitespace-collapsed, stripped of HTML-like ``<...>`` spans and trimmed,
    then tokenized with ``nltk.word_tokenize``.

    Args:
        X: iterable of documents (the notebook passes one-element lists).

    Returns:
        A flat list with the tokens of every document in ``X``, in order.
        The previous implementation overwrote its result on each iteration and
        therefore returned only the tokens of the last document; for the
        one-element lists used by the notebook the output is unchanged.
    """
    words = []
    for document in X:
        review = re.sub(r'\d+', '', str(document))   # remove numbers
        review = review.lower()                       # lower case
        review = re.sub(r'\s+', ' ', review)          # collapse whitespace
        review = re.sub(r'<[^>]+>', '', review)       # remove HTML tags
        # Removing tags can leave double spaces behind, so collapse again
        # before trimming both ends.
        review = re.sub(r'\s+', ' ', review).strip()
        words.extend(nltk.word_tokenize(review))
    return words

# Wrap each document in its own one-element list so that every parallel task
# receives a single-document batch. Re-running this cell wraps the items again.
X = [[document] for document in X]


In [None]:
# Run the basic cleaning/tokenizing step with one joblib worker per CPU core.
num_cores = multiprocessing.cpu_count()
sentences = Parallel(n_jobs=num_cores)(
    delayed(majid)(batch) for batch in X
)

# POS tagger

In [None]:
def pos_tagger(sentences):
    """Keep only the nouns, adjectives, verbs and adverbs of a token list.

    Args:
        sentences: list of tokens, passed as-is to ``nltk.pos_tag``.

    Returns:
        The tokens whose tag starts with ``NN``, ``JJ``, ``VB`` or ``RB``,
        in their original order.
    """
    # These four prefixes already match every finer-grained tag the original
    # condition spelled out (NNS, NNP, NNPS, JJR, JJS, VBG, VBN, VBP, VBZ, RBR, RBS).
    kept_prefixes = ('NN', 'JJ', 'VB', 'RB')
    return [
        word
        for word, tag in nltk.pos_tag(sentences)
        if tag.startswith(kept_prefixes)
    ]

def majid2(X):
    """Parallel task: POS-filter one token list, then trigger a garbage collection.

    Note: later cells define ``majid2`` again, so this version applies only
    until those cells are run.
    """
    kept_words = pos_tagger(X)
    gc.collect()
    return kept_words

In [None]:

# POS-filter every tokenized document, one joblib worker per CPU core.
num_cores = multiprocessing.cpu_count()
sent_pos_tag = Parallel(n_jobs=num_cores)(
    delayed(majid2)(tokens) for tokens in sentences
)

# Removing Stop Words

In [None]:
def remove_stopwords(sentences):
    """Drop English stop words from a token list.

    Args:
        sentences: list of tokens (one document).

    Returns:
        The tokens that are not in NLTK's English stop-word list, in order.
    """
    # A set gives constant-time membership tests; the original scanned the
    # whole stop-word list for every token.
    stopword_set = frozenset(nltk.corpus.stopwords.words('english'))
    return [word for word in sentences if word not in stopword_set]

def majid2(X):
    """Parallel task: strip stop words from one token list, then collect garbage.

    Note: this replaces the ``majid2`` defined for POS tagging and is itself
    replaced by the stemming cell.
    """
    filtered_words = remove_stopwords(X)
    gc.collect()
    return filtered_words

In [None]:

# Remove stop words from every POS-filtered document, one worker per CPU core.
num_cores = multiprocessing.cpu_count()
sent_stop_word = Parallel(n_jobs=num_cores)(
    delayed(majid2)(tokens) for tokens in sent_pos_tag
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Stemming

In [None]:
def stemming2(sentences):
    """Stem every token of a list with NLTK's English Snowball stemmer.

    Args:
        sentences: list of tokens (one document).

    Returns:
        A new list holding the stem of each token, in the same order.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    stems = []
    for token in sentences:
        stems.append(stemmer.stem(token))
    return stems

def majid2(X):
    """Parallel task: stem one token list, then trigger a garbage collection.

    Note: this is the third definition of ``majid2`` in the notebook and
    replaces the earlier ones.
    """
    stemmed_words = stemming2(X)
    gc.collect()
    return stemmed_words

In [None]:
# Stem every stop-word-filtered document, one joblib worker per CPU core.
num_cores = multiprocessing.cpu_count()
sent2 = Parallel(n_jobs=num_cores)(
    delayed(majid2)(tokens) for tokens in sent_stop_word
)

# CBOW and Skip Gram embeddings

In [None]:
def _train_and_report(corpus, sg):
    """Train one Word2Vec model on ``corpus`` and print its vocabulary size.

    ``sg=0`` selects CBOW and ``sg=1`` selects skip-gram; all other
    hyper-parameters are shared by both models.
    NOTE(review): ``size=`` and ``wv.vocab`` are the gensim 3.x spellings;
    gensim 4 renamed them to ``vector_size`` and ``wv.key_to_index``.

    Returns:
        ``(model, vocab)`` where ``vocab`` is ``model.wv.vocab``.
    """
    model = Word2Vec(
        corpus,
        min_count=3,
        size=300,
        workers=multiprocessing.cpu_count(),
        window=1,
        sg=sg,
    )
    print('Done Training')

    vocab = model.wv.vocab
    print('Size of Vocabulary=', len(vocab))
    print('Done making the Vocabulary')
    return model, vocab


# CBOW model
model1, SizeOfVocab = _train_and_report(sent2, sg=0)

#####
# Skip-gram model
model2, SizeOfVocab = _train_and_report(sent2, sg=1)

Done Training
Size of Vocabulary= 59102
Done making the Vocabulary
Done Training
Size of Vocabulary= 59102
Done making the Vocabulary


# Save Results

In [None]:
# Export each trained model twice: as a plain-text word2vec file under `path`
# and as a native gensim model file.
# NOTE(review): the native `.save(...)` files are written to the current working
# directory, not to `path` - confirm whether that is intended.
path = "../Embeddings/"
exports = [
    (model1, 'W-CBOW-ALL.txt', 'W-CBOW-ALL.bin', 'Done Saving Model1'),
    (model2, 'W-Skip-ALL.txt', 'W-Skip-ALL.bin', 'Done Saving Model2'),
]
for trained, text_name, native_name, message in exports:
    trained.wv.save_word2vec_format(path + text_name, binary=False)
    trained.save(native_name)
    print(message)

print('Done Saving the Embeddings')

Done Saving Model1
Done Saving Model2
Done Saving the Embeddings
