## **Custom Embeddings**

This notebook should ideally be run on Google Colab.

For Google colab, 
1. Make sure you have added the shared folder "digital-forest". 
2. Mount the google drive onto the colab environment. 
    1. Go to the folder icon on the left
    2. Click on the folder icon with the Google Drive logo.
    3. This should mount the drive.
    4. Now all files in your drive are directly accessible in your colab environment.

For running on local environment, 
1. Make sure to change the root path to the local directory.
2. If you run into any errors, double-check the file directory paths.



In [None]:
# Mount Google Drive so the shared "digital-forest" folder is reachable under
# /content/drive. The google.colab package only exists inside Colab, so on a
# local machine the mount is skipped instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

## 1. Load the data

**Note:-** Here the directory should match the directory from your google colab drive. 
To get this
1. Explore the folders in the files section
2. Right-click on the folder whose path you would like to import.
3. Click on Copy Path from the dropdown

In [None]:
# Root folder holding the publisher sub-folders. This is the single path to
# change when running outside Colab (the local-environment notes at the top of
# the notebook ask for the root path to be adjusted).
root_dir = '/content/drive/MyDrive/digital-forest'

# Raw article dumps, one sub-folder per publisher.
mdpi_dir = root_dir + '/mdpi'
elsevier_dir = root_dir + '/elsevier'

In [None]:
# Collect the names of the files that sit directly inside each publisher folder
from os import walk

# The first item produced by walk() describes the folder itself as
# (dirpath, dirnames, filenames); when walk() produces nothing at all the
# listing falls back to an empty list.
mdpi_top_level = next(walk(mdpi_dir), None)
mdpi_filenames = mdpi_top_level[2] if mdpi_top_level is not None else []

elsevier_top_level = next(walk(elsevier_dir), None)
elsevier_filenames = elsevier_top_level[2] if elsevier_top_level is not None else []

In [None]:
# Peek at the first three MDPI file names as a sanity check on the listing
mdpi_filenames[:3]

['10.3390_su12030932.html',
 '10.3390_su132111674.html',
 '10.3390_rs90201023.html']

In [None]:
# Peek at the first three Elsevier file names as a sanity check on the listing
elsevier_filenames[:3]

['10.1016_0034-4257(95)00228-6.xml',
 '10.1016_0168-1923(95)02268-6.xml',
 '10.1016_0034-4257(95)00235-9.xml']

## 2. Get text data from all files

In [None]:
import imp
from bs4 import BeautifulSoup
import re
import pandas as pd

def extract_text_from_html(mdpi_dir, mdpi_file_name):
    """Return the cleaned, lower-cased body text of one MDPI HTML article.

    Args:
        mdpi_dir: Directory that contains the HTML file.
        mdpi_file_name: Name of the HTML file inside ``mdpi_dir``.

    Returns:
        One string with the text of the first ``<article>`` element, in lower
        case, where every run of characters other than ASCII letters, '.'
        and ',' has been replaced by a single space.

    Raises:
        ValueError: If the document contains no ``<article>`` element.
        OSError: If the file cannot be opened.
    """
    import os  # local import so the function does not rely on another cell

    with open(os.path.join(mdpi_dir, mdpi_file_name), "r", encoding='utf-8') as f:
        html_file = f.read()
    soup = BeautifulSoup(html_file, 'html.parser')

    article = soup.find('article')
    if article is None:
        # Previously surfaced as "'NoneType' object has no attribute 'find_all'",
        # which hid the real cause from the failure log.
        raise ValueError("no <article> element found in {}".format(mdpi_file_name))

    # `string=True` selects every text node (replaces the deprecated `text=True`)
    text_list = article.find_all(string=True)
    article_text = " ".join(text_list)

    # Remove special characters and numbers; newlines are not in the allowed
    # set either, so they are turned into spaces by the same substitution
    clean_text = re.sub('[^.,A-Za-z]+', ' ', article_text)
    # Convert all text to lower
    clean_text = clean_text.lower()

    return clean_text

In [None]:
def extract_title_from_html(mdpi_dir, mdpi_file_name):
    """Return the title of one MDPI HTML article.

    The title is the first text node found inside the first ``<h1>`` element,
    with newline characters removed.

    Args:
        mdpi_dir: Directory that contains the HTML file.
        mdpi_file_name: Name of the HTML file inside ``mdpi_dir``.

    Returns:
        The title as a string.

    Raises:
        ValueError: If the document has no ``<h1>`` element or the element
            contains no text.
        OSError: If the file cannot be opened.
    """
    import os  # local import so the function does not rely on another cell

    with open(os.path.join(mdpi_dir, mdpi_file_name), "r", encoding='utf-8') as f:
        html_file = f.read()
    soup = BeautifulSoup(html_file, 'html.parser')

    title = soup.find('h1')
    if title is None:
        raise ValueError("no <h1> element found in {}".format(mdpi_file_name))

    # `string=True` selects text nodes (replaces the deprecated `text=True`)
    title_name = title.find(string=True)
    if title_name is None:
        raise ValueError("<h1> element has no text in {}".format(mdpi_file_name))

    return title_name.replace('\n', '')

In [None]:
# Build the MDPI corpus: one [title, cleaned text] record per article
rows = [[],[]]
mdpi_corpus = []
# NOTE: the Elsevier extraction cell re-initialises `failed_files`, so inspect
# this list before running that cell.
failed_files = []

for file_name in mdpi_filenames:
    # Extraction can fail for individual files. Failures are recorded and
    # reported instead of stopping the loop, so they can be analysed afterwards.
    try:
        title = extract_title_from_html(mdpi_dir, file_name)
        extracted_text = extract_text_from_html(mdpi_dir, file_name)
    except Exception as e:
        failed_files.append(file_name)
        print("Error while extracting text for {}".format(file_name), e)
    else:
        rows = [title, extracted_text]
        mdpi_corpus.append(rows)

Error while extracting text for 10.3390_rs90201023.html 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.3390_rs90201024.html 'NoneType' object has no attribute 'find_all'


In [None]:
# Save the MDPI corpus as a CSV file with the columns Title and Content.
# Passing the column names to the constructor also works for an empty corpus,
# whereas assigning `.columns` afterwards fails when there are no records.
dataFrame = pd.DataFrame(mdpi_corpus, columns=['Title', 'Content'])
dataFrame.to_csv('Mdpi_text.csv')

# Offer the file as a browser download when running inside Colab; the
# google.colab package does not exist elsewhere, so the CSV just stays on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available - Mdpi_text.csv was written to the working directory.")
else:
    files.download('Mdpi_text.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def extract_text_from_xml(elsevier_dir, elsevier_file_name):
    """Return the cleaned, lower-cased body text of one Elsevier XML article.

    Args:
        elsevier_dir: Directory that contains the XML file.
        elsevier_file_name: Name of the XML file inside ``elsevier_dir``.

    Returns:
        One string with the text of the article element, in lower case, where
        every run of characters other than ASCII letters, '.' and ',' has
        been replaced by a single space.

    Raises:
        ValueError: If no article element can be located in the document.
        OSError: If the file cannot be opened.
    """
    import os  # local import so the function does not rely on another cell

    with open(os.path.join(elsevier_dir, elsevier_file_name), "r", encoding='utf-8') as f:
        xml_file = f.read()
    soup = BeautifulSoup(xml_file, 'xml')

    article = soup.find('article')
    if article is None:
        # NOTE(review): the recorded run shows None lookups for the Elsevier
        # files; presumably the elements carry a namespace prefix (for example
        # <ja:article>) - confirm against a sample file. This fallback matches
        # on the local tag name regardless of the prefix.
        article = soup.find(lambda tag: tag.name.rsplit(':', 1)[-1] == 'article')
    if article is None:
        raise ValueError("no article element found in {}".format(elsevier_file_name))

    # `string=True` selects every text node (replaces the deprecated `text=True`)
    text_list = article.find_all(string=True)
    article_text = " ".join(text_list)

    # Remove special characters and numbers; newlines are not in the allowed
    # set either, so they are turned into spaces by the same substitution
    clean_text = re.sub('[^.,A-Za-z]+', ' ', article_text)
    # Convert all text to lower
    clean_text = clean_text.lower()

    return clean_text

In [None]:
def extract_title_from_xml(elsevier_dir, elsevier_file_name):
    """Return the title of one Elsevier XML article as a plain string.

    The previous version returned the list of text nodes, which ended up as a
    list in the 'Title' column; the HTML counterpart returns a string, and
    this function now does the same.

    Args:
        elsevier_dir: Directory that contains the XML file.
        elsevier_file_name: Name of the XML file inside ``elsevier_dir``.

    Returns:
        The text of the first title element with runs of whitespace
        (including newlines) collapsed to single spaces.

    Raises:
        ValueError: If no title element can be located in the document.
        OSError: If the file cannot be opened.
    """
    import os  # local import so the function does not rely on another cell

    with open(os.path.join(elsevier_dir, elsevier_file_name), "r", encoding='utf-8') as f:
        xml_file = f.read()
    soup = BeautifulSoup(xml_file, 'xml')

    title = soup.find('title')
    if title is None:
        # NOTE(review): the recorded run shows None lookups for the Elsevier
        # files; presumably the title carries a namespace prefix (for example
        # <dc:title>) - confirm against a sample file. This fallback matches
        # on the local tag name regardless of the prefix.
        title = soup.find(lambda tag: tag.name.rsplit(':', 1)[-1] == 'title')
    if title is None:
        raise ValueError("no title element found in {}".format(elsevier_file_name))

    # Concatenate the text nodes as they appear, then normalise the whitespace
    raw_title = "".join(title.find_all(string=True))
    title_name = " ".join(raw_title.split())

    return title_name

In [None]:
# Build the Elsevier corpus: one [title, cleaned text] record per article
rows = [[],[]]
elsevier_corpus = []
failed_files = []
for file_name in elsevier_filenames:
    # Extraction can fail for individual files. Failures are recorded and
    # reported instead of stopping the loop, so they can be analysed afterwards.
    try:
        title = extract_title_from_xml(elsevier_dir, file_name)
        extracted_text = extract_text_from_xml(elsevier_dir, file_name)
    except Exception as e:
        failed_files.append(file_name)
        print("Error while extracting text for {}".format(file_name), e)
    else:
        rows = [title, extracted_text]
        elsevier_corpus.append(rows)

Error while extracting text for 10.1016_0034-4257(95)00228-6.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_0168-1923(95)02268-6.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_0034-4257(95)00235-9.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_0034-4257(95)00230-8.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_j.agrformet.2017.01.020.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_j.agrformet.2014.09.010.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_j.agrformet.2015.09.003.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_j.agrformet.2009.04.008.xml 'NoneType' object has no attribute 'find_all'
Error while extracting text for 10.1016_j.agrformet.2016.08.001.xml 'NoneType' object has no attribu

In [None]:
# Save the Elsevier corpus as a CSV file with the columns Title and Content.
# Passing the column names to the constructor also works for an empty corpus,
# whereas assigning `.columns` afterwards fails when there are no records.
dataFrame = pd.DataFrame(elsevier_corpus, columns=['Title', 'Content'])
dataFrame.to_csv('Elsevier_text.csv')

# Offer the file as a browser download when running inside Colab; the
# google.colab package does not exist elsewhere, so the CSV just stays on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available - Elsevier_text.csv was written to the working directory.")
else:
    files.download('Elsevier_text.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 3. Setup glove embeddings 

**This is required on Google Colab as data is not stored permanently.**

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip "glove.6B.zip"

## 4. Setup pipeline to make custom embeddings

In [None]:
import gensim
from gensim.test.utils import get_tmpfile, datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import Word2Vec

In [None]:
# Location of the unpacked GloVe vectors file (the "50d" file of the archive).
# NOTE(review): datapath() looks like a gensim test-data helper; it is given an
# absolute path here - confirm it returns that path unchanged.
glove_file = datapath('/content/glove.6B.50d.txt')
# Temporary file that receives the converted copy of the vectors
tmp_file = get_tmpfile("test_word2vec.txt")

# Convert the GloVe text file to word2vec text format, then load the converted
# file as KeyedVectors. The return value of the conversion is not needed.
_ = glove2word2vec(glove_file, tmp_file)
glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)

### 4.1 Process the corpus to the input format required by Word2Vec algorithm

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# Download the 'punkt' resource - presumably the model the tokenizers imported
# above rely on; confirm for the installed NLTK version.
nltk.download('punkt')

In [None]:
# First we combine the text of all the records into one single string.
# Each mdpi_corpus record is a [title, content] pair, so the content strings
# are picked out first; joining the records themselves raises a TypeError
# because str.join only accepts strings.
full_text = " ".join(content for _title, content in mdpi_corpus)

In [None]:
# Build the Word2Vec training input: one list of word tokens per sentence.
sentences = []
# Each mdpi_corpus record is a [title, content] pair; only the content string
# is tokenised (sent_tokenize expects a string, not the whole record).
for _title, content in mdpi_corpus:
    # Break down each document in the corpus to a list of sentences
    for sent in sent_tokenize(content):
        # For each sentence break it into a list of words (tokenised once)
        sentences.append(word_tokenize(sent))

In [None]:
# Report how many tokenised sentences were collected for training
print("We have {} sentences in the corpus".format(len(sentences)))

### 4.2 Setup Word2Vec model

In [None]:
# build a word2vec model on your dataset
# The vector size of 50 matches the 50d GloVe file loaded earlier.
# NOTE(review): `size=` looks like the gensim 3.x keyword (newer releases use
# `vector_size=`) - confirm against the installed gensim version.
# NOTE(review): no seed is passed, so results may differ between runs - confirm
# whether reproducibility matters here.
base_model = Word2Vec(size=50, window=5, min_count=3, workers=4)
# Build the initial vocabulary from the tokenised corpus sentences
base_model.build_vocab(sentences)

In [None]:
# Remember the model's corpus_count at this point, i.e. after the vocabulary
# was built from our own sentences; it is passed to train() further below.
total_examples = base_model.corpus_count

In [None]:
# Number of distinct words in the model's vocabulary built from the corpus
len(base_model.wv.vocab)

In [None]:
# Compare the corpus vocabulary with the GloVe vocabulary
corpus_vocab = set(base_model.wv.vocab.keys())
glove_vocab = set(glove_vectors.vocab.keys())

# Words only our corpus knows vs. words both vocabularies share
unique_words = corpus_vocab - glove_vocab
common_words = corpus_vocab & glove_vocab

print("Unique words to our corpus {}".format(len(unique_words)))
print("Common words between corpus and glove {}".format(len(common_words)))

### 4.3 Train Word2Vec model

In [None]:
# update our model with GloVe's vocabulary & weights
# Step 1: register the GloVe words by passing them as one extra "sentence".
base_model.build_vocab([list(glove_vectors.vocab.keys())], update=True)
# Step 2: build_vocab only registers words, it does not copy any vectors.
# Load the GloVe weights from the converted word2vec-format file for every
# word the model shares with it. lockf=1.0 keeps the imported vectors
# trainable so they are fine-tuned on our corpus instead of being frozen.
base_model.intersect_word2vec_format(tmp_file, binary=False, lockf=1.0)

In [None]:
# train on your data
# total_examples is the corpus_count captured before the GloVe words were
# added to the vocabulary; training runs for 100 epochs over our sentences.
base_model.train(sentences, total_examples=total_examples, epochs=100)
# Keep a handle on the model's word vectors for the similarity queries below
base_model_wv = base_model.wv

### 4.4 Analyze our embeddings

In [None]:
# Show ten of the words that occur in our corpus but not in GloVe
# (sets are unordered, so the selection is arbitrary)
list(unique_words)[:10]

In [None]:
# Check whether the example word is shared by the corpus and GloVe vocabularies
'geoinform' in common_words

In [None]:
# Query the trained vectors for the words most similar to the example word
# NOTE(review): presumably this raises if the word is not in the vocabulary - confirm
base_model_wv.most_similar('geoinform')