In [None]:
!pip install datasets
!pip install gensim
!pip install nltk



In [None]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd

In [None]:
# Clones the Hugging Face `datasets` source repository into the working directory.
# NOTE(review): no later cell in this notebook reads from the checkout (the
# library itself is pip-installed in the first cell) — confirm it is still needed.
!git clone https://github.com/huggingface/datasets.git


In [None]:
# Smoke test run in a separate Python process: load the SQuAD `train` split
# and print its first record.
!python -c "from datasets import load_dataset; print(load_dataset('squad', split='train')[0])"


Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/0fd9e01360d229a22adfe0ab7e2dd2adc6e2b3d6d3db03636a51235947d4c6e9)
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'id': '5733be284776f41900661182', 'question': 'To whom did the Vir

In [None]:
# Loads `climate_fever` in a separate Python process. The `dataset` variable
# created there does not reach this kernel; the in-kernel load happens in the
# "Loading Dataset" cell below.
!python -c "from datasets import load_dataset; dataset = load_dataset('climate_fever')" 

Using custom data configuration default
Reusing dataset climate_fever (/root/.cache/huggingface/datasets/climate_fever/default/1.0.1/3b846b20d7a37bc0019b0f0dcbde5bf2d0f94f6874f7e4c398c579f332c4262c)


**Loading Dataset**

In [None]:
from datasets import load_dataset

# Load the climate_fever dataset into this kernel; the cells below read its
# `test` split.
dataset = load_dataset(path='climate_fever')


Using custom data configuration default
Reusing dataset climate_fever (/root/.cache/huggingface/datasets/climate_fever/default/1.0.1/3b846b20d7a37bc0019b0f0dcbde5bf2d0f94f6874f7e4c398c579f332c4262c)


**Building Corpus**

In [None]:
import re

# Only the `test` split is read in this cell.
dataset_features = dataset['test']
# Not filled in by this cell; kept so the name stays defined.
claim_evidences = []

# One or more consecutive characters that are not ASCII letters
# (digits, punctuation, brackets, whitespace, ...).
NON_LETTER_RUN = re.compile(r"[^a-zA-Z]+")


def clean_claim(raw_claim):
    """Reduce a claim to lowercase ASCII words separated by single spaces.

    Every run of non-letter characters is replaced by exactly one space and
    the result is lowercased. This yields the same string as replacing each
    non-letter with a space and then collapsing whitespace runs. A leading or
    trailing space is kept when the claim starts or ends with a non-letter.

    The former bracket/digit pattern (`[[0-9]*\\]`) was dropped: it ran after
    all non-letters had already been replaced, so it could never match.
    """
    return NON_LETTER_RUN.sub(" ", str(raw_claim)).lower()


# One cleaned string per record, in dataset order.
claims = [clean_claim(record['claim']) for record in dataset_features]


**Preprocessing: removing stopwords and tokenizing**

In [None]:
from nltk.corpus import stopwords
import nltk
import re

# Fetch the NLTK resources used below (tokenizer model and stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')

# English stop words as provided by NLTK, plus a set copy so that the
# per-token membership test below is a hash lookup instead of a list scan.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)

# Tokenize each cleaned claim into words and drop the stop words in one pass.
# `corpus` is a list of token lists, one per claim, in the order of `claims`.
corpus = [
    [word for word in nltk.word_tokenize(claim) if word not in stop_word_set]
    for claim in claims
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Embedding the entire dataset**

In [None]:
# NOTE(review): gensim is already installed and these names are already
# imported in the opening cells of this notebook; this cell repeats them.
!pip install gensim
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd

**Splitting the data**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tokenized claims; the fixed random_state keeps the
# split identical between runs.
corpus_train, corpus_test = train_test_split(
    corpus,
    test_size=0.2,
    random_state=0,
)


**CM 1 - Part 1 -- Word2Vec embedding on entire corpus and train**

Why the Word2Vec parameters:

Generally we can't determine the hyperparameters of a word2vec model since it is automatically learnt by the model in training.

What we have done below is select and alter some parameters with some range of flexibility.

For this model, we used the default model type of **CBOW** because we wanted the surrounding words to predict the word they flank and because our dataset of words is relatively small.

We also picked a **minimum count of 1** because, with such a small dataset, we wanted as many of the words as possible to be included.

We selected the **window of words to be 7** because a cursory look at the sentences in the data showed that each sentence averaged 7 words.

We picked a **dimensionality of 50** mostly because tests with a dimensionality of 100 and above didn't show significant differences in the word vectors, possibly due to the size of the dataset and its general theme.

Increasing the number of epochs  benefits the quality of the word representations. Therefore, we set the iter = 100

In [None]:
import gensim

# The embedding-size and epoch arguments were renamed in gensim 4
# (`size` -> `vector_size`, `iter` -> `epochs`). Pick the spelling that the
# installed release understands so the cell runs on either major version.
if int(gensim.__version__.split('.')[0]) >= 4:
    size_and_epochs = {'vector_size': 50, 'epochs': 100}
else:
    size_and_epochs = {'size': 50, 'iter': 100}

# CBOW (the default), window of 7, 50-dimensional vectors, 100 epochs, and
# min_count=1 so that every word of the small corpus gets a vector.
# A single worker thread and an explicit seed keep the training order fixed,
# so the similarity scores reported below can be reproduced.
# NOTE(review): full run-to-run reproducibility presumably also needs a fixed
# PYTHONHASHSEED — confirm against the gensim documentation.
wv_model = Word2Vec(
    corpus,
    min_count=1,
    window=7,
    workers=1,
    seed=1,
    **size_and_epochs,
)


Increasing the number of epochs benefits the quality of the word representations. Therefore, we set the iter = 100

**CM 1 - Part 2 -- Comparing the cosine similarity of a couple words**

From the brief comparisons below based on the context of the document, we can see that the words that in context share a lot of similarity and could be used around each other typically have a high cosine similarity.

The values below are arranged in order of decreasing similarity.

For example: 
1. polar and bears (0.81767243) -> have an expected similarity because both words are typically found in the same context.
2. climate and temperature (-0.25) -> These two words were probably not close to each other within sentences. Even though they have contextual similarity to some extent, the similarity score is low because of the dataset and training limitations.
---------------------------------------------------------------
Below, we take a look at the semantic relationship between a word and its most similar words based on this dataset:

- For 'Man': 'made', 'contributions', 'legislators','believe', 'impacting'
- For 'human': 'activities', 'primarily', 'largely', 'anthropogenic', 'civilization'
- For 'warming': conspiring, adapting, dimming, global

Considering the general context of this dataset, one can see why these words are the most similar to the selected words - they actually make contextual sense as the dataset details how **human** **activities** / **contributions** across the years have **caused** / effected a trend of climate **changes** / **warming** (**temperature**) of the planet.  


In [None]:
# Word pairs whose cosine similarity is printed, one score per line, in this
# order. The first pair is climate / warming (it previously queried 'warning',
# which did not match its own description).
word_pairs = [
    ('climate', 'warming'),         # 1st set
    ('polar', 'bears'),             # 2nd set
    ('climate', 'temperature'),     # 3rd set
    ('hot', 'ice'),                 # 4th set
    ('temperature', 'ice'),         # 5th set
    ('earthquakes', 'extinction'),  # 6th set
    ('windmill', 'roman'),          # 7th set
    ('residency', 'atmospheric'),   # 8th set
]
for first_word, second_word in word_pairs:
    print(wv_model.wv.similarity(first_word, second_word))

# -----------------------------------------------
# List of the most similar words to a few words from the data set,
# printed in this order: man, human, warming.
for query_word in ('man', 'human', 'warming'):
    print(wv_model.wv.most_similar(query_word))

0.4332425
0.81767243
0.1554002
-0.22689188
0.07111336
0.3665973
0.41370222
0.59930485
[('made', 0.9193103909492493), ('contributions', 0.7849156856536865), ('legislators', 0.7373270392417908), ('prosperity', 0.730117678642273), ('illegal', 0.7162542939186096), ('aware', 0.7099815011024475), ('minimal', 0.708002507686615), ('believe', 0.701858401298523), ('courts', 0.6997411251068115), ('highlight', 0.691326916217804)]
[('activities', 0.7813305854797363), ('primarily', 0.7728374600410461), ('largely', 0.7700099349021912), ('anthropogenic', 0.7380788922309875), ('aerosol', 0.7208808064460754), ('mainly', 0.7004191875457764), ('dominant', 0.6888383626937866), ('emissions', 0.6688092947006226), ('fingerprints', 0.6635338068008423), ('discredits', 0.6600527167320251)]
[('dimming', 0.5790356397628784), ('cooling', 0.5767964124679565), ('global', 0.5641350746154785), ('post', 0.5369208455085754), ('adapting', 0.5351721048355103), ('sulfate', 0.5264771580696106), ('observed', 0.521841704845428

**CM 1 - Part 3 -- Analyze the quality of embeddings**

All of the arithmetic computations below, except for the 1st and 3rd, reveal a relatively significant relationship between the vectors on either side of the equation, implying some correlation between semantic deduction and the vector embeddings.

The 2nd computation "carbon + oxygen = co" shows a significant relationship. This could be due to the high occurrence of the words in the dataset and also the simplicity of the words.

The 5th computation "ice + melts = sea + rises" shows a significant relationship. Once again, this could be due to the high occurrence of the words in the dataset and also the simplicity of the words.

The 3rd computation "raining + increased = flood" shows a less significant relationship. This could be due to the low occurrence of the words in the dataset and also the increased complexity of the words.

We suspect that these results have a lot to do with the small size of the dataset, since the smaller range of words and context will inevitably mean that a lot of the words will hold some level of similarity irrespective of their contextual meanings. Nonetheless, as shown below, the model can be quite useful.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from numpy import linalg, mat, dot

# Arithmetic computations on the model trained above.
# Each entry reads "sum of the left-hand words = sum of the right-hand words".
# For every entry the cosine similarity between the two summed vectors is
# printed (as a 1x1 matrix) and kept in quality_1 .. quality_5.
trained_equations = [
    (('temperatures', 'increase'), ('warmer',)),                   # 1st set
    (('carbon', 'oxygen'), ('co',)),                               # 2nd set
    (('raining', 'increased'), ('flood',)),                        # 3rd set
    (('burning', 'fossil', 'fuels'), ('carbon', 'dioxide')),       # 4th set
    (('ice', 'melts'), ('sea', 'rises')),                          # 5th set
]

trained_vectors = wv_model.wv
trained_scores = []
for left_words, right_words in trained_equations:
    # Add the word vectors of each side, left to right, into one row matrix.
    left_side = mat(sum(trained_vectors[word] for word in left_words))
    right_side = mat(sum(trained_vectors[word] for word in right_words))
    # Cosine similarity: dot product divided by both vector norms.
    score = dot(left_side, right_side.T) / linalg.norm(left_side) / linalg.norm(right_side)
    print(score)
    trained_scores.append(score)

quality_1, quality_2, quality_3, quality_4, quality_5 = trained_scores



[[0.3664157]]
[[0.56710255]]
[[0.25954226]]
[[0.51425445]]
[[0.6261726]]


**CM 1 - Part 4 -- Loading two pretrained models - GloVe vs Word2Vec**

GloVe model: glove-wiki-gigaword-50

Word2Vec model: word2vec-google-news-300

GloVe                 Word2Vec

[[0.56682545]] -> [[0.53775185]]

[[0.13740683]] -> [[0.10254703]]

[[0.20376654]] -> [[0.21976838]]

[[0.5637933]] -> [[0.46399045]]

[[0.34571153]] -> [[0.3586582]]


On the left side, we have the GloVe model results and the Word2Vec model results on the right.

The 1st computation has a higher score in the pre-trained models compared to our trained model. This actually makes sense because "temperatures + increase = warmer" has a clear semantic relationship. The pretrained models were able to deduce the relationship because of the vast training dataset.

The 2nd computation has a lower score in the pre-trained models compared to our trained model. The words in "carbon + oxygen = co" occur relatively frequently in our very limited dataset, and the simplicity of the word "co" allows our trained model to deduce a relationship. However, in the real world or in larger, more diverse datasets, "carbon + oxygen = co" may not have a significant and accurate relationship; the words might not occur close to each other frequently, and their infrequent occurrences might be negligible in vast datasets. Therefore, the pretrained models deduce this relationship to be insignificant.

The 4th computation has a good score in the pre-trained models compared to our trained model. This actually makes sense because the words in "burning + fossil + fuels = carbon + dioxide" appear together in single sentences throughout Wikipedia and Google News. The pretrained models were able to deduce the relationship because of the vast training dataset. Similarly, our trained model also deduced that this relationship is significant because these words frequently appeared in sentences in our training data.

As was expected the arithmetic computation makes more sense in pre-trained model compared to the model that we trained on the **climate_fever** corpus. This is mainly because the climate fever dataset is small, hence a lot of context has been lost.

An interesting observation from the above comparison is the similarity between the GloVe and Word2Vec scores. They even share the same direction for the 3rd set of words, and we think that can be attributed to how generic both models are. This is mainly because of the vast and diverse datasets they are trained on.


In [None]:
import gensim.downloader as api

# `info` is not used by the rest of this cell.
info = api.info()
model_gv_load = api.load("glove-wiki-gigaword-50")

# Arithmetic computations on the pretrained GloVe vectors.
# Each entry is (words to add, words to subtract, words of the expected result).
# For every entry the cosine similarity between the combined left-hand vector
# and the summed right-hand vector is printed (as a 1x1 matrix) and kept in
# quality_gv_1 .. quality_gv_6.
glove_equations = [
    (('temperatures', 'increase'), (), ('warmer',)),                  # 1st set
    (('carbon', 'oxygen'), (), ('co',)),                              # 2nd set
    (('raining', 'increased'), (), ('flood',)),                       # 3rd set
    (('burning', 'fossil', 'fuels'), (), ('carbon', 'dioxide')),      # 4th set
    (('ice', 'melts'), (), ('sea', 'rises')),                         # 5th set
    # 6th set: king - man + woman = queen. 'man' is subtracted, as the
    # equation states (it used to be added).
    (('king', 'woman'), ('man',), ('queen',)),
]

glove_scores = []
for added_words, subtracted_words, expected_words in glove_equations:
    # An empty `subtracted_words` sums to 0 and leaves the vector unchanged.
    left_side = mat(
        sum(model_gv_load[word] for word in added_words)
        - sum(model_gv_load[word] for word in subtracted_words)
    )
    right_side = mat(sum(model_gv_load[word] for word in expected_words))
    # Cosine similarity: dot product divided by both vector norms.
    score = dot(left_side, right_side.T) / linalg.norm(left_side) / linalg.norm(right_side)
    print(score)
    glove_scores.append(score)

(quality_gv_1, quality_gv_2, quality_gv_3,
 quality_gv_4, quality_gv_5, quality_gv_6) = glove_scores


[[0.56682545]]
[[0.13740683]]
[[0.20376654]]
[[0.5637933]]
[[0.34571153]]
[[0.5455089]]


In [None]:
# `info` is not used by the rest of this cell.
info = api.info()
model_wv_loaded = api.load("word2vec-google-news-300")

# The pretrained vectors are used as loaded; no further training on the
# climate corpus is attempted in this cell.

# Arithmetic computations on the pretrained Word2Vec vectors.
# Each entry is (words to add, words to subtract, words of the expected result).
# Every lookup indexes `model_wv_loaded` directly; the earlier mix of direct
# indexing and `.wv[...]` is gone.
# Scores are printed (as 1x1 matrices) and kept in quality_w2v_1 ..
# quality_w2v_6, so the GloVe result `quality_gv_6` is no longer overwritten.
w2v_equations = [
    (('temperatures', 'increase'), (), ('warmer',)),                  # 1st set
    (('carbon', 'oxygen'), (), ('co',)),                              # 2nd set
    (('raining', 'increased'), (), ('flood',)),                       # 3rd set
    (('burning', 'fossil', 'fuels'), (), ('carbon', 'dioxide')),      # 4th set
    (('ice', 'melts'), (), ('sea', 'rises')),                         # 5th set
    # 6th set: king - man + woman = queen. 'man' is subtracted, as the
    # equation states (it used to be added).
    (('king', 'woman'), ('man',), ('queen',)),
]

w2v_scores = []
for added_words, subtracted_words, expected_words in w2v_equations:
    # An empty `subtracted_words` sums to 0 and leaves the vector unchanged.
    left_side = mat(
        sum(model_wv_loaded[word] for word in added_words)
        - sum(model_wv_loaded[word] for word in subtracted_words)
    )
    right_side = mat(sum(model_wv_loaded[word] for word in expected_words))
    # Cosine similarity: dot product divided by both vector norms.
    score = dot(left_side, right_side.T) / linalg.norm(left_side) / linalg.norm(right_side)
    print(score)
    w2v_scores.append(score)

(quality_w2v_1, quality_w2v_2, quality_w2v_3,
 quality_w2v_4, quality_w2v_5, quality_w2v_6) = w2v_scores


[[0.53775185]]
[[0.10254703]]
[[0.21976836]]
[[0.46399045]]
[[0.3586582]]
[[0.5243245]]


  # Remove the CWD from sys.path while we load stuff.


Note: We encountered some issues training the pre-trained model on the climate dataset but we were not bothered about this since the climate
dataset is very small compared to that of the pre-trained model so changes to similarity between words if any would be minimal.