In [1]:
# Import required dependencies
import pandas as pd
from io import BytesIO
import numpy as np
import os
import shutil
import pprint
import json
import matplotlib.image as mpimg
import tensorflow as tf
import pathlib
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer


In [2]:
!pip install textract

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl (23 kB)
Collecting beautifulsoup4~=4.8.0
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl (106 kB)
[K     |████████████████████████████████| 106 kB 9.4 MB/s 
[?25hCollecting docx2txt~=0.8
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
Collecting SpeechRecognition~=3.8.1
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 131 kB/s 
[?25hCollecting argcomplete~=1.10.0
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl (36 kB)
Collecting six~=1.12.0
  Downloading six-1.12.0-py2.py3-none-any.whl (10 kB)
Collecting pdfminer.six==20191110
  Downloading pdfminer.six-20191110-py2.py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 35.5 MB/s 
[?25hCollecting python-pptx~=0.6.18
  Downloading python-pptx-0.6.21.tar.gz (10.1 MB)
[K     |

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [4]:
data_path = '/content/gdrive/MyDrive/manning/questionandanswering'

In [5]:
data = pd.read_csv(os.path.join(data_path, 'data.zip'))

In [6]:
data["paragraphs"].describe()

count     12926
unique     6086
top            
freq        497
Name: paragraphs, dtype: object

In [7]:
data.dropna(inplace=True)

## Append questions to corpus

In [9]:
questions = [
"What fuel is used for the manufacturing of chlorine?",
"What metric is used for evaluating emission?",
"How can carbon emission of the processes of cement clinker be reduced?",
"How is the Weighted Cogeneration Threshold calculated?",
"What are carbon capture and sequestration?",
"What stages does CCS consist of?",
"What should be the average energy consumption of a water supply system?",
"What are sludge treatments? -What is the process of anaerobic digestion?",
"How is reforestation defined?",
"What is the threshold of emission for inland passenger water transport?",
"What are the requirements of reporting for electricity generation from natural gas where there might be fugitive emissions?"
             ]

In [11]:
data = data.append(pd.DataFrame(questions, columns=["paragraphs"]))

In [12]:
data =pd.DataFrame(data["paragraphs"])

In [13]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data["paragraphs"].to_numpy())

In [14]:
vectorizer.get_feature_names_out()

array(['00', '00295', '0045', ..., 'zurich', 'zwickel', 'μgnm3'],
      dtype=object)

In [15]:
tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
tfidf.shape

(12948, 7611)

In [17]:
from sklearn.metrics.pairwise import linear_kernel

In [18]:
cosine_sim = linear_kernel(tfidf, tfidf)


In [19]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.060402  ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.060402  , 1.        ,
        0.17135306],
       [0.        , 0.        , 0.        , ..., 0.        , 0.17135306,
        1.        ]])

In [20]:
cosine_sim.shape

(12948, 12948)

In [21]:
data = data.reset_index()

In [22]:
data.index[data["paragraphs"] == questions[7]].tolist()

[12933, 12944]

In [25]:
questions[7]

'What are sludge treatments? -What is the process of anaerobic digestion?'

In [51]:
data.iloc[12933]

index                                                         7
paragraphs    What are sludge treatments? -What is the proce...
Name: 12933, dtype: object

In [23]:
  sim_scores = list(enumerate(cosine_sim[12933]))
  # Sort the movies based on the similarity scores
      
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [28]:
data.iloc[[i[0] for i in sim_scores[2:11]]] #skip first as this is the question being asked

Unnamed: 0,index,paragraphs
5569,5569,Anaerobic digestion of sewage sludge
10404,10404,Anaerobic digestion of sewage sludge
5370,5370,What is covered
5558,5558,53 Anaerobic digestion of sewage sludge
10395,10395,53 Anaerobic digestion of sewage sludge
5580,5580,Anaerobic Digestion AD and in some cases aerob...
149,149,Anaerobic digestion of sewage sludge 298
293,293,Anaerobic digestion of sewage sludge 526
5573,5573,Anaerobic digestion of sewage sludge treatment...


## Doc2Vec library

In [42]:
from gensim.test.utils import common_texts
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4)

In [43]:
def getTaggedDocuments():
  for index, row in data.iterrows():
      tokens = gensim.utils.simple_preprocess(row['paragraphs'])            
      yield gensim.models.doc2vec.TaggedDocument(tokens, [index])

In [44]:
corpus = list(getTaggedDocuments())

In [46]:
model.build_vocab(corpus)

In [48]:
model.train(corpus, total_examples=model.corpus_count, epochs=80)

In [52]:
tokens = questions[7].split()

new_vector = model.infer_vector(tokens)

In [56]:
model.docvecs.most_similar([new_vector]) 

[(5580, 0.6463029980659485),
 (12944, 0.6081377267837524),
 (5671, 0.5736139416694641),
 (2602, 0.5709162950515747),
 (5731, 0.5708827972412109),
 (12933, 0.5385823845863342),
 (3362, 0.5305238962173462),
 (5737, 0.5265952348709106),
 (5270, 0.5187274217605591),
 (5584, 0.5145295858383179)]

In [58]:
list(data.iloc[5580])

[5580,
 'Anaerobic Digestion AD and in some cases aerobic digestion are examples of sludge treatments In AD microorganisms decompose the organic matter of the sludge in the absence of oxygen and produce methanerich biogas ']