In [94]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [95]:
nltk.download("stopwords")
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [96]:
english_stopset = set(stopwords.words('english')).union(
                  {"things", "that's", "something", "take", "don't", "may", "want", "you're",
                   "set", "might", "says", "including", "lot", "much", "said", "know",
                   "good", "step", "often", "going", "thing", "things", "think",
                   "back", "actually", "better", "look", "find", "right", "example",
                                                                  "verb", "verbs"})


In [97]:
docs = ['i loved you ethiopian, stored elements in Compress find Sparse Ethiopia is the greatest country in the world of nation at universe',

        'also, sometimes, the same words can have multiple different ‘lemma’s. So, based on the context it’s used, you should identify the \
        part-of-speech (POS) tag for the word in that specific context and extract the appropriate lemma. Examples of implementing this comes \
        in the following sections countries.ethiopia With a planned.The name that the Blue Nile river loved took in Ethiopia is derived from the \
        Geez word for great to imply its being the river of rivers The word Abay still exists in ethiopia major languages',

        'With more than  million people, ethiopia is the second most populous nation in Africa after Nigeria, and the fastest growing \
         economy in the region. However, it is also one of the poorest, with a per capita income',

        'The primary purpose of the dam ethiopia is electricity production to relieve Ethiopia’s acute energy shortage and for electricity export to neighboring\
         countries.ethiopia With a planned.',

        'The name that the Blue Nile river loved takes in Ethiopia "abay" is derived from the Geez blue loved word for great to imply its being the river of rivers The \
         word Abay still exists in Ethiopia major languages to refer to anything or anyone considered to be superior.',

        'Two non-upgraded loved turbine-generators with MW each are the first loveto go into operation with loved MW delivered to the national power grid. This early power\
         generation will start well before the completion']

title = ['Two upgraded', 'Loved Turbine-Generators', 'Operation With Loved', 'National', 'Power Grid', 'Generator']

keywords = ['two','non','loved','ethiopia','operation','grid','power','fight','survive']  #we can generate keywords from articls using 'spacy'

In [98]:
documents_clean = []
documents_cleant = []

In [99]:
for d in docs:
    document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)  #Replace non-ASCII characters with space
    document_test = re.sub(r'@\w+', '', document_test)  #eliminate duplicate whitespaces/ # Remove Mentions
    document_test = document_test.lower() #converting to lower
    document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test) #cleaning punctuation
    document_test = re.sub(r'[0-9]', '', document_test) #replacing number with empity string
    document_test = re.sub(r'\s{2,}', ' ', document_test)  # Remove the doubled space
    documents_clean.append(document_test)
    documents_cleant.append(document_test)

lemmer=WordNetLemmatizer()
new_docs=[' '.join([lemmer.lemmatize(docs) for docs in text.split(',')]) for text in docs]  #Lemmatization the words/description
titles = [' '.join([lemmer.lemmatize(title).strip() for title in text.split(' ')]) for text in title]   #Lemmatization the title

#better than https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [100]:
english_stopset = list(stopwords.words('english'))

In [101]:
vectorizer = TfidfVectorizer(analyzer='word',
                              ngram_range=(1, 2),
                              min_df=0.002,
                              max_df=0.99,
                              max_features=10000,
                              lowercase=True,
                              stop_words=english_stopset)

In [102]:
X = vectorizer.fit_transform(new_docs)

In [103]:
# Create a DataFrame
df = pd.DataFrame(X.T.toarray())
print(df.head(10))
print(df.shape)

     0         1         2         3         4    5
0  0.0  0.085345  0.000000  0.000000  0.233406  0.0
1  0.0  0.000000  0.000000  0.000000  0.142318  0.0
2  0.0  0.085345  0.000000  0.000000  0.116703  0.0
3  0.0  0.000000  0.000000  0.173941  0.000000  0.0
4  0.0  0.000000  0.000000  0.173941  0.000000  0.0
5  0.0  0.000000  0.167583  0.000000  0.000000  0.0
6  0.0  0.000000  0.167583  0.000000  0.000000  0.0
7  0.0  0.085345  0.137421  0.000000  0.000000  0.0
8  0.0  0.000000  0.167583  0.000000  0.000000  0.0
9  0.0  0.104077  0.000000  0.000000  0.000000  0.0
(231, 6)


In [107]:
def get_similar_articles(q,t, df):
  print("Done Searching. Full Result: \n")
  print("searched items : ", q)
  print("Article with the Highest Cosine Similarity Values: ")
  search_rank ={}
  top_results=5
  q = [q]
  t = [t]

  q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
  q_vect = vectorizer.transform(t).toarray().reshape(df.shape[0],)
  sim = {}
  titl = {}

  for i in range(len(new_docs)) and range(len(titles)):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)  #Calculate the similarity
    # Or we can use cosine)similarity library both are the same
    titl[i] = np.dot(df.loc[:, i].values, q_vect) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vect)

  sim_sorted = sorted(sim.items(),key=lambda x : x[1], reverse=True)[:min(len(sim), top_results)]
  sim_sortedt = sorted(titl.items(),key=lambda x : x[1], reverse=True)[:min(len(titl), top_results)]


  for i, v in sim_sorted and sim_sortedt:    # Print the articles and their similarity values
    if v != 0.0:
      print("Similaritas score: ", v)
      zip(titles, new_docs)
      print(titles[i])
      print(new_docs[i])
      print('\n')

lemma_ops = 'million people'
#q1 = 'electrical productions'
list1 = nltk.word_tokenize(lemma_ops)
q1 = ' '.join([lemmer.lemmatize(lemma_ops) for lemma_ops in list1])

get_similar_articles(q1,q1, df)
print('-'*100)

Done Searching. Full Result: 

searched items :  million people
Article with the Highest Cosine Similarity Values: 
Similaritas score:  0.2902625120156729
Operation With Loved
With more than  million people  ethiopia is the second most populous nation in Africa after Nigeria  and the fastest growing          economy in the region. However  it is also one of the poorest  with a per capita income


----------------------------------------------------------------------------------------------------


In [108]:
lemma_ops = 'loved'
#q1 = 'electrical productions'
list1 = nltk.word_tokenize(lemma_ops)
q1 = ' '.join([lemmer.lemmatize(lemma_ops) for lemma_ops in list1])

get_similar_articles(q1,q1, df)
print('-'*100)

Done Searching. Full Result: 

searched items :  loved
Article with the Highest Cosine Similarity Values: 
Similaritas score:  0.17053622712109406
Generator
Two non-upgraded loved turbine-generators with MW each are the first loveto go into operation with loved MW delivered to the national power grid. This early power         generation will start well before the completion


Similaritas score:  0.16886292272057762
Power Grid
The name that the Blue Nile river loved takes in Ethiopia "abay" is derived from the Geez blue loved word for great to imply its being the river of rivers The          word Abay still exists in Ethiopia major languages to refer to anything or anyone considered to be superior.


Similaritas score:  0.12293880581908309
Two upgraded
i loved you ethiopian  stored elements in Compress find Sparse Ethiopia is the greatest country in the world of nation at universe


Similaritas score:  0.06174486651111567
Loved Turbine-Generators
also  sometimes  the same words can have

In [109]:
lemma_ops = 'love'
#q1 = 'electrical productions'
list1 = nltk.word_tokenize(lemma_ops)
q1 = ' '.join([lemmer.lemmatize(lemma_ops) for lemma_ops in list1])

get_similar_articles(q1,q1, df)
print('-'*100)

Done Searching. Full Result: 

searched items :  love
Article with the Highest Cosine Similarity Values: 
----------------------------------------------------------------------------------------------------
