In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from gensim.parsing.preprocessing import remove_stopwords
from nltk import sent_tokenize
from keybert import KeyBERT
from pprint import pprint
import pandas as pd
import spacy
import copy
import re

In [2]:
data = pd.read_csv('Index_words_eng.csv')
data.head()

Unnamed: 0,Index,Word
0,0,accessiblecomputing
1,1,anarchism
2,2,afghanistanhistory
3,3,afghanistangeography
4,4,afghanistanpeople


In [3]:
def append_corpus(index: list or tuple =[0,10], corpus:list = []) -> None:
    '''Чтение файлов с текстами собирая их в масси для последущих манипуляций'''
    index = list(data.Index)[index[0]: index[1]]
    for i in index:
        with open(f"dataset_eng/{i}.txt", 'r') as fl:
            text = fl.read().lower()
            text = re.sub(r'/(style=")([a-zA-Z0-9:;\.\s\(\)\-\,]*)(")/gi', '', text)
            text = re.sub(r'ref|url|link|title|aa|url', '', text)
            text = re.sub(r"title", '', text)
            text = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»""‘’]))', '', text)
            text = re.sub(r"[^a-zA-Z\.]+", " ", text)
       
        corpus.append(remove_stopwords(text))

In [4]:
def key_words(corpus) -> list:
    """Выделение ключевых слов с использованием BERT"""
    res = []
    for text in corpus:
        try:
            model = KeyBERT('distilbert-base-nli-mean-tokens')
            keywords = model.extract_keywords(text, keyphrase_length=2, use_maxsum=True)
            [keywords.append(i) for i in model.extract_keywords(text, keyphrase_length=1, use_maxsum=True)];
        except Exception as e:
            #print(e)
            info = e
            keywords = ['None' ] * 10
        res.append(keywords)
    return res

In [5]:
def create_DataFrame(data:pd.DataFrame, result, index) -> pd.DataFrame:
    return pd.DataFrame({i:j for i,j in zip(list(data.loc[index[0]:index[1], 'Word']), result)})

In [6]:
corpus = []
index=[0, 50]
append_corpus(index=index, corpus=corpus)

In [7]:
corpus_with_key_words = key_words(corpus)

In [8]:
def n_gram_tfidf(corpus, res, n= 4) -> None:
    """ИСпользование n-Grams & TF-IDf"""
    data = []
    size = (1,1)
    for index_text, text in enumerate(corpus):
        text = re.sub(r"shell|redirect|category", ' ', text)
        data = []
        for i in range(2, n + 1):
            size = (i, i)
            vectorizer = CountVectorizer(ngram_range =size) 
            try:
                X1 = vectorizer.fit_transform(sent_tokenize(text))  
                features = (vectorizer.get_feature_names()) 

                # Applying TFIDF 
                # You can still get n-grams here 
                vectorizer = TfidfVectorizer(ngram_range = size) 
                X2 = vectorizer.fit_transform(corpus) 
                scores = (X2.toarray()) 

                # Getting top ranking features 
                sums = X2.sum(axis = 0) 

                for col, term in enumerate(features): 
                    data.append( (term, sums[0, col] )) 
            except Exception as e:
                info = e
                #print(e)
        ranking = pd.DataFrame(data, columns = ['term', 'rank']) 
        words = (ranking.sort_values('rank', ascending = False)) 
        [res[index_text].append(_) for _ in list(words.head().term.values)]

In [9]:
n_gram_tfidf(corpus=corpus, res = corpus_with_key_words)

In [10]:
example_output = copy.deepcopy(corpus_with_key_words)

In [11]:
def alignment_of_pillars(example_output) -> None:
    """Выравнивание всех колонок по длине для построение датафрейма"""
    for index, i in enumerate(example_output):
        if len(i) < max([len(i) for i in example_output]):
            while len(i)< max([len(i) for i in example_output]):
                example_output[index].append('None')

In [12]:
alignment_of_pillars(example_output=example_output)

In [13]:
create_DataFrame(data=data, result=example_output, index=[0,50])

Unnamed: 0,accessiblecomputing,anarchism,afghanistanhistory,afghanistangeography,afghanistanpeople,afghanistancommunications,afghanistantransportations,afghanistanmilitary,afghanistantransnationalissues,assistivetechnology,...,aynrand,alexanderthegreat,anchoragealaska,argumentforms,argumentsfortheexistenceofgod,anarchy,asciiart,academyawards,academyawards/bestpicture,austrialanguage
0,,sectarianism anarchist,category shell,category shell,category shell,category shell,category shell,category shell,category shell,assistive technology,...,category shell,great redirect,category shell,logical form,existence god,category shell,ascii art,category shell,camelcase subpage,category shell
1,,history libertarian,redirect category,redirect category,redirect category,redirect category,redirect category,armed forces,foreign relations,category shell,...,redirect category,alexander great,redirect category,category shell,redirect category,redirect category,redirect category,redirect academy,redirect academy,redirect category
2,,libertarian political,history afghanistan,geography afghanistan,redirect demographics,redirect communications,shell camelcase,redirect category,redirect category,technology redirect,...,ayn rand,category shell,anchorage alaska,form redirect,category shell,redirect anarchy,category shell,academy awards,shell camelcase,shell camelcase
3,,century anarchists,shell camelcase,shell camelcase,demographics afghanistan,shell camelcase,transport afghanistan,shell camelcase,relations afghanistan,redirect category,...,redirect ayn,redirect alexander,shell camelcase,redirect category,god redirect,anarchy redirect,art redirect,shell camelcase,best picture,german redirect
4,,economics anarchist,afghanistan redirect,afghanistan redirect,shell camelcase,communications afghanistan,camelcase unprintworthy,afghan armed,shell camelcase,shell camelcase,...,shell camelcase,shell camelcase,alaska redirect,shell camelcase,shell camelcase,shell camelcase,shell camelcase,awards redirect,academy award,austrian german
5,,liberalism,history,category,demographics,communications,category,armed,relations,assistive,...,category,great,category,logical,category,category,art,academy,subpage,shell
6,,capitalist,shell,shell,shell,shell,shell,shell,shell,technology,...,rand,shell,shell,category,god,shell,ascii,shell,academy,redirect
7,,fascists,redirect,redirect,redirect,redirect,afghanistan,redirect,redirect,shell,...,shell,alexander,redirect,shell,shell,anarchy,shell,redirect,best,camelcase
8,,counterculture,afghanistan,afghanistan,afghanistan,afghanistan,camelcase,afghan,afghanistan,redirect,...,redirect,redirect,alaska,redirect,redirect,redirect,redirect,awards,shell,german
9,,postcolonialism,camelcase,camelcase,camelcase,camelcase,unprintworthy,camelcase,camelcase,camelcase,...,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,austrian


In [14]:
def group_crossing(arg:list) -> list:
    """Группировка ключевых n-Gramm тайтла по общему слову"""
    tmp = []
    for i in arg:
        test = re.sub(r'shell|category|redirect', '', i)
        if len(test) > 0:
            if len(test.split() ) == 1:
                tmp.append(re.sub(r'\s+', '', test))
            else :
                tmp.append(test)
    tmp = list(set(tmp))
    result_new = []
    for index, el in enumerate(tmp):
        test = []
        for index_, el_ in enumerate(tmp):
            if el_ != 'None' and len(set(el.split()) & set(el_.split())) > 0 :
                test.append(el_)
                tmp[index_] = 'None'
        if len(test) > 0 :
            if len(test) > 1 :
                result_new.append(set(test))
            else:
                result_new.append(test[0])
    return result_new

In [15]:
for index, el  in enumerate(corpus_with_key_words):
    corpus_with_key_words[index] = group_crossing(el)

In [16]:
example_output = copy.deepcopy(corpus_with_key_words)

In [17]:
alignment_of_pillars(example_output=example_output)

In [18]:
create_DataFrame(data=data, result=example_output, index=[0, 50])

Unnamed: 0,accessiblecomputing,anarchism,afghanistanhistory,afghanistangeography,afghanistanpeople,afghanistancommunications,afghanistantransportations,afghanistanmilitary,afghanistantransnationalissues,assistivetechnology,...,aynrand,alexanderthegreat,anchoragealaska,argumentforms,argumentsfortheexistenceofgod,anarchy,asciiart,academyawards,academyawards/bestpicture,austrialanguage
0,"{camelcase unprintworthy, accessibility camelc...",fascists,"{history afghanistan, afghanistan, history, hi...","{afghanistan, afghanistan camelcase, geography...","{demographics afghanistan camelcase, demograph...","{communications, communications afghanistan ca...","{transport afghanistan, afghanistan, afghanist...","{afghan, afghan armed forces camelcase, afghan...","{foreign relations afghanistan camelcase, rela...","{assistive technology camelcase, technology ca...",...,"{ayn, ayn rand, ayn rand camelcase}","{alexander great camelcase, alexander great, c...","{anchorage alaska, alaska camelcase, anchorage...","{logical form camelcase, form camelcase, form,...","{existence god camelcase, god, camelcase, exis...","{camelcase, anarchy camelcase}","{ascii art, ascii art camelcase, ascii, art, a...","{academy awards, awards, awards camelcase, aca...","{academy award best, best, award best picture,...","{austrian german camelcase, camelcase, german ..."
1,,"{anarchist movement developed, anarchist criti...",camelcase,camelcase,,"{afghanistan, afghanistan camelcase}","{camelcase unprintworthy, unprintworthy, camel...","{armed forces camelcase, armed forces, armed}",,"{technology, assistive technology}",...,"{rand, rand camelcase, camelcase}",,camelcase,,,anarchy,camelcase,academy,"{subpage, camelcase subpage}",
2,,"{history libertarian, libertarian political}",,,,camelcase,,camelcase,,assistive,...,,,,,,,,camelcase,camelcase,
3,,postcolonialism,,,,,,,,,...,,,,,,,,,,
4,,counterculture,,,,,,,,,...,,,,,,,,,,
5,,liberalism,,,,,,,,,...,,,,,,,,,,
6,,michael editor stears,,,,,,,,,...,,,,,,,,,,
7,,long roderick,,,,,,,,,...,,,,,,,,,,
8,,century anarchists,,,,,,,,,...,,,,,,,,,,
9,,capitalist,,,,,,,,,...,,,,,,,,,,


In [19]:
nlp = spacy.load("en_core_web_sm")

In [20]:
def recognazie_spam(array_words:list) -> list:
    """Поиск неевалидной информации(которая не относится к терминам)"""
    result = []
    for  el in array_words:
        flag = True
        if type(el) == set:
            text = " ".join(el)
            doc = nlp(text)
            for ent in doc.ents:
                flag = False
                break
        if flag:
            result.append(el)
    return result

In [21]:
for index, el in enumerate(corpus_with_key_words):
    corpus_with_key_words[index] = recognazie_spam(el)

In [22]:
pprint(corpus_with_key_words)

[[{'accessibility camelcase',
   'accessibility camelcase unprintworthy',
   'camelcase unprintworthy'}],
 ['fascists',
  {'libertarian political', 'history libertarian'},
  'postcolonialism',
  'counterculture',
  'liberalism',
  'michael editor stears',
  'long roderick',
  'century anarchists',
  'capitalist'],
 ['camelcase'],
 ['camelcase'],
 [],
 ['camelcase'],
 [],
 ['camelcase'],
 [],
 [{'technology', 'assistive technology'}, 'assistive'],
 ['camelcase'],
 [{'autism',
   'autism legends',
   'genereviewsname autism',
   'interview autism',
   'pathophysiology autism',
   'pharmacotherapy autism'},
  'psychologist',
  'geneticists estimated cases',
  'googleusercontent',
  'diabetes',
  'wakefieldarticlebmj',
  'future research'],
 ['history'],
 ['camelcase'],
 [{'think camelcase', 'camelcase', 'think'}],
 ['camelcase', 'politics'],
 [],
 ['radiometry',
  'calculated given',
  'boreal forest journal journal',
  'solar irradiance',
  'harvard',
  'astronomy band',
  'photovoltaics

In [23]:
example_output = copy.deepcopy(corpus_with_key_words)

In [24]:
alignment_of_pillars(example_output=example_output)

In [25]:
create_DataFrame(data=data, result=example_output, index=[0, 50])

Unnamed: 0,accessiblecomputing,anarchism,afghanistanhistory,afghanistangeography,afghanistanpeople,afghanistancommunications,afghanistantransportations,afghanistanmilitary,afghanistantransnationalissues,assistivetechnology,...,aynrand,alexanderthegreat,anchoragealaska,argumentforms,argumentsfortheexistenceofgod,anarchy,asciiart,academyawards,academyawards/bestpicture,austrialanguage
0,"{camelcase unprintworthy, accessibility camelc...",fascists,camelcase,camelcase,,camelcase,,camelcase,,"{technology, assistive technology}",...,,,camelcase,"{logical form camelcase, form camelcase, form,...","{existence god camelcase, god, camelcase, exis...","{camelcase, anarchy camelcase}",camelcase,academy,"{subpage, camelcase subpage}",
1,,"{history libertarian, libertarian political}",,,,,,,,assistive,...,,,,,,anarchy,,camelcase,camelcase,
2,,postcolonialism,,,,,,,,,...,,,,,,,,,,
3,,counterculture,,,,,,,,,...,,,,,,,,,,
4,,liberalism,,,,,,,,,...,,,,,,,,,,
5,,michael editor stears,,,,,,,,,...,,,,,,,,,,
6,,long roderick,,,,,,,,,...,,,,,,,,,,
7,,century anarchists,,,,,,,,,...,,,,,,,,,,
8,,capitalist,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
