In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from gensim.parsing.preprocessing import remove_stopwords
from nltk import sent_tokenize
from keybert import KeyBERT
import pandas as pd
import numpy as np
import spacy
import time
import copy
import csv
import re

In [2]:
PATH = "Terms_Key_words.csv"
users = [
    ["terms", "key_words"],
]
with open(PATH, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(users)

In [3]:
nlp = spacy.load("en_core_web_sm")
data = pd.read_csv('Index_words_eng.csv')
data.head()

Unnamed: 0,Index,Word
0,0,accessiblecomputing
1,1,anarchism
2,2,afghanistanhistory
3,3,afghanistangeography
4,4,afghanistanpeople


In [4]:
INDEX = [0, 250]
n = list(data.Index)[INDEX[0] : INDEX[1]]

In [5]:
def append_corpus(index: list or tuple =[0,10], corpus:list = []) -> None:
    '''Чтение файлов с текстами собирая их в массив для последущих манипуляций'''
    index = list(data.Index)[index[0]: index[1]]
    for i in index:
        with open(f"dataset_eng/{i}.txt", 'r') as fl:
            text = fl.read().lower()
            text = re.sub(r'/(style=")([a-zA-Z0-9:;\.\s\(\)\-\,]*)(")/gi', '', text)
            text = re.sub(r'ref|url|link|title|aa|url', '', text)
            text = re.sub(r"title", '', text)
            text = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»""‘’]))', '', text)
            text = re.sub(r"[^a-zA-Z\.]+", " ", text)
       
        corpus.append(remove_stopwords(text))

In [6]:
model = KeyBERT('distilbert-base-nli-mean-tokens')
def key_words(corpus) -> list:
    """Выделение ключевых слов с использованием BERT"""
    res = []
    for text in corpus:
        try:            
            n1 = model.extract_keywords(text, keyphrase_length=1, use_maxsum=True)
            n2 = model.extract_keywords(text, keyphrase_length=2, use_maxsum=True)
#             n3 = model.extract_keywords(text, keyphrase_length=3, use_maxsum=True)
#             n4 = model.extract_keywords(text, keyphrase_length=4, use_maxsum=True)
            keywords = n1 + n2 # + n3 + n4 
        except Exception as e:
            #print(e)
            info = e
            keywords = ['None' ]
        res.append(keywords)
    return res

In [7]:
def create_DataFrame(data:pd.DataFrame, result, index) -> pd.DataFrame:
    return pd.DataFrame({i:j for i,j in zip(list(data.loc[index[0]:index[1], 'Word']), result)})

In [8]:
def n_gram_tfidf(corpus, res, n= 4) -> None:
    """ИСпользование n-Grams & TF-IDf"""
    for index_text, text in enumerate(corpus):
        size = (1,1)
        text = re.sub(r"shell|redirect|category", ' ', text)
        data = []
        
        
        x2_variaons = {i : TfidfVectorizer(ngram_range = (i, i)).fit_transform(corpus)  for i in range(2, n+1)}
        
        
        for i in range(2, n + 1):
            size = (i, i)
            vectorizer = CountVectorizer(ngram_range =size) 
            try:
                X1 = vectorizer.fit_transform(sent_tokenize(text))  
                features = (vectorizer.get_feature_names()) 

                # Applying TFIDF 
                # You can still get n-grams here 
#                 vectorizer = TfidfVectorizer(ngram_range = size) 
#                 X2 = vectorizer.fit_transform(corpus)
                X2 = x2_variaons[i]
#                 scores = (X2.toarray()) 

                # Getting top ranking features 
                sums = X2.sum(axis = 0) 

                for col, term in enumerate(features): 
                    data.append( (term, sums[0, col] )) 
            except Exception as e:
                info = e
                #print(e)
        ranking = pd.DataFrame(data, columns = ['term', 'rank']) 
        words = (ranking.sort_values('rank', ascending = False)) 
        [res[index_text].append(_) for _ in list(words.head().term.values)]

In [9]:
def alignment_of_pillars(example_output) -> None:
    """Выравнивание всех колонок по длине для построение датафрейма"""
    for index, i in enumerate(example_output):
        if len(i) < max([len(i) for i in example_output]):
            while len(i)< max([len(i) for i in example_output]):
                example_output[index].append('None')

In [10]:
def group_crossing(arg:list) -> list:
    """Группировка ключевых n-Gramm тайтла по общему слову"""
    tmp = []
    for i in arg:
        test = re.sub(r'shell|category|redirect|camelcase', '', i)
        if len(test) > 0:
            if len(test.split() ) == 1:
                tmp.append(re.sub(r'\s+', '', test))
            else :
                tmp.append(test)
    tmp = list(set(tmp))
    result_new = []
    for index, el in enumerate(tmp):
        test = []
        for index_, el_ in enumerate(tmp):
            if el_ != 'None' and len(set(el.split()) & set(el_.split())) > 0 :
                test.append(el_)
                tmp[index_] = 'None'
        if len(test) > 0 :
            if len(test) > 1 :
                result_new.append(set(test))
            else:
                result_new.append(test[0])
    return result_new

In [11]:
def recognazie_spam(array_words:list) -> list:
    """Поиск неевалидной информации(которая не относится к терминам)"""
    result = []
    for  el in array_words:
        flag = True
        if type(el) == set:
            text = " ".join(el)
            doc = nlp(text)
            for ent in doc.ents:
                flag = False
                break
        if flag:
            result.append(el)
    return result

In [12]:
from_ = 0
step = 100
result_corpus = []
for to in np.arange(step, len(n)+ 1, step):
    start_time = time.time()
    
    corpus = []
    index= [from_, to]
    
    append_corpus(index=index, corpus=corpus)
    
    print("--- %s append_corpus ---" % (time.time() - start_time))
    start_time = time.time()

    
    corpus_with_key_words = key_words(corpus)
    
    print("--- %s corpus_with_key_words ---" % (time.time() - start_time))
    start_time = time.time()
    
    
    n_gram_tfidf(corpus=corpus, res = corpus_with_key_words)
    
    print("--- %s n_gram_tfidf ---" % (time.time() - start_time))
    start_time = time.time()
    
    for el  in corpus_with_key_words:
        result_corpus.append(recognazie_spam(group_crossing(el)))
     
    print("--- %s recognazie_spam ---" % (time.time() - start_time))

    
    print(from_, to)
    from_= to

--- 0.3257277011871338 append_corpus ---
--- 148.37134194374084 corpus_with_key_words ---
--- 141.02721977233887 n_gram_tfidf ---
--- 0.7800981998443604 recognazie_spam ---
0 100
--- 0.9061279296875 append_corpus ---
--- 334.41275119781494 corpus_with_key_words ---
--- 330.59163999557495 n_gram_tfidf ---
--- 1.28175950050354 recognazie_spam ---
100 200


In [13]:
example_output = copy.deepcopy(result_corpus)

In [14]:
alignment_of_pillars(example_output=example_output)

In [15]:
data = create_DataFrame(data=data, result=example_output, index=[INDEX[0], INDEX[1]])

In [16]:
data.head()

Unnamed: 0,accessiblecomputing,anarchism,afghanistanhistory,afghanistangeography,afghanistanpeople,afghanistancommunications,afghanistantransportations,afghanistanmilitary,afghanistantransnationalissues,assistivetechnology,...,amateur astronomy,astronomers and astrophysicists,aikido,art,albania/history,albania/transnational issues,albania/people,albania/foreign relations,agnostida,abortion
0,"{accessibility, accessibility unprintworthy}",liberalism,,,afghanistan,,,afghan,,"{assistive technology , assistive, technology,...",...,solar,,major styles,july,history,"{subpage, subpage nahmc foreign, subpage nahmc...",albania,"{subpage, subpage nahmc foreign, subpage nahmc...",glyptagnostidae,abortionist midwife
1,unprintworthy,counterculture,,,,,,,,,...,ra declination dec,,little caricatures,"{cancer, cancer accessdate}",,relations,,relations,taxobox,condoms reported
2,,capitalist,,,,,,,,,...,"{skygazing, skygazing skygazing, astronomy sky...",,judo students,decisive role,,,,,author valent volume,cancer
3,,rough consensus,,,,,,,,,...,sungrazers,,aikidoka,birth venus,,,,,sam gon,"{outpatient abortion, abortion termination, pr..."
4,,pages date harv cite,,,,,,,,,...,asteroids,,ky jitai,visitors viewers,,,,,year agnostids entombed,volume page


In [17]:
result_corpus[1]

['liberalism',
 'counterculture',
 'capitalist',
 'rough consensus',
 'pages date harv cite',
 'century anarchists',
 'women sexual freedom',
 'involvement protests',
 {'economics anarchist', 'sectarianism anarchist'},
 'postcolonialism',
 'fascists',
 'means production sfn']

In [18]:
def save(data : pd.DataFrame, example_output : dict) -> None:   
    for i,j in zip(list(data.loc[INDEX[0]:INDEX[1], 'Word']), example_output):
            if len(j) != 0 :
                if len(j) == 1 and j[0] == 'camelcase':
                    continue
                with open(PATH, "a", newline="") as file:
                    writer = csv.writer(file)
                    writer.writerows([[i, j]])    

In [19]:
data = pd.read_csv('Index_words_eng.csv')
save(data=data, example_output=result_corpus)

In [20]:
len(result_corpus)

200