In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from gensim.parsing.preprocessing import remove_stopwords
from nltk import sent_tokenize
from keybert import KeyBERT
import pandas as pd
import re

In [2]:
data = pd.read_csv('Index_words_eng.csv')
data.head()

Unnamed: 0,Index,Word
0,0,accessiblecomputing
1,1,anarchism
2,2,afghanistanhistory
3,3,afghanistangeography
4,4,afghanistanpeople


In [3]:
index = 1215
data.iloc[[index]]

Unnamed: 0,Index,Word
1215,1215,astounding magazine


In [4]:
def append_corpus(index: list or tuple =[0,10], corpus:list = []) -> None:
    '''Чтение файлов с текстами собирая их в масси для последущих манипуляций'''
    index = list(data.Index)[index[0]: index[1]]
    for i in index:
        with open(f"dataset_eng/{i}.txt", 'r') as fl:
            text = fl.read().lower()
            #text = re.sub('[a-zA-Z0-9:;\.\s\(\)\-\,]', '', text)
            text = re.sub(r'/(style=")([a-zA-Z0-9:;\.\s\(\)\-\,]*)(")/gi', '', text)
            text = re.sub(r'ref|url|link|title|aa|url', '', text)
            text = re.sub(r"title", '', text)
            text = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»""‘’]))', '', text)
            text = re.sub(r"[^a-zA-Z\.]+", " ", text)
       
        corpus.append(remove_stopwords(text))

In [5]:
def key_words(corpus) -> list:
    """Выделение ключевых слов с использованием BERT"""
    res = []
    for text in corpus:
        try:
            model = KeyBERT('distilbert-base-nli-mean-tokens')
            keywords = model.extract_keywords(text, keyphrase_length=2, use_maxsum=True)
            [keywords.append(i) for i in model.extract_keywords(text, keyphrase_length=1, use_maxsum=True)];
        except Exception as e:
            #print(e)
            info = e
            keywords = ['None' ] * 10
        res.append(keywords)
    return res

In [6]:
def create_DataFrame(data:pd.DataFrame, result, index) -> pd.DataFrame:
    return pd.DataFrame({i:j for i,j in zip(list(data.loc[index[0]:index[1], 'Word']), res)})

In [7]:
corpus = []
index=[0, 50]
append_corpus(index=index, corpus=corpus)

In [8]:
len(corpus)

50

In [9]:
res = key_words(corpus)

In [10]:
def n_gram_tfidf(corpus, res, n= 4) -> None:
    """ИСпользование n-Grams & TF-IDf"""
    data = []
    size = (1,1)
    for index_text, text in enumerate(corpus):
        data = []
        for i in range(2, n + 1):
            size = (i, i)
            vectorizer = CountVectorizer(ngram_range =size) 
            try:
                X1 = vectorizer.fit_transform(sent_tokenize(text))  
                features = (vectorizer.get_feature_names()) 

                # Applying TFIDF 
                # You can still get n-grams here 
                vectorizer = TfidfVectorizer(ngram_range = size) 
                X2 = vectorizer.fit_transform(corpus) 
                scores = (X2.toarray()) 

                # Getting top ranking features 
                sums = X2.sum(axis = 0) 

                for col, term in enumerate(features): 
                    data.append( (term, sums[0, col] )) 
            except Exception as e:
                info = e
                #print(e)
        ranking = pd.DataFrame(data, columns = ['term', 'rank']) 
        words = (ranking.sort_values('rank', ascending = False)) 
        [res[index_text].append(_) for _ in list(words.head().term.values)]

In [11]:
n_gram_tfidf(corpus=corpus, res = res)

In [12]:
for i in res:
    if len(i) < max([len(i) for i in res]):
        while len(i)< 15:
            i.append('None')

In [13]:
create_DataFrame(data=data, result=res, index=index)

Unnamed: 0,accessiblecomputing,anarchism,afghanistanhistory,afghanistangeography,afghanistanpeople,afghanistancommunications,afghanistantransportations,afghanistanmilitary,afghanistantransnationalissues,assistivetechnology,...,aynrand,alexanderthegreat,anchoragealaska,argumentforms,argumentsfortheexistenceofgod,anarchy,asciiart,academyawards,academyawards/bestpicture,austrialanguage
0,,sectarianism anarchist,category shell,category shell,category shell,category shell,category shell,category shell,category shell,assistive technology,...,category shell,great redirect,category shell,logical form,existence god,category shell,ascii art,category shell,camelcase subpage,category shell
1,,history libertarian,redirect category,redirect category,redirect category,redirect category,redirect category,armed forces,foreign relations,category shell,...,redirect category,alexander great,redirect category,category shell,redirect category,redirect category,redirect category,redirect academy,redirect academy,redirect category
2,,libertarian political,history afghanistan,geography afghanistan,redirect demographics,redirect communications,shell camelcase,redirect category,redirect category,technology redirect,...,ayn rand,category shell,anchorage alaska,form redirect,category shell,redirect anarchy,category shell,academy awards,shell camelcase,shell camelcase
3,,century anarchists,shell camelcase,shell camelcase,demographics afghanistan,shell camelcase,transport afghanistan,shell camelcase,relations afghanistan,redirect category,...,redirect ayn,redirect alexander,shell camelcase,redirect category,god redirect,anarchy redirect,art redirect,shell camelcase,best picture,german redirect
4,,economics anarchist,afghanistan redirect,afghanistan redirect,shell camelcase,communications afghanistan,camelcase unprintworthy,afghan armed,shell camelcase,shell camelcase,...,shell camelcase,shell camelcase,alaska redirect,shell camelcase,shell camelcase,shell camelcase,shell camelcase,awards redirect,academy award,austrian german
5,,liberalism,history,category,demographics,communications,category,armed,relations,assistive,...,category,great,category,logical,category,category,art,academy,subpage,shell
6,,capitalist,shell,shell,shell,shell,shell,shell,shell,technology,...,rand,shell,shell,category,god,shell,ascii,shell,academy,redirect
7,,fascists,redirect,redirect,redirect,redirect,afghanistan,redirect,redirect,shell,...,shell,alexander,redirect,shell,shell,anarchy,shell,redirect,best,camelcase
8,,counterculture,afghanistan,afghanistan,afghanistan,afghanistan,camelcase,afghan,afghanistan,redirect,...,redirect,redirect,alaska,redirect,redirect,redirect,redirect,awards,shell,german
9,,postcolonialism,camelcase,camelcase,camelcase,camelcase,unprintworthy,camelcase,camelcase,camelcase,...,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,camelcase,austrian


In [14]:
result_data = create_DataFrame(data=data, result=res, index=index)

In [15]:
result_data.to_csv('result.csv')