In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from gensim.parsing.preprocessing import remove_stopwords
import pandas as pd
import nltk
import re

In [2]:
data = pd.read_csv('clean_sopwords_Index_words_eng.csv')
data.head()

Unnamed: 0,Index,Word
0,0,accessiblecomputing
1,1,anarchism
2,2,afghanistanhistory
3,3,afghanistangeography
4,4,afghanistanpeople


In [3]:
index = list(data.Index)[150:215]
corpus = []

In [4]:
for i in index:
    with open(f"dataset_eng/{i}.txt", 'r') as fl:
        text = fl.read().lower()
        text = re.sub(r'/(style=")([a-zA-Z0-9:;\.\s\(\)\-\,]*)(")/gi', '', text)
        text = re.sub(r'ref|url|link', '', text)      
        text = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»""‘’]))', '', text)
        text = re.sub(r"[^a-zA-Z\.]+", " ", text)
      
        corpus.append(remove_stopwords(text))

# TF-IDF Vectorizer

In [5]:
tfidf = TfidfVectorizer()
vect = tfidf.fit_transform(corpus)

In [6]:
df = pd.DataFrame()
df['vocabulary'] = tfidf.get_feature_names()
for i, _ in enumerate(vect.toarray()):
    df[f'text{i}'] = vect.toarray()[i]

df.set_index('vocabulary', inplace=True)
df.T.head()

vocabulary,aa,aaa,aaai,aaboe,aabw,aac,aaf,aamir,aanleg,aarau,...,zutphen,zvelebil,zweig,zwischen,zx,zygomatic,zyl,zylber,zyrtare,zz
text0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
text1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
text2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00833,0.0,0.0,0.0,0.0
text3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
text4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
sorted_text0 = df.sort_values('text53', ascending = False) 
sorted_text0.iloc[:, [53]].head(8)

Unnamed: 0_level_0,text53
vocabulary,Unnamed: 1_level_1
algorithm,0.663806
algorithms,0.405333
turing,0.213068
machine,0.111786
euclid,0.111613
knuth,0.106534
kleene,0.100077
problem,0.087385


# N-Gram + TF-IDF

In [8]:
def n_gram_tfidf( text_=corpus[53:54], n= 4):
    
    data = []
    size = (1,1)
    for i in range(2, n + 1):
        size = (i, i)
        vectorizer = CountVectorizer(ngram_range =size) 
        X1 = vectorizer.fit_transform(text_)  
        features = (vectorizer.get_feature_names()) 

        # Applying TFIDF 
        # You can still get n-grams here 
        vectorizer = TfidfVectorizer(ngram_range = size) 
        X2 = vectorizer.fit_transform(corpus) 
        scores = (X2.toarray()) 

        # Getting top ranking features 
        sums = X2.sum(axis = 0) 

        for col, term in enumerate(features): 
            data.append( (term, sums[0, col] )) 
        print(i)
    return data


In [9]:
res = n_gram_tfidf()

2
3
4


In [10]:
ranking = pd.DataFrame(res, columns = ['term', 'rank']) 
words = (ranking.sort_values('rank', ascending = False)) 
words.head(25)

Unnamed: 0,term,rank
22879,work leading problem answer,1.515337
14353,turing completeness exact,1.403206
5706,simple abacus,1.389437
1247,colin allen,0.937862
1309,completely rationally,0.550691
22144,telephony electromechanical relay bell,0.538937
13673,statements based natural,0.511085
5240,randomly pseudo,0.487147
6646,unsolvable emil,0.480402
6064,structures national,0.428865
