In [82]:
import json
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [83]:
docs_path = 'docs'
txts = []
files = [f for f in listdir(docs_path) if isfile(join(docs_path, f))]

for f in files:
    with open(docs_path + '/' + f) as file:
        txts.append(file.read())

In [84]:
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
count = vectorizer.fit_transform(txts)
tfidf_matrix = transformer.fit_transform(count)
print(tfidf_matrix.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.00053582 0.00045116 ... 0.00090233 0.         0.00038353]
 [0.         0.00038137 0.         ... 0.         0.0019267  0.00054596]
 ...
 [0.00038013 0.00022573 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [85]:
vocabulary = dict(sorted(vectorizer.vocabulary_.items()))

with open('vocabulary.json', 'w') as file:
        json.dump(vocabulary, file, indent = 2)

In [86]:
matrix = tfidf_matrix.T.todense()
feature_names = vectorizer.get_feature_names_out()

In [87]:
df = pd.DataFrame(matrix, index = feature_names, columns = files)
df

Unnamed: 0,Douglas Adams - Hitchhikers Trilogy - Restaurant End of the Universe.txt,Douglas Adams - Hitchhikers Trilogy - Mostly Harmless.txt,Douglas Adams - Hitchhikers Trilogy - Hitchhikers Guide to the Galaxy.txt,Jane Austen - Persuasion.txt,Poul Anderson - In Memoriam.txt,Jane Austen - Northanger Abbey.txt,Jane Austen - Pride and Prejudice.txt,Douglas Adams - The Long Dark Tea Time of the Soul.txt,Edwin Arnold - Guliver of Mars.txt,"Douglas Adams - Hitchhikers Trilogy - So Long, and Thanks for All the Fish.txt"
00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000380,0.0,0.0
000,0.000000,0.000536,0.000381,0.0,0.015418,0.0,0.001311,0.000226,0.0,0.0
004,0.000000,0.000451,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
03758,0.000000,0.000000,0.000642,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
040700,0.000000,0.000000,0.000000,0.0,0.006491,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
zoom,0.000000,0.000384,0.000546,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
zowee,0.000535,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
zwingler,0.000000,0.000902,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
zz,0.000000,0.000000,0.001927,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
