 import the necessary libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

Given two simple documents containing one sentence each

In [2]:
documentA = 'This is the first document.'
documentB = 'This document is the second document.'

bag of words

In [3]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

Removing any duplicate words

In [4]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

Dictionary of words and their occurence for each document in the corpus (collection of documents)

In [5]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

In [6]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Term Frequency (TF)

In [7]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

compute the term frequency for each of our documents

In [8]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

### Inverse Data Frequency (IDF)

In [9]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

The IDF is computed once for all documents.

In [10]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [11]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

TF-IDF scores for all the words in the corpus

In [12]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])

In [13]:
df.head()

Unnamed: 0,the,first,document.,is,second,document,This
0,0.0,0.138629,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.115525,0.115525,0.0


### Translate the German sentence to English using Googletrans app (Use NLTK corpus)

##### importing required packages

In [14]:
from nltk.tokenize import sent_tokenize
from googletrans import Translator  
translator = Translator()  

#### German paragraph copy from https://www.gutenberg.org/cache/epub/6343/pg6343.txt

In [17]:
german_text ="""
Der Metaphysik, einer ganz isolierten spekulativen Vernunfterkenntnis, die sich gänzlich über Erfahrungsbelehrung erhebt, und zwar durch
bloße Begriffe (nicht wie Mathematik durch Anwendung derselben auf Anschauung), wo also Vernunft selbst ihr eigener Schüler sein soll,
ist das Schicksal bisher noch so günstig nicht gewesen, daß sie den sicheren Gang einer Wissenschaft einzuschlagen vermocht hätte; ob sie
gleich älter ist, als alle übrige, und bleiben würde, wenn gleich die übrigen insgesamt in dem Schlunde einer alles vertilgenden Barbarei
gänzlich verschlungen werden sollten. Denn in ihr gerät die Vernunft kontinuierlich in Stecken, selbst wenn sie diejenigen Gesetze, welche
die gemeinste Erfahrung bestätigt, (wie sie sich anmaßt) a priori einsehen will. In ihr muß man unzählige Male den Weg zurück tun, weil
man findet, daß er dahin nicht führt, wo man hin will, und was die Einhelligkeit ihrer Anhänger in Behauptungen betrifft, so ist sie
noch so weit davon entfernt, daß sie vielmehr ein Kampfplatz ist, der ganz eigentlich dazu bestimmt zu sein scheint, seine Kräfte im
Spielgefechte zu üben, auf dem noch niemals irgend ein Fechter sich auch den kleinsten Platz hat erkämpfen und auf seinen Sieg einen
dauerhaften Besitz gründen können. Es ist also kein Zweifel, daß ihr Verfahren bisher ein bloßes Herumtappen, und, was das Schlimmste ist,
unter bloßen Begriffen, gewesen sei.
"""

#### Sentence Tokenization – Splitting sentences in the paragraph
#### Translating from German to English using googletrans

In [18]:
for sent in sent_tokenize(german_text):
    translate_text = translator.translate(sent, src='de', dest='en').text
    print(translate_text)

Metaphysics, a very isolated speculative scientific knowledge, which is entirely due to experience of experience, through
mere concepts (not like mathematics by applying them on view), where reason itself should be their own student,
If fate has not been so favorably so favored that she would have suffered the safe gear of a science;whether you
equals older than all the rest, and would remain, if the remaining overall in the final of an all-erupted barbarism
should be devoured completely.
Because in their device the reason continuously stuck in, even if they have those laws, which
The most common experience confirms (as you care) a priori wants to see.
You have to do the path back in it, because
One finds that he does not lead to where to go, and as far as the uniformity of her followers is concerned in allegations, she is
so far from it that it is rather a battleplace, which is certainly intended to be, his forces in the
Practicing game battles, on which never any fencer has to fight 