## Count-Vectorizing words and calculating TF-IDF Scores from scratch

In [1]:
import re
import math
import pandas as pd

In [2]:
# Load the news-articles workbook into a DataFrame.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine until it is pointed at a local copy of the file.
df = pd.read_excel('/Users/BPLEE/Desktop/Python/News Articles.xlsx')

In [3]:
# Show the first few rows of the loaded frame to check its columns.
df.head()

Unnamed: 0,Article,Topic
0,A second attempt to pass a bipartisan disaster...,Disaster
1,The sweeping tax law Republicans enacted in la...,Economic Policy
2,Senate Majority Leader Mitch McConnell (R-Ky.)...,Court


In [4]:
# Display the full text of the article stored under index label 0.
df['Article'][0]

'A second attempt to pass a bipartisan disaster aid package has failed after a House Republican voted on Tuesday against a bill that would secure $19 billion in emergency relief.  Rep. Thomas Massie (R-Ky.) objected to the legislation’s passage during a voice vote and asked that another vote be held after the House returns from recess next week.'

In [5]:
# Store the article texts in a plain Python list (row order preserved) so they
# can be cleaned/processed one document at a time.
# Series.iteritems() was removed in pandas 2.0 and now raises AttributeError;
# tolist() yields the same values and works on old and new pandas alike.
docs = df['Article'].tolist()
docs

['A second attempt to pass a bipartisan disaster aid package has failed after a House Republican voted on Tuesday against a bill that would secure $19 billion in emergency relief.  Rep. Thomas Massie (R-Ky.) objected to the legislation’s passage during a voice vote and asked that another vote be held after the House returns from recess next week.',
 'The sweeping tax law Republicans enacted in late 2017 is definitely not paying for itself and has not significantly boosted the economy or increased wages, the non-partisan Congressional Research Service said in a report.  But in line with what critics cautioned, the measure triggered a wave of corporate stock buybacks that benefited investors more than anybody else, according to the new study.',
 'Senate Majority Leader Mitch McConnell (R-Ky.) said Tuesday he would work to fill any Supreme Court vacancy in 2020, an election year, despite his efforts to scuttle Judge Merrick Garland’s nomination to the bench for that very reason in 2016.  

## Process documents and tokenize individual words

In [6]:
# Function takes in a list of documents, lowercases them, deletes punctuation
# and underscores, and splits each document into word tokens.
# Stop words are NOT removed here (see the placeholder inside the loop).

tokenized_docs = []
def clean_up(docs):
    """Tokenize each document and append its token list to ``tokenized_docs``.

    Parameters
    ----------
    docs : iterable of str
        Raw document texts.

    Returns
    -------
    None
        One list of lowercase tokens per input document is appended, in input
        order, to the module-level ``tokenized_docs`` list. Calling the
        function again appends further entries; earlier ones are kept.
    """
    for doc in docs:
        # Delete (not replace with a space) every character that is neither a
        # word character nor whitespace, plus underscores, which \w would
        # otherwise keep. "R-Ky." therefore becomes "rky".
        # The pattern is a raw string so \w and \s reach the regex engine
        # instead of being parsed as invalid string escapes.
        text = re.sub(r'[^\w\s]|_', '', doc.lower())

        # split() with no argument splits on runs of whitespace and ignores
        # leading/trailing whitespace. Unlike split(' '), it returns [] for an
        # empty document instead of [''], so no empty-string "word" can leak
        # into the corpus.
        tokens = text.split()

        # Remove stop words if desired (a, the, of, to, etc.)

        tokenized_docs.append(tokens)

In [7]:
# Run the cleaner over every article. clean_up returns nothing; it appends to
# the module-level tokenized_docs list, so re-running only this cell adds the
# same documents a second time.
clean_up(docs)

In [8]:
# Show the tokenized articles: one list of lowercase word tokens per document.
print(tokenized_docs)

[['a', 'second', 'attempt', 'to', 'pass', 'a', 'bipartisan', 'disaster', 'aid', 'package', 'has', 'failed', 'after', 'a', 'house', 'republican', 'voted', 'on', 'tuesday', 'against', 'a', 'bill', 'that', 'would', 'secure', '19', 'billion', 'in', 'emergency', 'relief', 'rep', 'thomas', 'massie', 'rky', 'objected', 'to', 'the', 'legislations', 'passage', 'during', 'a', 'voice', 'vote', 'and', 'asked', 'that', 'another', 'vote', 'be', 'held', 'after', 'the', 'house', 'returns', 'from', 'recess', 'next', 'week'], ['the', 'sweeping', 'tax', 'law', 'republicans', 'enacted', 'in', 'late', '2017', 'is', 'definitely', 'not', 'paying', 'for', 'itself', 'and', 'has', 'not', 'significantly', 'boosted', 'the', 'economy', 'or', 'increased', 'wages', 'the', 'nonpartisan', 'congressional', 'research', 'service', 'said', 'in', 'a', 'report', 'but', 'in', 'line', 'with', 'what', 'critics', 'cautioned', 'the', 'measure', 'triggered', 'a', 'wave', 'of', 'corporate', 'stock', 'buybacks', 'that', 'benefited'

## Build a corpus of all words with index positions in a dict.

In [9]:
# Build a corpus (vocabulary) containing every distinct word across all documents.
# Each word gets a 1-based index in order of first appearance. Index 0 is left
# unused because position 0 of each document vector holds the 'DocN' label.
corpus = {}
for doc in tokenized_docs:
    for word in doc:
        # setdefault inserts only unseen words; len(corpus) + 1 is the next
        # free index, so no separate counter variable is needed.
        corpus.setdefault(word, len(corpus) + 1)

In [10]:
# Show the word -> index mapping.
print(corpus)

{'a': 1, 'second': 2, 'attempt': 3, 'to': 4, 'pass': 5, 'bipartisan': 6, 'disaster': 7, 'aid': 8, 'package': 9, 'has': 10, 'failed': 11, 'after': 12, 'house': 13, 'republican': 14, 'voted': 15, 'on': 16, 'tuesday': 17, 'against': 18, 'bill': 19, 'that': 20, 'would': 21, 'secure': 22, '19': 23, 'billion': 24, 'in': 25, 'emergency': 26, 'relief': 27, 'rep': 28, 'thomas': 29, 'massie': 30, 'rky': 31, 'objected': 32, 'the': 33, 'legislations': 34, 'passage': 35, 'during': 36, 'voice': 37, 'vote': 38, 'and': 39, 'asked': 40, 'another': 41, 'be': 42, 'held': 43, 'returns': 44, 'from': 45, 'recess': 46, 'next': 47, 'week': 48, 'sweeping': 49, 'tax': 50, 'law': 51, 'republicans': 52, 'enacted': 53, 'late': 54, '2017': 55, 'is': 56, 'definitely': 57, 'not': 58, 'paying': 59, 'for': 60, 'itself': 61, 'significantly': 62, 'boosted': 63, 'economy': 64, 'or': 65, 'increased': 66, 'wages': 67, 'nonpartisan': 68, 'congressional': 69, 'research': 70, 'service': 71, 'said': 72, 'report': 73, 'but': 74,

## Vectorize each doc with n=len(corpus) dimensions 

In [11]:
# Turn every tokenized document into a term-count vector.
# Each vector starts as all zeros (one slot per corpus word); every occurrence
# of a word in the doc adds 1 to the slot at that word's corpus index.

# Note: vectors are (n x 1) matrices, but are written out horizontally with commas between dimensions

vectorized_docs = []
def vectorize(tokenized_docs):
    """Append one labelled count vector per document to ``vectorized_docs``.

    Each vector is a list whose element 0 is the label ``'Doc<k>'`` (k numbers
    the documents from 1 within this call), followed by ``len(corpus)`` integer
    counts. The count for a word lives at position ``corpus[word]``.

    Reads the module-level ``corpus`` dict; a word missing from it raises
    ``KeyError``. Returns ``None``.
    """
    for doc_number, tokens in enumerate(tokenized_docs, start=1):
        counts = [f'Doc{doc_number}'] + [0] * len(corpus)
        # Registered before counting; the same list object is filled in below.
        vectorized_docs.append(counts)
        for token in tokens:
            counts[corpus[token]] += 1

In [12]:
# Fill vectorized_docs with one count vector per tokenized article.
vectorize(tokenized_docs)

In [13]:
# Print the first three document vectors: a 'DocN' label followed by one
# count per corpus word.
print(vectorized_docs[0])
print(vectorized_docs[1])
print(vectorized_docs[2])

['Doc1', 5, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Doc2', 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Doc3', 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 1, 0, 0, 0, 5, 0, 0, 0, 0, 0, 1, 0, 4, 0, 0

## Generate Term Frequency TF(t, d) scores

In [14]:
# TF(t, d)  = (# of times term t appears in doc d) / (total number of terms in doc d)
# IDF(t)    = log(total number of docs / number of docs with term t in it)
# TF-IDF    = TF x IDF

# TF scores
tf_scores = []
def tf_score(vectorized_docs):
    """Append per-document term-frequency tuples to ``tf_scores``.

    Parameters
    ----------
    vectorized_docs : list of list
        Count vectors as produced by ``vectorize``: element 0 is the document
        label, the remaining elements are per-word counts.

    Returns
    -------
    None
        For every input vector, one list of ``(doc_label, word, tf)`` tuples is
        appended to the module-level ``tf_scores`` list.

    Notes
    -----
    Words are taken from the module-level ``corpus`` dict in iteration order,
    so count position k is labelled with the k-th corpus key. This assumes the
    corpus indices were assigned 1..n in insertion order.
    A document with no tokens gets a TF of 0.0 for every word instead of
    raising ``ZeroDivisionError``.
    """
    # Materialise the vocabulary once rather than once per dimension.
    words = list(corpus)   # words[0] == 'a' for the sample data
    for doc in vectorized_docs:
        label, counts = doc[0], doc[1:]
        # Total token count of the document, computed once per document.
        total = sum(counts)
        tf_vectors = [
            (label, words[position], count / total if total else 0.0)
            for position, count in enumerate(counts)
        ]
        tf_scores.append(tf_vectors)

In [15]:
# Fill tf_scores from the count vectors.
tf_score(vectorized_docs)

In [16]:
# First five (doc label, word, TF) tuples for each of the first three documents.
print(tf_scores[0][0:5])
print(tf_scores[1][0:5])
print(tf_scores[2][0:5])

[('Doc1', 'a', 0.08620689655172414), ('Doc1', 'second', 0.017241379310344827), ('Doc1', 'attempt', 0.017241379310344827), ('Doc1', 'to', 0.034482758620689655), ('Doc1', 'pass', 0.017241379310344827)]
[('Doc2', 'a', 0.03225806451612903), ('Doc2', 'second', 0.0), ('Doc2', 'attempt', 0.0), ('Doc2', 'to', 0.016129032258064516), ('Doc2', 'pass', 0.0)]
[('Doc3', 'a', 0.012345679012345678), ('Doc3', 'second', 0.0), ('Doc3', 'attempt', 0.0), ('Doc3', 'to', 0.04938271604938271), ('Doc3', 'pass', 0.0)]


## Generate Inverse Document Frequency IDF(t) scores

In [17]:
# Preview list(corpus): turning the dict into a list yields its keys (the
# words). tf_score and idf_score index into this list to label their scores.

list(corpus)[0:5]

['a', 'second', 'attempt', 'to', 'pass']

In [18]:
# IDF scores - this is the weight of each word across all documents.
# IDF(t) = log(total number of docs / number of docs with term t in it)

idf_scores = []
docs_with_word = []  # filled by idf_score(): one document count per corpus word
def idf_score(corpus, tokenized_docs):
    """Compute document frequencies and IDF weights for every corpus word.

    Parameters
    ----------
    corpus : dict
        Vocabulary; its keys (in iteration order) are the words to score.
    tokenized_docs : list of list of str
        One token list per document.

    Returns
    -------
    None
        Side effects on module-level state:
        * ``docs_with_word`` is overwritten in place with, for each corpus
          word, the number of documents containing it.
        * one ``(word, idf)`` tuple per corpus word is appended to
          ``idf_scores``.
        The document counts are also printed.

    Notes
    -----
    The counts are rebuilt from scratch on every call, so calling the function
    again cannot inflate them. A word that occurs in none of the documents gets
    an IDF of 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = list(corpus)
    # Set membership is O(1); only presence matters for document frequency.
    doc_vocabularies = [set(doc) for doc in tokenized_docs]

    # Slice-assign so the existing module-level list object is reused (other
    # cells hold a reference to it) while any stale counts are discarded.
    docs_with_word[:] = [
        sum(1 for vocabulary in doc_vocabularies if word in vocabulary)
        for word in words
    ]
    print(f'No. of docs containing each word: {docs_with_word}')

    total_docs = len(tokenized_docs)
    for word, doc_count in zip(words, docs_with_word):
        idf = math.log(total_docs / doc_count) if doc_count else 0.0
        idf_scores.append((word, idf))

In [19]:
# Count how many documents contain each word and fill idf_scores; the call
# also prints the per-word document counts.
idf_score(corpus, tokenized_docs)

No. of docs containing each word: [3, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [20]:
# 'a' scores 0.0 because it is present in all 3 docs (log(3/3) = 0), so it
# carries no weight; a word found in only one doc scores log(3/1) ~ 1.0986.

idf_scores[0:10] 

[('a', 0.0),
 ('second', 1.0986122886681098),
 ('attempt', 1.0986122886681098),
 ('to', 0.0),
 ('pass', 1.0986122886681098),
 ('bipartisan', 1.0986122886681098),
 ('disaster', 1.0986122886681098),
 ('aid', 1.0986122886681098),
 ('package', 1.0986122886681098),
 ('has', 0.4054651081081644)]

## Generate TF-IDF(t, d) scores

In [21]:
# Preview tf_scores: a list with one inner list per document, each holding
# (doc label, word, TF) tuples. Index [2] of a tuple is the TF value itself.

print(tf_scores[0][0:5])

print('\n')
print(tf_scores[0][0][2])

[('Doc1', 'a', 0.08620689655172414), ('Doc1', 'second', 0.017241379310344827), ('Doc1', 'attempt', 0.017241379310344827), ('Doc1', 'to', 0.034482758620689655), ('Doc1', 'pass', 0.017241379310344827)]


0.08620689655172414


In [22]:
# TF-IDF score
# Multiply the term frequency (TF) by the weight (IDF), for the words in every doc

tfidf_scores = []
def tfidf_score(tf_scores, idf_scores):
    """Append per-document TF-IDF tuples to the module-level ``tfidf_scores``.

    Parameters
    ----------
    tf_scores : list of list of tuple
        One list per document of ``(doc_label, word, tf)`` tuples.
    idf_scores : list of tuple
        ``(word, idf)`` tuples; entry k is paired with entry k of every
        document's TF list.

    Returns
    -------
    None
        For each document a list of ``(doc_label, tf_word, idf_word, tf * idf)``
        tuples is appended to ``tfidf_scores``. Both word fields are kept, so a
        mismatch in ordering between the two inputs is visible in the output.
    """
    for doc_tf in tf_scores:
        weighted = [
            (
                entry[0],
                entry[1],
                idf_scores[position][0],
                entry[2] * idf_scores[position][1],
            )
            for position, entry in enumerate(doc_tf)
        ]
        tfidf_scores.append(weighted)

In [23]:
# Fill tfidf_scores by combining the TF and IDF results.
tfidf_score(tf_scores, idf_scores)

In [24]:
# Display the complete result: every (doc label, word, word, TF-IDF) tuple for
# every document.
tfidf_scores

[[('Doc1', 'a', 'a', 0.0),
  ('Doc1', 'second', 'second', 0.01894159118393293),
  ('Doc1', 'attempt', 'attempt', 0.01894159118393293),
  ('Doc1', 'to', 'to', 0.0),
  ('Doc1', 'pass', 'pass', 0.01894159118393293),
  ('Doc1', 'bipartisan', 'bipartisan', 0.01894159118393293),
  ('Doc1', 'disaster', 'disaster', 0.01894159118393293),
  ('Doc1', 'aid', 'aid', 0.01894159118393293),
  ('Doc1', 'package', 'package', 0.01894159118393293),
  ('Doc1', 'has', 'has', 0.006990777726002834),
  ('Doc1', 'failed', 'failed', 0.01894159118393293),
  ('Doc1', 'after', 'after', 0.03788318236786586),
  ('Doc1', 'house', 'house', 0.03788318236786586),
  ('Doc1', 'republican', 'republican', 0.01894159118393293),
  ('Doc1', 'voted', 'voted', 0.01894159118393293),
  ('Doc1', 'on', 'on', 0.006990777726002834),
  ('Doc1', 'tuesday', 'tuesday', 0.006990777726002834),
  ('Doc1', 'against', 'against', 0.01894159118393293),
  ('Doc1', 'bill', 'bill', 0.01894159118393293),
  ('Doc1', 'that', 'that', 0.0),
  ('Doc1', 'w

In [None]:
# Each set of TF-IDF scores (145 x 1 vector) is then input into a classification algorithm of choice