In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import sys

# Make the project-local `preprocessing` package importable.
# The import that follows (`from preprocessing.preprocessor import ...`) only
# resolves if the folder that CONTAINS the package is on sys.path; the notebook
# is expected to be run from a sub-folder one level below that folder.
current_dir = os.getcwd()

# Parent of the working directory (not the 'preprocessing' folder itself),
# normalised to an absolute path so no literal '..' segment ends up in sys.path
preprocessing_path = os.path.abspath(os.path.join(current_dir, '..'))

# Register the folder only once: re-running this cell must not pile up
# duplicate sys.path entries
if preprocessing_path not in sys.path:
    sys.path.append(preprocessing_path)
from preprocessing.preprocessor import preprocessor

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (3.9+)
# look it up under the name 'punkt_tab', while older releases use the pickled
# 'punkt' package. Request both so the notebook runs on a fresh environment
# with either version; on an older NLTK the 'punkt_tab' request just reports
# that the package is unknown and returns False without raising.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apatk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the dataset from the project's data folder (path is relative to the
# notebook's working directory)
df = pd.read_csv("../data/bbc_text_cls.csv")

In [3]:
# Preview the first rows of the freshly loaded frame to check its columns
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
# Run the project's `preprocessor` over every article and store the result back
# in the 'text' column. (What the preprocessor does exactly is defined in
# preprocessing/preprocessor.py, which is not visible from this notebook.)
#
# Series.map visits each value once and assigns the whole column in one step,
# instead of building a Series per row with iterrows() and writing each cell
# back through a label-based .loc lookup.
#
# NOTE: this overwrites the raw text in place. Re-running the cell preprocesses
# already-preprocessed text, so reload `df` first if the raw text is needed again.
df["text"] = df["text"].map(lambda raw_text: preprocessor(raw_text).preprocess())

In [8]:
# Preview the frame again to inspect the 'text' column after preprocessing
df.head()

Unnamed: 0,text,labels
0,ad sales boost time warner profit quarterly pr...,business
1,dollar gains greenspan speech the dollar hit h...,business
2,yukos unit buyer faces loan claim the owners e...,business
3,high fuel prices hit bas profits british airwa...,business
4,pernod takeover talk lifts domecq shares uk dr...,business


### Convert this text data to numerical form using the TF-IDF vectorization technique

In [9]:
# Encode every document as a list of integer token ids, growing the
# vocabulary (token -> id) as new tokens are encountered.
word2idx = {}
tokenized_docs = []
for text in df['text']:
    token_ids = []
    for token in word_tokenize(text):
        # A token seen for the first time receives the next unused id
        # (ids are handed out consecutively, starting at 0); a known token
        # simply yields the id it already has.
        token_ids.append(word2idx.setdefault(token, len(word2idx)))
    tokenized_docs.append(token_ids)

# Next free id; equals the vocabulary size once all documents are encoded
idx = len(word2idx)

In [12]:
# Inverse lookup table: integer id -> token
idx2Word = {token_id: token for token, token_id in word2idx.items()}

In [13]:
# Corpus size: one document per row of the frame
N = len(df.index)

# Vocabulary size: number of distinct tokens collected in word2idx
V = len(word2idx)

In [15]:
# Term-frequency matrix: one row per document, one column per vocabulary id,
# each cell holding the raw count of that token in that document.
# (A CountVectorizer could have been used instead of building it by hand.)
tf = np.zeros(shape=(N, V))

# Every occurrence of a token id bumps its cell in the document's row
for doc_row, token_ids in enumerate(tokenized_docs):
    for token_id in token_ids:
        tf[doc_row, token_id] += 1

In [16]:
# Inverse document frequency.
# document_freq[j] = number of documents in which vocabulary entry j occurs
# at least once; the result has shape (V,).
document_freq = (tf > 0).sum(axis=0)

# idf[j] = log(N / document_freq[j])
idf = np.log(N / document_freq)

In [17]:
# TF-IDF: scale every term-frequency column by its idf weight.
# idf has shape (V,), so it broadcasts across the N rows of tf.
tf_idf = np.multiply(tf, idf)

In [18]:
# Display the TF-IDF matrix (rows = documents, columns = vocabulary ids);
# NumPy abbreviates the middle of a large array with '...'
tf_idf

array([[5.06845486, 9.5185441 , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.70751219, 7.70751219,
        7.70751219]])