# Text Analysis Pipeline

## Overview
This project implements a text analysis pipeline that processes documents using NLP techniques. The pipeline transforms raw text into meaningful features using TF-IDF vectorization.

## Main Components

### Data Processing
- Text normalization (lowercase, punctuation removal)
- Word tokenization
- Stopword removal
- Porter stemming
- Dictionary-based word filtering

### Feature Engineering
- Term Frequency (TF) calculation
- Inverse Document Frequency (IDF) computation
- TF-IDF matrix generation

### Output Generation
- Matrix of TF-IDF scores (`matrix.txt`)
- Top 3 most frequent words per category (`frequency.json`)
- Top 3 words by average TF-IDF score per category (`scores.json`)

## Data Requirements
- Input: `24_train_2.csv` with `Text` and `Category` columns
- Dictionary: `dictionary.txt` with one predefined keyword per line (keywords are matched against stemmed tokens)
- Python packages: NLTK, pandas, numpy

In [1]:
# Install the third-party packages this notebook depends on
%pip install nltk pandas numpy ipykernel

Note: you may need to restart the kernel to use updated packages.


In [2]:
# import dependencies
import pandas as pd
import numpy as np
from collections import Counter
import json
import csv
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

In [3]:
# Download the NLTK data packages this notebook relies on: the 'punkt' and
# 'punkt_tab' tokenizer data and the 'stopwords' corpus
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Porter stemmer from NLTK, created once and reused for every token
stemmer = PorterStemmer()

# Translation table that maps every punctuation code point to None, so that
# str.translate() deletes punctuation characters
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

# Read the training data
df = pd.read_csv('24_train_2.csv')

# Read the keyword dictionary, one entry per line
with open('dictionary.txt') as keywords:
    dictionary = keywords.read().splitlines()

# Set view of the dictionary entries
dictionary_set = set(dictionary)

# English stopwords collected into a set up front
stop_words = set(stopwords.words('english'))


In [5]:
# Text preprocessing: turn one raw document into a list of stemmed tokens
def get_tokens(text):
    """Return the stemmed, stopword-free tokens of *text*.

    The text is lowercased, stripped of punctuation via
    ``remove_punctuation_map``, and tokenized with ``nltk.word_tokenize``.
    Tokens found in ``stop_words`` are dropped; the remaining ones are
    stemmed with ``stemmer`` and returned in their original order.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    kept = []
    for word in nltk.word_tokenize(cleaned):
        if word in stop_words:
            continue
        kept.append(stemmer.stem(word))
    return kept

# Apply the get_tokens function to the 'Text' column and store results in a new 'Unigrams' column
df['Unigrams'] = df['Text'].apply(get_tokens)

# Filter the unigrams to include only the words predefined in the dictionary
def filter_unigrams(tokens):
    """Return the tokens that appear in the keyword dictionary, order kept.

    Membership is tested against ``dictionary_set`` (the set built from the
    dictionary list) instead of the list itself, so each lookup is a hash
    probe rather than a linear scan over every dictionary entry.
    """
    return [token for token in tokens if token in dictionary_set]

# Replace each document's token list with its dictionary-filtered version
df['Unigrams'] = df['Unigrams'].apply(filter_unigrams)

In [6]:
# Calculate the term frequency
def compute_tf(tokens):
    """Return the max-normalized term frequency of each distinct token.

    Each token's count is divided by the count of the most frequent token
    in the same document, so the most frequent token gets 1.0.

    A document with no tokens (for example, one in which no word survived
    the dictionary filter) yields an empty dict instead of raising
    ``ValueError`` from ``max()`` on an empty sequence.
    """
    counts = Counter(tokens)
    if not counts:
        return {}
    max_freq = max(counts.values())
    return {word: freq / max_freq for word, freq in counts.items()}

# Per-document term frequencies, as computed by compute_tf
df['TF'] = df['Unigrams'].apply(compute_tf)

# One row per document, one column per dictionary entry
tf_matrix = np.zeros((len(df), len(dictionary)))

# Column index of each dictionary word, built once so the loop below does a
# hash lookup instead of scanning the dictionary list for every word.
# setdefault keeps the first position of a repeated word, like list.index().
dictionary_index = {}
for col_idx, word in enumerate(dictionary):
    dictionary_index.setdefault(word, col_idx)

# Populate the TF matrix
for idx, tf_dict in enumerate(df['TF']):
    for word, tf_value in tf_dict.items():
        col_idx = dictionary_index.get(word)
        # Words that are not in the dictionary have no column and are skipped
        if col_idx is not None:
            tf_matrix[idx, col_idx] = tf_value

In [7]:
# Document frequency: the first step towards the Inverse Document Frequency
def compute_df(unigrams):
    """Count, for every token, the number of documents that contain it.

    *unigrams* is an iterable of token lists, one per document. A token is
    counted at most once per document, however often it repeats there.
    Returns a ``Counter`` mapping token -> document count.
    """
    doc_counts = Counter()
    for tokens in unigrams:
        # Deduplicate first so each document contributes at most 1 per token
        doc_counts.update(set(tokens))
    return doc_counts

# Document frequency of every token across all documents
df_values = compute_df(df['Unigrams'])

# Number of documents
N = len(df)

# Calculate IDF for each word in the dictionary
idf_values = {}
for word in dictionary:
    df_word = df_values.get(word, 0)  # 0 when the word occurs in no document
    # A dictionary word that never occurs would make N / df_word divide by
    # zero. Such a word has a TF of zero in every document, so its TF-IDF is
    # 0 whatever IDF is chosen; 0.0 is used to keep the vector finite.
    idf_values[word] = math.log(N / df_word) if df_word else 0.0

In [8]:
# Lay the IDF values out in dictionary order so they line up with the
# columns of the TF matrix
idf_vector = np.array([idf_values[term] for term in dictionary])
print(idf_vector)

# Broadcasting scales column j of the TF matrix by idf_vector[j]; the product
# is then rounded to 4 decimal places
tfidf_matrix = np.round(np.multiply(tf_matrix, idf_vector), 4)

# Write the TF-IDF matrix to 'matrix.txt' as comma-separated values
np.savetxt('matrix.txt', tfidf_matrix, fmt='%.4f', delimiter=',', encoding='utf-8')

[1.44392347 2.61729584 1.80180981 2.65926004 2.60369019 2.24431618
 2.88240359 3.05760768 2.71810054 3.05760768 2.79688141 2.57702194
 2.28278247 2.81341072 2.71810054 2.67364877 2.02495336 1.69281952
 1.98050159 3.07911388 3.01593498 2.67364877 2.90042209 2.76462055
 2.84731227 2.79688141 2.71810054 2.67364877 3.14655516 2.16282315
 2.79688141 2.61729584 1.84516025 2.44184716 2.70306266 2.88240359
 2.3859667  2.37515579 2.11196473 2.93746337 2.68824757 3.41124772
 1.29098418 3.07911388 3.38139475 2.78062089 2.97592965 3.01593498
 2.70306266 2.79688141 2.39689577 2.46510402 2.97592965 2.3330443
 3.35240722 2.29263476 3.14655516 3.77226106 3.54045945 3.32423634
 1.94491065 2.43041846 1.72036947 0.57270103 2.12026354 2.47693848
 2.93746337 2.51330612 2.30258509 2.73336801 2.08747371 2.02495336
 2.71810054 1.96611286 3.44201938 3.19418321 2.91877123 2.17155683
 3.72970145 3.07911388 3.29683737 2.95651156 2.51330612 2.73336801
 1.98777435 3.47376807 3.54045945 3.44201938 2.18925641 2.56394

In [9]:
# Compute the top 3 most frequent words and top 3 TF-IDF words for each category
frequency = {}
scores = {}

# Column of each dictionary word in tfidf_matrix, built once instead of
# copying and scanning the dictionary list for every word of every category.
# setdefault keeps the first position of a repeated word, like list.index().
word_to_column = {}
for position, term in enumerate(dictionary):
    word_to_column.setdefault(term, position)

# Looping through every category
for category in df['Category'].unique():
    # Calculate the top 3 most frequent words
    category_df = df[df['Category'] == category]
    # Flatten the category's token lists in a single pass; sum(lists, [])
    # re-copies the accumulated list on every addition
    all_tokens = [token for tokens in category_df['Unigrams'] for token in tokens]
    word_counts = Counter(all_tokens)
    frequency[category] = dict(word_counts.most_common(3))

    # Calculate the average TF-IDF of every word seen in this category.
    # NOTE(review): the mean is taken over ALL rows of tfidf_matrix, not only
    # the rows of this category's documents - confirm this is intended.
    avg_tfidf = {}
    for word in word_counts:
        tfidf_column = tfidf_matrix[:, word_to_column[word]]
        avg_tfidf[word] = tfidf_column.mean()

    # Get the top 3 words with the highest average TF-IDF scores
    top_tfidf_words = sorted(avg_tfidf.items(), key=lambda item: item[1], reverse=True)[:3]

    # Format the scores in the desired structure
    scores[category] = {word: round(score, 4) for word, score in top_tfidf_words}


# Save the results to JSON files
with open('frequency.json', 'w') as f:
    json.dump(frequency, f, indent=4)

with open('scores.json', 'w') as f:
    json.dump(scores, f, indent=4)
