In [1]:
import functools
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import *

In [2]:
# Reading the CSV into a dataframe fails with
# "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa3 in position 2245: invalid start byte",
# so let chardet guess the encoding from the first 10000 bytes of the file.

import chardet

with open('24_train_2.csv', 'rb') as raw_file:
    sample = raw_file.read(10000)

result = chardet.detect(sample)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [3]:
# Decode with the encoding chardet reported for this file; a UTF-8 decode raises UnicodeDecodeError.
df = pd.read_csv('24_train_2.csv', encoding='ISO-8859-1')

In [4]:
# Preview the first rows to check that the file decoded and parsed as expected.
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1429,sfa awaits report over mikoliunas the scottish...,sport
1,1896,parmalat to return to stockmarket parmalat th...,business
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport
3,2178,henman decides to quit davis cup tim henman ha...,sport
4,194,french suitor holds lse meeting european stock...,business


In [5]:
# Porter stemmer from nltk, created once here and used by get_tokens() for every token
stemmer = PorterStemmer()

# Translation table for str.translate(): each punctuation code point maps to None, i.e. is deleted
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

In [6]:
@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset.

    The corpus is read on the first call only and the result is cached, so
    this function may be defined before the 'stopwords' corpus is downloaded.
    """
    return frozenset(stopwords.words('english'))


def get_tokens(text):
    """Convert one document into its list of stemmed unigrams.

    Pipeline: lowercase -> delete punctuation -> tokenize -> drop English
    stop words -> Porter-stem.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens in document order, repeats included.
    """
    # turn document into lowercase
    lowers = text.lower()

    # remove punctuation (characters are deleted, not replaced by a space)
    no_punctuation = lowers.translate(remove_punctuation_map)

    # tokenize document
    tokens = nltk.word_tokenize(no_punctuation)

    # remove stop words; the set is fetched once per call instead of
    # re-reading the corpus and scanning a list for every single token
    stop_words = _english_stopwords()
    filtered = [token for token in tokens if token not in stop_words]

    # stem what is left; these are the final unigrams
    return [stemmer.stem(token) for token in filtered]

In [7]:
# Fetch the NLTK packages needed by get_tokens(): the tokenizer models and the stop-word corpus
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Tokenize, filter and stem every article's text
df['unigrams'] = df['Text'].apply(get_tokens)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,ArticleId,Text,Category,unigrams
0,1429,sfa awaits report over mikoliunas the scottish...,sport,"[sfa, await, report, mikoliuna, scottish, foot..."
1,1896,parmalat to return to stockmarket parmalat th...,business,"[parmalat, return, stockmarket, parmalat, ital..."
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport,"[edu, blast, arsen, arsen, brazilian, midfield..."
3,2178,henman decides to quit davis cup tim henman ha...,sport,"[henman, decid, quit, davi, cup, tim, henman, ..."
4,194,french suitor holds lse meeting european stock...,business,"[french, suitor, hold, lse, meet, european, st..."


In [8]:
# Read the provided 'dictionary.txt' line by line into a set of
# whitespace-stripped, lowercased entries
with open('dictionary.txt', 'r') as dictionary_file:
    allowed_words = {entry.strip().lower() for entry in dictionary_file}

# Restrict a token list to the permitted vocabulary
def filter_unigrams(unigrams, allowed_words):
    """Return the tokens of `unigrams` found in `allowed_words`, keeping order and repeats."""
    kept = []
    for token in unigrams:
        if token in allowed_words:
            kept.append(token)
    return kept

# Keep only dictionary words in each article's token list
df['filtered_unigrams'] = df['unigrams'].apply(filter_unigrams, args=(allowed_words,))

df.head()

Unnamed: 0,ArticleId,Text,Category,unigrams,filtered_unigrams
0,1429,sfa awaits report over mikoliunas the scottish...,sport,"[sfa, await, report, mikoliuna, scottish, foot...","[report, scottish, footbal, associ, refere, re..."
1,1896,parmalat to return to stockmarket parmalat th...,business,"[parmalat, return, stockmarket, parmalat, ital...","[return, compani, went, account, hope, back, s..."
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport,"[edu, blast, arsen, arsen, brazilian, midfield...","[arsen, arsen, hit, club, offer, new, contract..."
3,2178,henman decides to quit davis cup tim henman ha...,sport,"[henman, decid, quit, davi, cup, tim, henman, ...","[decid, quit, davi, cup, great, britain, davi,..."
4,194,french suitor holds lse meeting european stock...,business,"[french, suitor, hold, lse, meet, european, st...","[french, hold, meet, european, stock, market, ..."


In [9]:
import numpy as np
import pandas as pd
import math
from collections import Counter

# Token lists produced by the dictionary filter, one list per article
docs = df['filtered_unigrams'].tolist()

# Vocabulary in file order; this order defines the matrix columns.
# NOTE(review): entries are stripped but not lowercased here, whereas
# `allowed_words` lowercases them. Tokens are lowercased in get_tokens(), so an
# entry containing uppercase would presumably get an all-zero column - confirm
# against dictionary.txt.
with open('dictionary.txt') as f:
    dictionary = [line.strip() for line in f]

# Initialize variables
n_docs = len(docs)
n_words = len(dictionary)
tf_matrix = np.zeros((n_docs, n_words))

# Calculate TF: count of each dictionary word divided by the count of the
# document's most frequent dictionary word (divisor 1 when none occurs)
for i, doc in enumerate(docs):
    # One pass over the document instead of one doc.count() scan per dictionary word
    counts = Counter(doc)
    row = np.array([counts.get(word, 0) for word in dictionary], dtype=float)
    max_count = row.max() if row.any() else 1
    tf_matrix[i] = row / max_count

# Calculate IDF: ln(n_docs / document frequency), 0 for words that never occur.
# tf > 0 exactly when the word occurs in the document, so the document
# frequency is read off the TF matrix instead of re-scanning every document.
doc_counts = (tf_matrix > 0).sum(axis=0).tolist()
idf_vector = np.array(
    [math.log(n_docs / doc_count) if doc_count > 0 else 0.0 for doc_count in doc_counts],
    dtype=float,
)

# Calculate TFIDF (idf_vector broadcasts across the rows)
tfidf_matrix = tf_matrix * idf_vector

# Round to 4 decimal places
tfidf_matrix = np.round(tfidf_matrix, 4)

# Convert to DataFrame to save as 'matrix.txt'
tfidf_df = pd.DataFrame(tfidf_matrix, columns=dictionary)
tfidf_df.to_csv('matrix.txt', sep=',', index=False, encoding='utf-8')

tfidf_df.head()

Unnamed: 0,000,1,10,100,11,12,13,14,15,16,...,worth,would,write,wrong,year,yet,york,young,yuko,zealand
0,0.2888,0.0,0.3604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5154,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.6293,0.0,0.0,0.0,0.1052,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.4432,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1374,0.0,0.0,0.1052,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5726,0.0,0.0,0.0,0.0,0.4581,0.0,0.0,0.0


In [22]:
import json
from collections import Counter

categories = df['Category'].tolist()
filtered_unigrams = df['filtered_unigrams'].tolist()  # Use 'filtered_unigrams' instead of raw text

# Row positions of the articles in each category, collected in a single pass
rows_by_category = {}
for row, category in enumerate(categories):
    rows_by_category.setdefault(category, []).append(row)

# Results for the two JSON files, keyed by category
top_frequency_words_by_category = {}
top_tfidf_words_by_category = {}

# Iterate in sorted order: the order of a set of strings depends on per-session
# hash randomisation, which would reorder the keys of the JSON files between runs.
for category in sorted(rows_by_category):
    indices = rows_by_category[category]

    # Raw frequency of every word across the category's articles
    word_count = Counter()
    for row in indices:
        word_count.update(filtered_unigrams[row])

    # Top 3 most frequent words, mapped to their counts
    top_frequency_words_by_category[category] = dict(word_count.most_common(3))

    # Mean TF-IDF of each word over the category's rows of tfidf_df,
    # rounded to 4 decimals; keep the 3 highest
    category_tfidf = np.round(tfidf_df.iloc[indices].mean(axis=0), 4)
    top_tfidf_words_by_category[category] = category_tfidf.nlargest(3).to_dict()

# Prepare the JSON structure for frequency.json (Frequency)
frequency_data = json.dumps(top_frequency_words_by_category, indent=4)

# Prepare the JSON structure for scores.json (TF-IDF)
scores_data = json.dumps(top_tfidf_words_by_category, indent=4)

# Verify the contents
print("\nFrequency:")
print(frequency_data)
print("Scores (TF-IDF):")
print(scores_data)

with open('frequency.json', 'w') as json_file:
    json.dump(top_frequency_words_by_category, json_file, indent=4)

with open('scores.json', 'w') as json_file:
    json.dump(top_tfidf_words_by_category, json_file, indent=4)



Frequency:
{
    "tech": {
        "said": 757,
        "use": 459,
        "peopl": 427
    },
    "entertainment": {
        "film": 450,
        "said": 386,
        "year": 249
    },
    "politics": {
        "said": 996,
        "mr": 726,
        "would": 495
    },
    "sport": {
        "said": 428,
        "game": 353,
        "win": 288
    },
    "business": {
        "said": 724,
        "us": 377,
        "year": 360
    }
}
Scores (TF-IDF):
{
    "tech": {
        "mobil": 0.3463,
        "phone": 0.3319,
        "softwar": 0.3152
    },
    "entertainment": {
        "film": 0.7216,
        "award": 0.4106,
        "star": 0.408
    },
    "politics": {
        "labour": 0.4511,
        "elect": 0.4314,
        "mr": 0.4204
    },
    "sport": {
        "game": 0.3573,
        "england": 0.3191,
        "win": 0.3074
    },
    "business": {
        "firm": 0.2891,
        "bank": 0.2697,
        "market": 0.2616
    }
}
