In [65]:
import pandas as panda
from google.colab import drive
# Mount Google Drive so files under /content/drive become readable from this runtime
drive.mount('/content/drive')
# Load the training articles; the CSV is decoded as ISO-8859-1 rather than the UTF-8 default
data = panda.read_csv(
    '/content/drive/My Drive/Colab Notebooks/24_train_2.csv',
    encoding='ISO-8859-1',
)
def load_dictionary_list(file_path):
    """Return the lines of the file at ``file_path`` as a list, in file order.

    Line terminators are stripped; duplicate lines are kept.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()
def load_dictionary_set(file_path):
    """Return the distinct lines of the file at ``file_path`` as a set.

    Line terminators are stripped; ordering and duplicates are discarded.
    """
    with open(file_path, 'r') as handle:
        entries = handle.read().splitlines()
    return set(entries)
# The same word list is loaded twice: as a list (keeps file order) and as a set
# (constant-time membership checks).
DICTIONARY_PATH = '/content/drive/My Drive/Colab Notebooks/dictionary.txt'
dictionary_list = load_dictionary_list(DICTIONARY_PATH)
dictionary_set = load_dictionary_set(DICTIONARY_PATH)
print(data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
     ArticleId                                               Text  \
0         1429  sfa awaits report over mikoliunas the scottish...   
1         1896  parmalat to return to stockmarket parmalat  th...   
2         1633  edu blasts arsenal arsenal s brazilian midfiel...   
3         2178  henman decides to quit davis cup tim henman ha...   
4          194  french suitor holds lse meeting european stock...   
..         ...                                                ...   
995       1250  blair  damaged  by blunkett row a majority of ...   
996       1639  a november to remember last saturday  one news...   
997        916  highbury tunnel players in clear the football ...   
998       2217  top stars join us tsunami tv show brad pitt  r...   
999        902  eastwood s baby scoops top oscars clint eastwo...   

          Category  
0            sport  


In [66]:
import string
import nltk
import numpy as numpy
# Download the NLTK data packages needed by this notebook
for nltk_package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_package)
from nltk.corpus import stopwords
from nltk.stem.porter import *

# Stemming tool from nltk: a single PorterStemmer instance, created once at
# module level and reused by get_tokens for every token.
stemmer = PorterStemmer()

# Translation table for str.translate: every code point in string.punctuation
# maps to None, which deletes that character from the text.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

def get_tokens(text):
    """Turn a raw document into a list of stemmed unigrams.

    Steps, in order: lowercase the text, delete punctuation via
    ``remove_punctuation_map``, tokenize with ``nltk.word_tokenize``, drop
    English stop words, and stem the remaining tokens with ``stemmer``.

    Parameters
    ----------
    text : str
        The document to process.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates kept).
    """
    # Fetch the stop-word list once per call and hold it in a set, so the
    # filter below does an O(1) lookup per token instead of calling
    # stopwords.words('english') and scanning a list for every token.
    stop_words = set(stopwords.words('english'))
    # Turn document into lowercase
    lowers = text.lower()
    # Remove punctuation
    no_punctuation = lowers.translate(remove_punctuation_map)
    # Tokenize document
    tokens = nltk.word_tokenize(no_punctuation)
    # Remove stop words, then stem what is left
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [67]:
# Tokenize, stop-word-filter and stem every article text
processed_texts = data['Text'].apply(get_tokens)
# Keep only the stems that are present in the reference dictionary
filter_words = processed_texts.apply(
    lambda stems: [stem for stem in stems if stem in dictionary_set]
)

In [68]:
def calculate_idf(filtered_words, dictionary_list):
    """Compute the inverse document frequency of every dictionary term.

    ``idf[term] = log(N / df)`` where ``N`` is the number of documents and
    ``df`` is the number of documents containing ``term``. A term that occurs
    in no document uses ``df = 1`` to avoid dividing by zero.

    Parameters
    ----------
    filtered_words : sequence (or Series) of token lists
        One list of tokens per document.
    dictionary_list : list of str
        Terms to score; the returned dict follows this order.

    Returns
    -------
    dict
        Maps each term to its IDF value (natural logarithm).
    """
    n_docs = len(filtered_words)
    vocabulary = set(dictionary_list)
    # Document frequency in a single pass over the corpus: each document
    # contributes at most once per term, and the set intersection replaces a
    # linear ``term in doc`` scan for every (term, document) pair.
    doc_freq = dict.fromkeys(vocabulary, 0)
    for doc in filtered_words:
        for term in vocabulary.intersection(doc):
            doc_freq[term] += 1
    idf = {}
    for term in dictionary_list:
        # ``or 1`` substitutes 1 when the term never occurs (df == 0)
        idf[term] = numpy.log(n_docs / (doc_freq[term] or 1))
    return idf

In [69]:
def calculate_tf(filtered_words, dictionary_list):
    """Build the max-normalised term-frequency matrix.

    Entry ``[i, j]`` is the count of ``dictionary_list[j]`` in document ``i``
    divided by the count of that document's most frequent token. A document
    with no tokens produces a row of zeros.

    Returns a float array of shape (number of documents, number of terms).
    """
    n_docs, n_terms = len(filtered_words), len(dictionary_list)
    tf_matrix = numpy.zeros((n_docs, n_terms))
    for row, tokens in enumerate(filtered_words):
        # Raw occurrence counts for this document
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        # Normaliser: highest count in the document, or 1 when it is empty
        peak = max(counts.values()) if counts else 1
        tf_matrix[row, :] = [counts.get(term, 0) / peak for term in dictionary_list]
    return tf_matrix

In [70]:
def compute_matrix(filtered_words, dictionary_list):
    """Compute the TF-IDF matrix for a corpus.

    Combines ``calculate_tf`` and ``calculate_idf``: each term-frequency
    column is scaled by the IDF weight of its dictionary term.

    Returns a ``(tfidf, idf)`` tuple: the weighted matrix (documents x terms)
    and the term -> IDF dict.
    """
    tf_matrix = calculate_tf(filtered_words, dictionary_list)
    idf = calculate_idf(filtered_words, dictionary_list)
    # One weight per column, in dictionary order; broadcasting applies
    # idf_weights[j] to every row of column j.
    idf_weights = numpy.array([idf[term] for term in dictionary_list], dtype=float)
    tfidf = tf_matrix * idf_weights
    return tfidf, idf

In [71]:
# Build the TF-IDF matrix (one row per article, one column per dictionary term)
# and keep the per-term IDF weights returned alongside it.
tfidf,idf=compute_matrix(filter_words,dictionary_list)

In [72]:
# Write the TF-IDF matrix as text: one row per line, comma-separated,
# each value rounded to 4 decimal places.
with open('/content/drive/My Drive/Colab Notebooks/matrix.txt', 'w', encoding='utf-8') as file:
    for row in tfidf:
        file.write(','.join(f'{value:.4f}' for value in row) + '\n')

In [73]:
from collections import Counter
import json
def compute_top_words_by_category(data, tokens=None, top_n=3):
    """Find the most frequent tokens within each article category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Category' column. It is not modified.
    tokens : Series or sequence of token lists, optional
        One token list per row of ``data`` (a Series is aligned on the index).
        When omitted, the notebook-level ``filter_words`` is used, so existing
        single-argument calls behave as before.
    top_n : int, default 3
        Number of tokens to report per category.

    Returns
    -------
    dict
        ``{category: {token: count}}`` with tokens in descending count order;
        categories appear in order of first occurrence in ``data``.
    """
    if tokens is None:
        # Fall back to the module-level token column prepared earlier
        tokens = filter_words
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # 'tokens' column as a side effect of calling this function.
    tagged = data.assign(tokens=tokens)
    category_top_words = {}
    for category in tagged['Category'].unique():
        word_frequency = Counter()
        for doc_tokens in tagged.loc[tagged['Category'] == category, 'tokens']:
            word_frequency.update(doc_tokens)
        category_top_words[category] = dict(word_frequency.most_common(top_n))
    return category_top_words

In [74]:
top_three_words = compute_top_words_by_category(data)
# Save the per-category word counts as indented JSON (non-ASCII characters kept as-is)
with open("/content/drive/My Drive/Colab Notebooks/frequency.json", "w") as frequency_file:
    json.dump(
        top_three_words,
        frequency_file,
        ensure_ascii=False,
        indent=4,
    )

In [75]:
# Show the per-category top-word counts as the cell output
top_three_words

{'sport': {'said': 428, 'game': 353, 'win': 288},
 'business': {'said': 724, 'us': 377, 'year': 360},
 'tech': {'said': 757, 'use': 459, 'peopl': 427},
 'entertainment': {'film': 450, 'said': 386, 'year': 249},
 'politics': {'said': 996, 'mr': 726, 'would': 495}}

In [76]:
def compute_top_tfidf_by_category(data, tfidf_matrix, dictionary_list, top_n=3):
    """Find the terms with the highest mean TF-IDF score in each category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Category' column; row ``k`` of ``data`` (by position)
        corresponds to row ``k`` of ``tfidf_matrix``.
    tfidf_matrix : array-like, shape (len(data), len(dictionary_list))
        TF-IDF scores, one row per document and one column per term.
    dictionary_list : list of str
        Term for each matrix column.
    top_n : int, default 3
        Number of terms to report per category.

    Returns
    -------
    dict
        ``{category: {term: mean_score}}`` with terms in descending score
        order (ties keep dictionary order); scores are plain Python floats.
    """
    tfidf_matrix = numpy.asarray(tfidf_matrix)
    categories = data['Category']
    category_dic = {}
    for category in categories.unique():
        # Select rows by position with a boolean mask. Using the DataFrame's
        # index labels as row numbers is only correct for a default 0..n-1
        # index and picks wrong rows (or raises) otherwise.
        row_mask = (categories == category).to_numpy()
        req_average = numpy.mean(tfidf_matrix[row_mask], axis=0)
        # Stable descending sort: equal scores stay in column order
        ranked = sorted(enumerate(req_average), key=lambda pair: pair[1], reverse=True)
        category_dic[category] = {
            dictionary_list[column]: float(score) for column, score in ranked[:top_n]
        }
    return category_dic

In [77]:
category_dic = compute_top_tfidf_by_category(data, tfidf, dictionary_list)
# Save the per-category top TF-IDF terms as indented JSON (non-ASCII characters kept as-is)
with open("/content/drive/My Drive/Colab Notebooks/scores.json", "w") as scores_file:
    json.dump(
        category_dic,
        scores_file,
        ensure_ascii=False,
        indent=4,
    )

In [78]:
# Show the per-category top TF-IDF terms as the cell output
category_dic

{'sport': {'game': 0.3572741464870881,
  'england': 0.31907434737608514,
  'win': 0.3074106799706846},
 'business': {'firm': 0.2891252868078186,
  'bank': 0.2697288199539767,
  'market': 0.26162908341553825},
 'tech': {'mobil': 0.34627148373030014,
  'phone': 0.3319065027131584,
  'softwar': 0.3152238172837377},
 'entertainment': {'film': 0.7216412939111394,
  'award': 0.4106447057087541,
  'star': 0.40803563438879187},
 'politics': {'labour': 0.4510503671418209,
  'elect': 0.4313731783204545,
  'mr': 0.42043597469422206}}