**Importing Libraries**

In [231]:
import pandas as pd
import numpy as np

**Understanding Data**

In [232]:
df = pd.read_csv('24_train_1.csv')

In [233]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1429,sfa awaits report over mikoliunas the scottish...,sport
1,1896,parmalat to return to stockmarket parmalat th...,business
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport
3,2178,henman decides to quit davis cup tim henman ha...,sport
4,194,french suitor holds lse meeting european stock...,business


**Importing nltk libraries**

In [234]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Loading data from dictionary**

In [238]:
# Read the dictionary.txt file and create a set of valid words
with open('dictionary.txt', 'r') as file:
    dictionary_words = set(word.strip() for word in file)

In [239]:
print(dictionary_words)

{'univers', 'chelsea', 'credit', 'manchest', 'polic', 'premiership', 'attract', 'specul', 'britain', 'accord', 'site', 'alway', 'stand', 'women', 'higher', 'set', 'name', 'minut', 'appl', 'sunday', 'situat', 'children', 'owner', 'second', 'west', 'write', 'thursday', 'peter', 'appeal', 'travel', 'worri', 'mini', 'figur', 'attend', 'connect', 'describ', 'reject', 'warn', 'manufactur', 'forecast', 'similar', 'blog', 'european', 'best', 'budget', 'debat', 'domin', 'potenti', 'irish', 'thing', 'cup', 'version', 'rate', 'everi', 'summer', 'creat', 'howard', 'via', 'rock', 'seat', 'charl', 'zealand', 'file', 'product', 'due', 'happen', 'spent', 'measur', 'great', 'make', 'moment', '14', 'model', 'lot', 'old', 'estim', 'phone', 'address', 'shown', 'million', 'car', 'qualiti', 'final', 'handset', 'among', 'feel', 'sharehold', 'eight', 'action', 'begin', 'target', 'complet', 'air', 'futur', 'bill', 'everyon', 'compet', 'success', 'achiev', 'given', 'drive', 'accept', 'websit', 'director', 'rais

**Generate unigrams for each document**

In [241]:
import string

from nltk.corpus import stopwords
from nltk.stem.porter import *


# stemming tool from nltk
stemmer = PorterStemmer()
# a mapping dictionary that help remove punctuations
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
def get_tokens(text):
    # turn document into lowercase
    lowers = text.lower()
    # remove punctuations
    no_punctuation = lowers.translate(remove_punctuation_map)
    # tokenize document
    tokens = nltk.word_tokenize(no_punctuation)
    # remove stop words
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    # stemming process
    stemmed = [stemmer.stem(item) for item in filtered]
    # Filtering unigrams based on the dictionary
    valid_unigrams = [word for word in stemmed if word in dictionary_words]
    return valid_unigrams

In [242]:
# Apply the get_tokens function to every row of the 'Unigrams' column
df['Unigrams'] = df['Text'].apply(lambda x: get_tokens(x))

In [243]:
df.head()

Unnamed: 0,ArticleId,Text,Category,Unigrams
0,1429,sfa awaits report over mikoliunas the scottish...,sport,"[report, scottish, footbal, associ, refere, re..."
1,1896,parmalat to return to stockmarket parmalat th...,business,"[return, compani, went, account, hope, back, s..."
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport,"[arsen, arsen, hit, club, offer, new, contract..."
3,2178,henman decides to quit davis cup tim henman ha...,sport,"[decid, quit, davi, cup, great, britain, davi,..."
4,194,french suitor holds lse meeting european stock...,business,"[french, hold, meet, european, stock, market, ..."


**Visualising Results**

In [244]:
# Assuming 'df' is your DataFrame
unigrams_value = df.loc[4, 'Unigrams']

# Print or work with the accessed value
print(unigrams_value)

['french', 'hold', 'meet', 'european', 'stock', 'market', 'met', 'london', 'stock', 'exchang', 'specul', 'may', 'launch', 'cash', 'bid', 'chief', 'held', 'talk', 'boss', 'day', 'rival', 'put', 'forward', 'bid', 'case', 'german', 'exchang', 'said', 'held', 'talk', 'declin', 'comment', 'talk', 'end', 'friday', 'specul', 'german', 'may', 'rais', 'bid', 'offer', 'reject', 'place', 'fund', 'cash', 'bid', 'far', 'howev', 'bid', 'deal', 'either', 'would', 'creat', 'biggest', 'stock', 'market', 'oper', 'europ', 'second', 'biggest', 'world', 'new', 'york', 'stock', 'exchang', 'specul', 'would', 'use', 'friday', 'meet', 'opportun', 'take', 'grow', 'plan', 'domin', 'london', 'market', 'union', 'fear', 'job', 'would', 'move', 'london', 'success', 'work', 'council', 'concern', 'trade', 'could', 'manag', 'london', 'futur', 'news', 'agenc', 'report', 'union', 'sourc', 'say', 'german', 'also', 'said', 'market', 'oper', 'promis', 'move', 'london', 'bid', 'success', 'meanwhil', 'sharehold', 'fear', 'con

In [245]:
# Assuming 'df' is your DataFrame
unigrams_value = df.loc[4, 'Text']

# Print or work with the accessed value
print(unigrams_value)

french suitor holds lse meeting european stock market euronext has met with the london stock exchange (lse) amid speculation that it may be ready to launch a cash bid.  euronext chief jean-francois theodore held talks with lse boss clara furse the day after rival deutsche boerse put forward its own bid case. the german exchange said it had held  constructive  professional and friendly  talks with the lse. but euronext declined to comment after the talks ended on friday. speculation is mounting that the germans may raise their bid to £1.5bn. deutsche boerse previously offered £1.3bn  which was rejected by the lse  while euronext is rumoured to have facilities in place to fund a £1.4bn cash bid. so far  however  neither have tabled a formal bid. but a deal with either bidder would create the biggest stock market operator in europe and the second biggest in the world after the new york stock exchange.  there was speculation euronext would use friday s meeting as an opportunity to take adv

In [246]:
# Assuming 'df' is your DataFrame
unigrams_value = df.loc[4, 'Unigrams']

# Print or work with the accessed value
print(unigrams_value)

['french', 'hold', 'meet', 'european', 'stock', 'market', 'met', 'london', 'stock', 'exchang', 'specul', 'may', 'launch', 'cash', 'bid', 'chief', 'held', 'talk', 'boss', 'day', 'rival', 'put', 'forward', 'bid', 'case', 'german', 'exchang', 'said', 'held', 'talk', 'declin', 'comment', 'talk', 'end', 'friday', 'specul', 'german', 'may', 'rais', 'bid', 'offer', 'reject', 'place', 'fund', 'cash', 'bid', 'far', 'howev', 'bid', 'deal', 'either', 'would', 'creat', 'biggest', 'stock', 'market', 'oper', 'europ', 'second', 'biggest', 'world', 'new', 'york', 'stock', 'exchang', 'specul', 'would', 'use', 'friday', 'meet', 'opportun', 'take', 'grow', 'plan', 'domin', 'london', 'market', 'union', 'fear', 'job', 'would', 'move', 'london', 'success', 'work', 'council', 'concern', 'trade', 'could', 'manag', 'london', 'futur', 'news', 'agenc', 'report', 'union', 'sourc', 'say', 'german', 'also', 'said', 'market', 'oper', 'promis', 'move', 'london', 'bid', 'success', 'meanwhil', 'sharehold', 'fear', 'con

**Calculating TFIDF**

In [250]:
def calculating_idf_per_words(df, words):
    ##Mapping of word with IDF scores.
    idf = {}

    ##Freq of each word by document.
    for word in words:
        idf[word] = 0
        for unigrams in df['Unigrams']:
            if word in unigrams:
                idf[word]+=1
            else:
                continue

    ### Converting freq to idf.
    for word in idf.keys():
        try:
            idf[word] = np.log(df.shape[0]/idf[word])
        except:
            print(word)

    return idf

word_idf = calculating_idf_per_words(df, dictionary_words)
# word_idf

In [251]:
def calculate_word_freq(unigrams):
    ##Mapping of each unique term with
    tf = {}
    unique_words = list(set(unigrams))
    for word in unique_words:
        tf[word] = unigrams.count(word)

    maxvalue = max(tf.values())
    for k in tf.keys():
        tf[k] /= maxvalue

    return tf

In [252]:
### Adding tf column to df
df['tf'] = df.Unigrams.apply(calculate_word_freq)

In [256]:
### Creating a matrix of tfidf scores
def calculate_tfidf():
    tfidf_scores = np.zeros((1000,1000))
    for j, word in enumerate(dictionary_words):
        for i, tf_dict in enumerate(df['tf']):
                tfidf_scores[i][j] = tf_dict.get(word, 0)*word_idf.get(word, 0)

    return tfidf_scores

tfidf = calculate_tfidf()

In [257]:
### Save Tfidf scores in txt file
np.savetxt("matrix.txt", tfidf, delimiter=",")

**Calculate for each category top 3 most frequest terms and top 3 highest tfidf scores**

In [258]:
def get_term_freq(unigrams):
    ##calculating term freq
    tf = {}
    unique_words = list(set(unigrams))
    for word in unique_words:
        tf[word] = unigrams.count(word)
    return tf
df['term_freq'] = df.Unigrams.apply(get_term_freq)

def get_top_frequest_words():

    freq_terms = {key:{} for key in df.Category.unique()}

    for key in freq_terms: #For each category

        merged_term_freq = {}
        df_key = df[df.Category == key] #Fetching records of that category.

        for term_freq in df_key['term_freq']: #term freq of unigrams in the record category
            for word, freq in term_freq.items():
                if word in merged_term_freq:
                    merged_term_freq[word]+=freq
                else:
                    merged_term_freq[word] = freq

        top_3_freq = sorted(list(merged_term_freq.items()), key=lambda x: x[1], reverse=True)[:3]
        freq_terms[key] = dict(top_3_freq)
    return freq_terms

top_freq_by_category = get_top_frequest_words()

In [259]:
import json
with open('frequency.json', 'w') as json_file:
    json.dump(top_freq_by_category, json_file)

**Calculating Avg TFIDF**

In [263]:
def get_top_avg_tfidf():

    avg_tfidf = {k:{} for k in df.Category.unique()}
    for category in df.Category.unique():
        rows = df[df.Category == category].index
        avg = np.mean(tfidf[rows], axis = 0)
        word_score_mapping = [(word, avg[i]) for i, word in enumerate(dictionary_words)]
        avg_tfidf[category] = dict(sorted(word_score_mapping, key=lambda x: x[1], reverse=True)[:3])

    return avg_tfidf


top_avg_tfidf = get_top_avg_tfidf()

In [264]:
with open('scores.json', 'w') as json_file:
    json.dump(top_avg_tfidf, json_file)