In [2]:
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm # for showing progress

import time

In [25]:
# initializing dataset
data = pd.read_csv("training.csv")

In [9]:
#preliminary analysis


def sort_dict(d):
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse = True)}

def add_col(df, name):
    # add a new column with zeroes
    df[name] = pd.Series(0, index=df.index)
    return df

def top_entries(d, n):
    # return n top entries in dictionary
    d = sort_dict(d)
    return list(d.items())[:20]

def normalize_occurences(word_occurences, articles):
    # convert total occurences to percentage occurence
    return {key : round(value / articles.shape[0], 2) for (key, value) in word_occurences.items()}

def get_top_words(data, topic, n):
    articles = data[data['topic'] == topic]
    
    word_count = {} #map each word to how often it appears in the topic's articles
    
    word_occurences = {} # map each word to how many articles it appears in
    
    unique_words = set()
    
    for i in articles.index:
        words = data['article_words'][i].split(',')
        
        for word in words:
            word_count.setdefault(word, 0)
            word_count[word] += 1
            
        unique_words = set(words)
        unique_words = unique_words.union(unique_words)
        
        for word in unique_words:
            word_occurences.setdefault(word, 0)
            word_occurences[word] += 1
    
    word_occurences = normalize_occurences(word_occurences, articles)
            
    return top_entries(word_count, n), top_entries(word_occurences, n), unique_words

topics = set(data.topic.values)

all_unique_words = set()

# getting the top words by topic
for topic in topics:
    top_words_by_count, top_words_by_occurences, unique_words = get_top_words(data, topic, 20)
    
    all_unique_words = all_unique_words.union(unique_words)
    
    print()
    print(topic)
    print("top word counts")
    print(top_words_by_count)

    print()
    print("top occurences")
    print(top_words_by_occurences)
    print()



print("total words:", len(all_unique_words))




BIOGRAPHIES PERSONALITIES PEOPLE
top word counts
[('year', 249), ('presid', 198), ('yeltsin', 140), ('stat', 133), ('minist', 126), ('hospit', 124), ('offic', 122), ('lead', 121), ('told', 117), ('report', 113), ('work', 95), ('time', 95), ('fami', 93), ('peopl', 91), ('day', 91), ('polic', 91), ('polit', 90), ('doct', 86), ('party', 85), ('world', 83)]

top occurences
[('year', 0.73), ('told', 0.49), ('stat', 0.45), ('offic', 0.42), ('lead', 0.41), ('presid', 0.38), ('day', 0.35), ('time', 0.35), ('early', 0.34), ('report', 0.34), ('work', 0.34), ('world', 0.34), ('month', 0.31), ('wednesday', 0.31), ('hospit', 0.31), ('countr', 0.3), ('fami', 0.29), ('thursday', 0.29), ('peopl', 0.28), ('minist', 0.28)]


DEFENCE
top word counts
[('nato', 511), ('stat', 410), ('milit', 406), ('forc', 381), ('minist', 364), ('defend', 357), ('offic', 317), ('russia', 299), ('presid', 238), ('plan', 235), ('year', 233), ('told', 230), ('countr', 228), ('govern', 224), ('arm', 221), ('troop', 213), ('c

In [27]:
#pre-processing

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def shuffle(data):
    # reorder the data randomly, Google recommends this as a best practice
    return data.sample(frac=1).reset_index(drop=True)

def vectorize(data):
    v = TfidfVectorizer(
            preprocessor = lambda x: x, #the preprocessor is set to be the identity function (it does nothing)
            tokenizer = lambda x: x.split(','), #the tokenizer (which converts a string into individual words) splits a string at ','
            ngram_range = NGRAM_RANGE,
            token_mode = TOKEN_MODE
            min_df - MIN_DOCUMENT_FREQUENCY) # we decide to use unigrams and bigrams as the google guide suggests
    x_train =  v.fit_transform(data['article_words'])
    
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train)

# data = add_spaces(data)
def select_best_features(data):
    
    

data = shuffle(data)    
x_train = vectorize(data)
data = select_best_features(data)
data.head()

Unnamed: 0,article_number,article_words,topic,vector
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS,"(0, 293465)\t0.10372522328662358\n (0, 2111..."
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS,"(0, 293465)\t0.10372522328662358\n (0, 2111..."
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS,"(0, 293465)\t0.10372522328662358\n (0, 2111..."
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS,"(0, 293465)\t0.10372522328662358\n (0, 2111..."
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT,"(0, 293465)\t0.10372522328662358\n (0, 2111..."


In [2]:
print(data.head())
print(pd.get_dummies(data, columns=['topic']).head())

NameError: name 'data' is not defined

In [None]:

def tokenize(data):
    #convert comma-seperated string into indidual words aka tokens
    data['tokens'] = data['article_words'].map(lambda str: str.split(','))
    return data

def add_n_grams(data, n):
    # create a new column of n_grams of custom length
    # n_grams are a series of adjacent words of length n
    # e.g: The three grams of the sentence 'The mouse ran up the clock' are
    # 'The mouse ran', 'mouse ran up', 'ran up the', and 'up the clock'
    data[str(n) + "_grams"] = data['tokens'].map(lambda tokens: n_grams(tokens, n))
    return data
    
def n_grams(tokens, n):
    # get n grams from a list of tokens
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens) + 1 - n)]

def vectorize_n_gram(data, n_gram_name):
    
    
    return data

def to_vector(n_grams, codes):
    pass

def vectorize(data):
    tf = TfidfTransformer()
    print(tf.fit_transform(data['text']))
    return data

def gen_code_map(n_grams):
    codes = {}
    
    for i in range(n_grams.shape[0]):
        for n_gram in n_grams[i]:
            if n_gram not in codes:
                codes[n_gram] = len(codes)
    
    return codes
