In [34]:
from collections import Counter

import pandas as pd
import regex as re
from stop_words import get_stop_words
import numpy as np
from sklearn.naive_bayes import MultinomialNB

stop_words = get_stop_words('english')

In [35]:
training_data_filepath = 'data/train.csv'
test_data_filepath = 'data/test.csv'
train_dataframe = pd.read_csv(training_data_filepath)
test_dataframe = pd.read_csv(test_data_filepath)
print(train_dataframe.shape, test_dataframe.shape)

(4356, 7) (484, 6)


In [36]:
#print(train_dataframe['title'])
print(train_dataframe.iloc[0])
a = re.findall(r'\w+', train_dataframe.iloc[0]['title'])
print(a)


article_id                                                    1
title         Forex - Pound drops to one-month lows against ...
url           http://www.nasdaq.com/article/forex-pound-drop...
publisher                                                NASDAQ
hostname                                         www.nasdaq.com
timestamp                                              1.39e+12
category                                                      4
Name: 0, dtype: object
['Forex', 'Pound', 'drops', 'to', 'one', 'month', 'lows', 'against', 'euro']


In [37]:

X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])

clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
print(clf.predict(X[2:3]))


[3]


In [38]:
def process_title(train_dataframe):
    # TODO: remove stop words
    # TODO: Normalize
    # Add number feature
    vocabulary_list = []
    for i in range(train_dataframe.shape[0]):
        a = re.findall(r'[A-Za-z]+', train_dataframe.iloc[i]['title'])
        a = [item.lower() for item in a if (len(item) > 1 and item.lower() not in stop_words)]
        vocabulary_list += a
    vocabulary_list = list(set(vocabulary_list))
    return vocabulary_list

title_vocabulary_list = process_title(train_dataframe)
print(len(title_vocabulary_list))
print(title_vocabulary_list[:100])

4547
['sort', 'evil', 'appeal', 'highway', 'abercrombie', 'suvs', 'bet', 'british', 'wsj', 'billion', 'saw', 'traders', 'tanks', 'relief', 'cabinet', 'buyout', 'upper', 'detected', 'smit', 'recalls', 'temporary', 'respiratory', 'hispanics', 'based', 'couple', 'ousts', 'ppaca', 'zero', 'greets', 'enforcement', 'inaccuracies', 'commerce', 'bromund', 'gun', 'actiev', 'toyota', 'last', 'majority', 'lulac', 'downs', 'packaged', 'planes', 'justice', 'fielded', 'ones', 'defaults', 'spending', 'roll', 'fade', 'interline', 'snub', 'cold', 'takeover', 'core', 'mutual', 'suddenly', 'seller', 'execute', 'loses', 'exec', 'send', 'kingston', 'latest', 'birthday', 'outfitters', 'dublin', 'tux', 'liability', 'ceo', 'better', 'cooperation', 'libyan', 'due', 'congress', 'made', 'right', 'choppy', 'boosting', 'cheese', 'hukou', 'win', 'hit', 'rental', 'words', 'eurosceptics', 'brent', 'guys', 'karpeles', 'back', 'bmo', 'unknown', 'signing', 'outpace', 'thumbs', 'increases', 'quiz', 'unlocks', 'plosser', 

In [39]:
def process_publisher(train_dataframe):
    vocabulary_list = []
    for i in range(train_dataframe.shape[0]):
        vocabulary_list.append(train_dataframe.iloc[i]['publisher'])
    vocabulary_list = list(set(vocabulary_list))
    return vocabulary_list

publisher_vocabulary_list = process_publisher(train_dataframe)
print(len(publisher_vocabulary_list))

1368


In [54]:
def vectorize_instance(instance, title_vocabulary_list, publisher_vocabulary_list):
    # TODO: make it compatible for all cases, now only for title + publisher?
    output_array = np.zeros((1, (len(title_vocabulary_list) + len(publisher_vocabulary_list))))
    
    a = re.findall(r'[A-Za-z]+', instance['title'])
    a = [item.lower() for item in a if (len(item) > 1 and item.lower() not in stop_words and item in title_vocabulary_list)]
    title_counter = Counter(a)
    for item in title_counter:
        output_array[0, title_vocabulary_list.index(item)] = title_counter[item]
    
    current_publisher = instance['publisher']
    if current_publisher in publisher_vocabulary_list:
        output_array[0, len(title_vocabulary_list) + publisher_vocabulary_list.index(current_publisher)] = 1
    return output_array

In [55]:
def vectorize_dataset(dataframe, title_vocabulary_list, publisher_vocabulary_list):
    # TODO: make it compatible for all cases, now only for title + publisher?
    output_array = np.zeros((dataframe.shape[0], (len(title_vocabulary_list) + len(publisher_vocabulary_list))))
    for i in range(dataframe.shape[0]):
        instance = dataframe.iloc[i]
        output_array[i, :] = vectorize_instance(instance, title_vocabulary_list, publisher_vocabulary_list)

    return output_array

In [56]:
train_Y = train_dataframe['category']
print(train_Y.shape)
train_X = vectorize_dataset(train_dataframe, title_vocabulary_list, publisher_vocabulary_list)
print(train_X.shape)

(4356,)
(4356, 5915)


In [59]:
test_Y = test_dataframe['category']
print(test_Y.shape)
test_X = vectorize_dataset(test_dataframe, title_vocabulary_list, publisher_vocabulary_list)
print(test_X.shape)

(484, 5915)
