In [29]:
import string
from collections import OrderedDict
from scipy.sparse import csr_matrix
from collections import Counter

def add_words(text, unique_words):
    """
    This function strips out the punctuations within an article, and add
    words to the provided ORDERED dictionary if they are not already in there. 
    The values of the dictionary is ignored.
    """
    text = str(text).encode("ascii", errors="ignore").decode()              # removing non-ascii characters
    text = text.translate(str.maketrans('', '', string.punctuation))        # removing punctuations
    text = text.lower()                                                     # changing all to lowercase
    for word in text.split():                                               # use `split()` without arguments for multiple spaces
        if len(word) > 1:
            unique_words.add(word)

def count_words(index, text, row, col, val, vocab):
    """
    This function counts the words in an article and append the values
    according to the sparse matrix format.
    """
    text = str(text).encode("ascii", errors="ignore").decode()              # removing non-ascii characters
    text = text.translate(str.maketrans('', '', string.punctuation))        # removing punctuations
    text = text.lower()                                                     # changing all to lowercase
    counter = {}
    for word in text.split():                                               # use `split()` without arguments for multiple spaces
        if len(word) > 1:
            counter[word] = counter[word] + 1 if word in counter else 0
    for word, count in counter.items():
        if len(word) > 1:
            row.append(index)
            col.append(vocab[word])
            val.append(count)

In [30]:
# Use pandas to load the data into a DataFrame, which is displayed below.

import pandas as pd
df = pd.read_csv("./datasets/train.csv")
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [31]:
authors = set(list(df['author']))
print(len(authors))

# Combining title, author and the text to a single field "full"
df['full'] = df['title'] + ' ' + df['author'] + ' ' + df['text']
# df['full'] = df['title'] + ' ' + df['text']

# Printing out an example
str(df.loc[20795,:]['full']) \
    .encode("ascii", errors="ignore").decode() \
    .translate(str.maketrans('', '', string.punctuation)) \
    .lower()

4202


'rapper ti trump a poster child for white supremacy jerome hudson rapper t i unloaded on black celebrities who met with donald trump after the election saying they failed to challenge the president for disrespecting and degrading black voters during the campaign the atlanta   based artist told the   of the view thursday that he took issue with talk show host steve harvey   kanye west and football hall of famer and civil rights champion jim brown meeting with trump  before you stand and smile and say this is a good man and take pictures what about addressing the disrespect and disregard for our community that was done t i said adding and what about him being the poster child for white supremacy and standing for the people who look to devalue our lives the   star of the vh1 reality show t i  tiny the family hustle also defended fellow rapper snoop dogg who recently starred in a   music video that sees the doggystyle rapper pull a gun on and shoot a parody clown version of president donal

In [32]:
# Testing with add_words() and the example article from above
word_dict_test = set()

add_words(df.loc[20795,:]['full'], word_dict_test)
print(len(word_dict_test))
print(word_dict_test)

182
{'skin', 'is', 'calling', 'video', 'dogg', 'say', 'was', 'sees', 'degrading', 'steve', 'manigault', 'look', 'responded', 'donald', 'personal', 'met', 'his', 'told', 'white', 'west', 'version', 'good', 'trump', 'he', 'between', 'failed', 'unloaded', 'voters', 'fact', 'based', 'have', 'even', 'scrotum', 'for', 'by', 'also', 'wig', 'reality', 'vh1', 'her', 'doggystyle', 'devalue', 'rapper', 'protect', 'pull', 'saying', 'disrespect', 'before', 'protected', 'talk', 'take', 'ti', 'jeromeehudson', 'heroes', 'lives', 'meeting', 'challenge', 'kanye', 'pictures', 'follow', 'hes', 'insult', 'parody', 'fellow', 'gap', 'and', 'atlanta', 'celebrities', 'on', 'standing', 'muskrat', 'message', 'supremacy', 'adding', 'the', 'it', 'whatever', 'view', 'people', 'after', 'family', 'tanned', 'him', 'star', 'done', 'which', 'intention', 'our', 'wearing', 'constitution', 'who', 'sunny', 'with', 'cant', 'has', 'man', 'disrespecting', 'sic', 'brown', 'advisor', 'tiny', 'omarosa', 'about', 'we', 'hudson', '

In [33]:
# add_words()

unique_words = set()
for text in df['full']:
    add_words(text, unique_words)

vocab = {}
for index, word in enumerate(sorted(unique_words)):
    vocab[word] = index
print('Done adding words')

Done adding words


In [34]:
# Printing out the length and a sample of words from the dictionary.

print(len(vocab))
print(list(vocab.items())[150000:150020])

176874
[('stapleton', 150000), ('stapling', 150001), ('star', 150002), ('staraj', 150003), ('starboard', 150004), ('starboards', 150005), ('starbucks', 150006), ('starcaps', 150007), ('starch', 150008), ('starched', 150009), ('starches', 150010), ('starchild', 150011), ('starchitect', 150012), ('starchitects', 150013), ('starchy', 150014), ('stardom', 150015), ('stardust', 150016), ('stare', 150017), ('stared', 150018), ('starej', 150019)]


In [35]:
# count_words
X = []
Y = list(df['label'])

row = []
col = []
val = []
for index, text in enumerate(df['full']):
    count_words(index, text, row, col, val, vocab)
print('Done counting words')

X = csr_matrix((val, (row, col)), shape=(len(df), len(vocab)))
print(X)

Done counting words
  (0, 2672)	0
  (0, 2721)	1
  (0, 3449)	0
  (0, 3638)	0
  (0, 3742)	0
  (0, 5904)	0
  (0, 7074)	0
  (0, 7167)	6
  (0, 7394)	0
  (0, 7612)	2
  (0, 7997)	0
  (0, 8091)	0
  (0, 8675)	0
  (0, 9075)	4
  (0, 9191)	0
  (0, 9583)	5
  (0, 10786)	1
  (0, 11316)	2
  (0, 11388)	1
  (0, 11904)	0
  (0, 12292)	4
  (0, 12458)	0
  (0, 12480)	14
  (0, 13020)	0
  (0, 13176)	0
  :	:
  (20799, 173348)	0
  (20799, 173409)	0
  (20799, 173422)	0
  (20799, 173440)	0
  (20799, 173495)	4
  (20799, 173503)	0
  (20799, 173504)	0
  (20799, 173591)	0
  (20799, 173664)	0
  (20799, 173670)	0
  (20799, 173717)	0
  (20799, 173944)	0
  (20799, 173976)	0
  (20799, 174002)	0
  (20799, 174015)	0
  (20799, 174031)	0
  (20799, 174171)	0
  (20799, 174172)	0
  (20799, 174176)	0
  (20799, 174237)	0
  (20799, 174846)	0
  (20799, 174875)	2
  (20799, 175349)	0
  (20799, 175378)	0
  (20799, 175409)	1


In [36]:
# testing

list_1 = [1, 2, 3, 5, 7]
list_2 = ['a', 'c', 'd', 'e', 'f']

print(type( list(zip(list_1, list_2))[0] ))

test_df = pd.DataFrame(list(zip(list_1, list_2)))
test_df

print(type(string.punctuation))
print('’' in string.printable)
print(ord(string.printable[-3]))

"didn’'t".encode("ascii", errors="ignore").decode()

<class 'tuple'>
<class 'str'>
False
13


"didn't"