In [78]:
import string
from collections import OrderedDict

def add_words(text, word_dict_empty):
    """
    This function strips out the punctuations within an article, and add
    words to the provided ORDERED dictionary if they are not already in there. 
    The values of the dictionary is ignored.
    """
    text = str(text).encode("ascii", errors="ignore").decode()              # removing non-ascii characters
    text = text.translate(str.maketrans('', '', string.punctuation))        # removing punctuations
    text = text.lower()                                                     # changing all to lowercase
    for word in text.split():                                               # use `split()` without arguments for multiple spaces
        word_dict_empty[word] = 0

def count_words(text, word_dict_empty):
    """
    This function counts the words in an article and returns a list of values 
    corresponding to the counts of word appeared in the article.
    """
    word_dict = word_dict_empty.copy()
    text = str(text).encode("ascii", errors="ignore").decode()              # removing non-ascii characters
    text = text.translate(str.maketrans('', '', string.punctuation))        # removing punctuations
    text = text.lower()                                                     # changing all to lowercase
    for word in text.split():                                               # use `split()` without arguments for multiple spaces
        word_dict[word] = word_dict[word] + 1
    return list(word_dict.values())

In [79]:
import pandas as pd
df = pd.read_csv("./Dataset/Kaggle/train.csv")
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [80]:
authors = set(list(df['author']))
print(len(authors))

df['full'] = df['title'] + ' ' + df['author'] + ' ' + df['text']
# df['full'] = df['title'] + ' ' + df['text']
str(df.loc[20795,:]['full']) \
    .encode("ascii", errors="ignore").decode() \
    .translate(str.maketrans('', '', string.punctuation)) \
    .lower()

4202


'rapper ti trump a poster child for white supremacy jerome hudson rapper t i unloaded on black celebrities who met with donald trump after the election saying they failed to challenge the president for disrespecting and degrading black voters during the campaign the atlanta   based artist told the   of the view thursday that he took issue with talk show host steve harvey   kanye west and football hall of famer and civil rights champion jim brown meeting with trump  before you stand and smile and say this is a good man and take pictures what about addressing the disrespect and disregard for our community that was done t i said adding and what about him being the poster child for white supremacy and standing for the people who look to devalue our lives the   star of the vh1 reality show t i  tiny the family hustle also defended fellow rapper snoop dogg who recently starred in a   music video that sees the doggystyle rapper pull a gun on and shoot a parody clown version of president donal

In [81]:
# testing
word_dict_test = OrderedDict()

add_words(df.loc[20795,:]['full'], word_dict_test)
print(len(word_dict_test))
print(word_dict_test)

185
OrderedDict([('rapper', 0), ('ti', 0), ('trump', 0), ('a', 0), ('poster', 0), ('child', 0), ('for', 0), ('white', 0), ('supremacy', 0), ('jerome', 0), ('hudson', 0), ('t', 0), ('i', 0), ('unloaded', 0), ('on', 0), ('black', 0), ('celebrities', 0), ('who', 0), ('met', 0), ('with', 0), ('donald', 0), ('after', 0), ('the', 0), ('election', 0), ('saying', 0), ('they', 0), ('failed', 0), ('to', 0), ('challenge', 0), ('president', 0), ('disrespecting', 0), ('and', 0), ('degrading', 0), ('voters', 0), ('during', 0), ('campaign', 0), ('atlanta', 0), ('based', 0), ('artist', 0), ('told', 0), ('of', 0), ('view', 0), ('thursday', 0), ('that', 0), ('he', 0), ('took', 0), ('issue', 0), ('talk', 0), ('show', 0), ('host', 0), ('steve', 0), ('harvey', 0), ('kanye', 0), ('west', 0), ('football', 0), ('hall', 0), ('famer', 0), ('civil', 0), ('rights', 0), ('champion', 0), ('jim', 0), ('brown', 0), ('meeting', 0), ('before', 0), ('you', 0), ('stand', 0), ('smile', 0), ('say', 0), ('this', 0), ('is', 

In [82]:
word_dict_empty = OrderedDict({})

for text in df['full']:
    add_words(text, word_dict_empty)
print('Done adding words')


Done adding words


In [83]:
print(len(word_dict_empty))
print(list(word_dict_empty.keys())[5000:5020])

176910
['developed', 'rapport', 'positive', 'nave', 'irresponsible', 'hopelessly', 'unprepared', 'experiences', 'leaned', 'gesturing', 'hands', 'laughing', 'easily', 'displays', 'warmth', 'revealing', 'singled', 'copenhagen', 'collapse', 'proc']


In [84]:
X = []
Y = list(df['label'])

for text in list(df['full'])[:5]:
    X.append(count_words(text, word_dict_empty))
print('Done counting words')

X_df = pd.DataFrame(X)
X_df

Done counting words


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,176900,176901,176902,176903,176904,176905,176906,176907,176908,176909
0,7,3,6,7,7,5,4,4,8,4,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
print(type(X[0]))
print(type(X[0][0]))

<class 'list'>
<class 'int'>


In [86]:
# testing

list_1 = [1, 2, 3, 5, 7]
list_2 = ['a', 'c', 'd', 'e', 'f']

print(type( list(zip(list_1, list_2))[0] ))

test_df = pd.DataFrame(list(zip(list_1, list_2)))
test_df

print(type(string.punctuation))
print('’' in string.printable)
print(ord(string.printable[-3]))

"didn’'t".encode("ascii", errors="ignore").decode()

<class 'tuple'>
<class 'str'>
False
13


"didn't"