In [3]:
import os
import sys
import string
import argparse
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabrielramonlorenzana/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stopWords = set(stopwords.words('english'))

In [5]:


def main():
    
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-f','--filepath',dest='filepath',metavar='file path',help='Path to text input file to be analysed.', required=True)
    parser.add_argument('-n','--number',dest='number',metavar='number',help='Most frequent n words will be displayed and plotted.', required=False, default=100, type=int)
    args = parser.parse_args()
    
    # Path to text file to analyse
    rawfilepath = args.filepath
    
    # Print a histogram containing the top N words, and print them and their counts.
    top_n = args.number
    
    # Load the file
    filepath = os.path.normpath(os.path.join(rawfilepath))
    file = open(filepath, 'r')
    
    # Parse as a list, removing lines
    content_sublists = [line.split(',') for line in file.readlines()]
    
    # Parse into a single list (from a list of lists)
    content_list = [item for sublist in content_sublists for item in sublist]
    
    # Remove whitespace so we can concatenate appropriately, and unify case
    content_list_strip = [str.strip().lower() for str in content_list]
    
    # Concatenate strings into a single string
    content_concat = ' '.join(content_list_strip)
    
    # Remove punctuation and new lines
    punct = set(string.punctuation)
    unpunct_content = ''.join(x for x in content_concat if x not in punct)
    
    # Split string into list of strings, again
    word_list = unpunct_content.split()
    
    # Perform count
    counts_all = Counter(word_list)
    
    words, count_values = zip(*counts_all.items())
    
    # Sort both lists by frequency in values (Schwartzian transform) - thanks, http://stackoverflow.com/questions/9543211/sorting-a-list-in-python-using-the-result-from-sorting-another-list
    values_sorted, words_sorted = zip(*sorted(zip(count_values, words), key=operator.itemgetter(0), reverse=True))
    
    # Top N
    words_sorted_top = words_sorted[0:top_n]
    values_sorted_top = values_sorted[0:top_n]
    
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print("{0} unique words identified in the text file, {1}".format(len(values_sorted), filepath))
    print("The top {0} words are: \n{1}".format(top_n, words_sorted_top))
    print("... their respective frequencies: \n{0}".format(values_sorted_top))
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    # Pandas DataFrame just for visualisation
    df = pd.DataFrame({'count': values_sorted_top, 'word': words_sorted_top})
    print("{0}".format(df))
    sys.stdout.flush()
    
    # Histogram
    
    # Make xticklabels comprehensible by matplotlib
    xticklabels = unicode(list(words_sorted_top)).split()
    # Remove the single quotes, commas and enclosing square brackets
    xtlabs = [xstr.replace("'","").replace(",","").replace("]","").replace("[","") for xstr in xticklabels]
    
    indices = np.arange(len(words_sorted_top))
    width = 1
    fig = plt.figure()
    fig.suptitle('Word frequency histogram, top {0}'.format(top_n), fontsize=16)
    plt.xlabel('word', fontsize=12)
    plt.ylabel('count', fontsize=12)
    plt.bar(indices, values_sorted_top, width)
    plt.xticks(indices + width * 0.5, xtlabs, rotation='vertical', fontsize=8)
    plt.show()
    
if __name__ == '__main__':
    main()

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
19 unique words identified in the text file, /Users/gabrielramonlorenzana/Library/Jupyter/runtime/kernel-bd0b8f1f-9a81-42e9-8eb6-06bce6a28c59.json
The top 100 words are: 
('shellport', '49188', 'iopubport', '49190', 'stdinport', '49191', 'controlport', '49192', 'hbport', '49193', 'ip', '127001', 'key', 'f12af119b47c49bf897e11e212d3064d', 'transport', 'tcp', 'signaturescheme', 'hmacsha256', 'kernelname')
... their respective frequencies: 
(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    count                              word
0       1                         shellport
1       1                             49188
2       1                         iopubport
3       1                             49190
4       1                         stdinport
5       1                             49191
6       1                       controlport
7       1       

NameError: name 'unicode' is not defined

In [11]:
df = pd.read_csv('reliable.csv')
dfx

Unnamed: 0,Title,Text,Author,Source,Label
0,A Candidate Spurned,By the time they touched down in New Hampshire...,By MATT BAI,The New York Times,Reliable
1,Will China Trigger the Next Global Recession?,Will China Trigger the Next Global Recession? ...,Jing Jin,Kaggle,Reliable
2,The Results Are In: This Is the Best Sex Toy f...,PDF \nBy Jordan Gray | jordangrayconsulting.co...,,Kaggle,Reliable
3,The De Facto US/Al Qaeda Alliance : Informa...,The De Facto US/Al Qaeda Alliance\nExclusive:...,Robert Parry,Kaggle,Reliable
4,Obama Furious After Fed-Up ‘Deplorables’ Drop ...,Obama Furious After Fed-Up ‘Deplorables’ Drop ...,Amanda Shea,Kaggle,Reliable
5,Siri Can Open Smart Locks and Let Neighbors Wa...,Apple HomeKit home security security smart house,IT Ninja,Kaggle,Reliable
6,Hillary To Be Arrested?,"October 29, 2016 \nWrites Graham Dugas: \nCome...",Mike Rivero,Kaggle,Reliable
7,Saudi Arabia announces date certain for the ap...,"Email Saudi Arabia announces the VAT Act, whic...",,Kaggle,Reliable
8,Baking Soda & Coconut Oil Can Kill Cancer: Eye...,Humans Are Free Baking Soda & Coconut Oil Can ...,stevew,Kaggle,Reliable
9,"Anti-Trump protests are paid and staged, Craig...","November 12, 2016 348 Ads on Craigslist reveal...",Sergey Gladysh,Kaggle,Reliable


"'Men Are Not Oppressed,' Says Woman Who Has No Idea What It Like To Take Two Whole Escalators To Get To Your Clothi… https://t.co/MN9kpa5B4P"