In [1]:
import os, sys, re, json, itertools, operator
import pandas as pd, numpy as np, scipy as sp
from joblib import Memory
from bson import json_util
    
%load_ext cythonmagic       # '%' = magic function in ipython

In [2]:
data_directory = os.path.join('data')
output_directory = os.path.join(os.path.expanduser("~"), 'work', 'stockmeme', 'news_analysis')

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

joblib_cache = Memory(cachedir=os.path.join(output_directory, 'joblib'), verbose=0)

pd.set_option('notebook_repr_html', True)
pd.set_option('precision', 4)
pd.set_option('max_columns', 500)
pd.set_option('max_rows', 50)
pd.set_option('max_colwidth', 500)
pd.set_option('column_space', 100)
pd.set_option('use_inf_as_null', True)

#np.set_printoptions(linewidth=200, precision=15, suppress=True)

In [4]:
with open( os.path.join(data_directory, 'bloomberg.json') ) as f:
    bloomberg = json_util.loads(f.read())   #bloomberg = large dictionary (b/c json format)

df_bloomberg = pd.DataFrame(bloomberg)      #pandas can take json data & build data pandas 
df_bloomberg = df_bloomberg.drop('_id', axis=1); #drop 'name of colmumn'; axis = 1 specifies that '_id" is a column not a row (1 = row)
df_bloomberg = df_bloomberg.sort('insertion_date', ascending=False)
df_bloomberg = df_bloomberg.reset_index().drop('index', axis=1)
## drop, sort etc are pandas methods -- hence used on pandas obj 
df_bloomberg.head(10)

Unnamed: 0,ID,URL,created_at,feed_URL,insertion_date,text,title
0,MSQ4UR6JIJV801,http://www.bloomberg.com/news/2013-09-06/bank-of-america-settles-merrill-sex-discrimination-case.html,2013-09-07 04:01:01+00:00,http://www.bloomberg.com/news/industries/,2013-09-07 14:26:22.006000+00:00,"[Lawyers representing female financial advisers and licensed\ntrainees filed a request for approval of the settlement\nyesterday in federal court in Brooklyn , New York . The advisers\nalleged in the lawsuit that Merrill discriminated against women\nin those positions in compensation and business opportunities., “This settlement helps ensure that Merrill Lynch is a\nplace where women can thrive and be successful,” Cara E.\nGreene, a lawyer for the plaintiffs, said in a statement.\n“Hopefully...","Bank of America Settles Merrill Sex Discrimination Case Bank of America Corp. will pay $39\nmillion to settle a discrimination lawsuit brought on behalf of\nwomen employees at its Merrill Lynch unit, according to court\nfilings."
1,MSQ6GX0D9L3501,http://www.bloomberg.com/news/2013-09-07/fed-seen-set-to-taper-qe-even-as-payroll-gains-trail-forecasts.html,2013-09-07 04:00:01+00:00,http://www.bloomberg.com/news/markets/,2013-09-07 14:26:08.001000+00:00,"[Payrolls rose by 169,000 last month, a Labor Department\nreport showed yesterday in Washington , and the jobless rate fell\nto 7.3 percent as people left the workforce. Revisions to prior\nreports subtracted a total of 74,000 jobs to payrolls in the\nprevious two months, while hours worked and earnings rose., The jobs report is the last one Fed policy makers will see\nbefore their Sept. 17-18 meeting, when they resume a debate on\nwhen to pare $85 billion in monthly bond purchases. The incr...",Fed Seen Set to Taper QE Even as Payroll Gains Trail Forecasts
2,MSPBA90D9L3501,http://www.bloomberg.com/news/2013-09-07/germany-s-bonds-decline-as-ecb-s-draghi-fails-to-reverse-selloff.html,2013-09-07 06:00:00+00:00,http://www.bloomberg.com/news/markets/,2013-09-07 06:21:52.930000+00:00,"[Austrian, French and Dutch (GNTH10YR) 10-year yields all climbed to\nthe highest in more than a year as most government bond markets\nacross the euro region tumbled along with Treasuries. Draghi\nsaid the ECB “expects the key ECB interest rates to remain at\npresent or lower levels for an extended period of time,” at a\npress conference in Frankfurt on Sept. 5. Reports showed\nmanufacturing in the euro area and China expanded last month., “We’ve seen the European Central Bank ’s not willing...","Germany’s Bonds Decline as ECB’s Draghi Fails to Reverse Selloff Germany’s bonds fell this week,\npushing 10-year yields to a 17-month high, as European Central\nBank President Mario Draghi failed to convince investors\ninterest rates will remain low amid global economic growth."
3,MSPGO21A1I4H01,http://www.bloomberg.com/news/2013-09-07/gilts-decline-in-longest-streak-for-six-years-on-recovery-bets.html,2013-09-07 06:00:00+00:00,http://www.bloomberg.com/news/markets/,2013-09-07 06:21:51.640000+00:00,"[Sterling rose for the first time in three weeks against the\ndollar as a report showed the U.S. economy added fewer jobs in\nAugust than economists projected. It also advanced against the\neuro. Ten-year yields climbed above 3 percent for the first time\nsince July 2011 after the Bank of England’s Monetary Policy\nCommittee left its stimulus program unchanged and kept its\nbenchmark rate at a record low., “An increase in yields appears to be justified by the\nrecent domestic data and events...","Gilts Decline in Longest Streak for Six Years on Recovery Bets Gilts declined for a seventh week,\nthe longest streak in more than six years, as reports showed\nU.K. manufacturing and services industries expanded last month,\nadding to evidence of accelerating growth."
4,MSPYP46S973201,http://www.bloomberg.com/news/2013-09-07/dollar-rises-versus-euro-yen-on-taper-bets-amid-syria-tension.html,2013-09-07 04:00:11+00:00,http://www.bloomberg.com/news/markets/,2013-09-07 04:26:23.808000+00:00,"[The greenback reached a eight-week high versus the 17-nation currency as the prospect for U.S. intervention in Syria increased demand for safer assets. The Australian and New\nZealand dollars gained as China ’s service industry expanded last\nmonth. The U.S. currency pared gains versus the euro and the yen\nyesterday after U.S. employers added fewer workers than forecast\nin August, even as surveys showed economists’ retained Fed taper\nprojections. U.S. retail sales growth quickened last m...","Dollar Rises Versus Euro, Yen on Taper Bets Amid Syria Tension The dollar gained versus the euro\nand yen on speculation the Federal Reserve will trim the pace of\nits $85 billion in monthly bond purchases this month."
5,MSQB4E6JIJVD01,http://www.bloomberg.com/news/2013-09-07/apple-said-to-be-close-to-clinching-carrier-holdouts.html,2013-09-07 00:09:50+00:00,http://www.bloomberg.com/news/industries/,2013-09-07 00:29:55.088000+00:00,"[On the eve of an event on Sept. 10 to unveil new iPhones,\nApple and China Mobile are near a deal for the world’s largest\nwireless carrier to offer the device, said a person with\nknowledge of the matter, who asked not to be identified because\nthe discussions are private. Apple is also close to an agreement\nto distribute its handset through Japan ’s largest mobile\ncarrier, NTT DoCoMo, people with knowledge of the situation have\nsaid., Any final deals will bust loose at least 800 millio...","Apple Said to Be Close to Clinching Carrier Holdouts Apple Inc. is on the verge of\nclinching agreements with two of the biggest holdouts to selling\nits iPhone, China Mobile Ltd. and NTT DoCoMo Inc., opening a\nroute to hundreds of millions of new customers."
6,MSP6SQ6JTSE801,http://www.bloomberg.com/news/2013-09-06/everbright-errant-china-trades-lead-to-85-million-monthly-loss.html,2013-09-06 16:00:00+00:00,http://www.bloomberg.com/news/finance/,2013-09-07 00:07:52.110000+00:00,"[The country’s seventh-largest brokerage by market value\nincurred the unconsolidated loss, which doesn’t include\ncontributions from its subsidiaries, after disposing of assets\nfollowing the Aug. 16 error, Everbright said in a statement to\nthe Shanghai Stock Exchange yesterday. The figure is based on\npreliminary data and hasn’t been audited, the company said., The loss adds to woes at state-controlled Everbright, which\nhas been ordered by the China Securities Regulatory Commission\nto p...",Everbright Errant China Trades Lead to $85 Million Monthly Loss Everbright Securities Co. posted an\nunconsolidated net loss of 523 million yuan ($85 million) last\nmonth after $3.8 billion in erroneous trading orders that roiled\nChina’s equity market and drew a record regulatory penalty.
7,MSQ94Q6JTSEE01,http://www.bloomberg.com/news/2013-09-07/florida-ferris-wheel-deal-prompts-real-estate-sale-suit.html,2013-09-07 00:02:52+00:00,http://www.bloomberg.com/news/industries/,2013-09-07 00:07:44.758000+00:00,"[Lawyers for Great Wheel Beteliligungs GMBH & Co. KG, formed\nto build a giant Ferris wheel in Berlin, contends the developer\nof the defunct U.S. attraction wants to improperly withhold more\nthan $1.2 million from the sale of the site. The German company,\nwhich says it invested $63 million in the Orlando, Florida,\nproject, wants Delaware Chancery Court Judge Travis Laster to\nappoint a receiver to monitor the sale., The sale of the Great Orlando Wheel Corp. ’s assets “would\nresult in a ...","Florida Ferris Wheel Deal Prompts Real Estate Sale Suit An investor in a failed bid to build\na 400-foot-high Ferris wheel in Florida is asking a judge to\nappoint an independent monitor to oversee the sale of the\nattraction’s proposed site, according to an unsealed court\nfiling."
8,MOX13T6K50XT01,http://www.bloomberg.com/news/2013-09-06/smithfield-receives-u-s-regulator-approval-for-shuanghui-deal.html,2013-09-06 23:37:11+00:00,http://www.bloomberg.com/news/markets/,2013-09-06 23:39:58.747000+00:00,"[“This transaction will create a leading global animal\nprotein enterprise,” Zhijun Yang, Chief Executive Officer of\nShuanghui International, said in a joint statement from both\ncompanies released yesterday., The Committee on Foreign Investment in the U.S., or CFIUS,\napproved the transaction and it will be voted on by Smithfield\nshareholders at the company’s annual meeting Sept. 24. The\ngovernment of Ukraine also approved the deal, according to the\nstatement., Shuanghui, based in Hong ...","Smithfield Receives U.S. Approval for Biggest Chinese Takeover Smithfield Foods Inc., the world’s\nlargest hog and pork producer, said U.S. regulators will allow\nthe company to be bought by China’s Shuanghui International\nHoldings Ltd. in what would be the biggest Chinese purchase of a\nU.S. firm."
9,MSQ6KQ6JIJUY01,http://www.bloomberg.com/news/2013-09-06/malkins-reject-empire-state-building-s-buyout-proposals.html,2013-09-06 22:46:52+00:00,http://www.bloomberg.com/news/industries/,2013-09-06 23:07:03.555000+00:00,"[The company reviewed the proposals to purchase control of\nthe Empire State Building or its operating lease positions and\ndetermined that an initial public offering is in investors’ best\ninterest, according to a regulatory filing today. Lazard Ltd.\nacted as the Malkins’ independent adviser., Bidders including Rubin Schron ’s Cammeby’s International\nand Joseph Sitt ’s Thor Equities LLC made unsolicited offers for\nthe Empire State Building as an alternative to the planned IPO.\nMalkin Ho...","Malkins Reject Empire State Building’s Buyout Proposals Malkin Holdings LLC, the supervisor\nof New York’s Empire State Building, is proceeding with a plan\nto form a publicly traded real estate investment trust after\nreceiving buyout offers for the tower."


In [5]:
with open( os.path.join(data_directory, 'reuters.json') ) as f:
    reuters = json_util.loads(f.read())

df_reuters = pd.DataFrame(reuters)
df_reuters = df_reuters.drop('_id', axis=1);
df_reuters = df_reuters.drop('source', axis=1);
df_reuters = df_reuters.sort('insertion_date', ascending=False)
df_reuters = df_reuters.reset_index().drop('index', axis=1)

df_reuters.head(10)

Unnamed: 0,ID,URL,created_at,feed_URL,insertion_date,tags,text,title
0,http://www.reuters.com/article/2013/09/07/film-toronto-festival-prisoners-idUSL2N0H308X20130907,http://www.reuters.com/article/2013/09/07/film-toronto-festival-prisoners-idUSL2N0H308X20130907,2013-09-07 14:09:11+00:00,http://feeds.reuters.com/news/usmarkets,2013-09-07 14:35:16.164000+00:00,"[{u'term': u'marketsNews', u'scheme': None, u'label': None}]","[TORONTO, Sept 7 - Forget for a minute the Hugh\nJackman of Broadway musicals and ""Les Miserables"" and ""X-Men""\nmovies, and envision the Australian actor as a Pennsylvania\nsurvivalist and desperate father who takes justice into his own\nhands., As a carpenter without enough work who stockpiles supplies\nand doesn't trust government, he is the seething vigilante who\ndrives the dark thriller ""Prisoners,"" one of the most\ntalked-about films at the Toronto International Film Festival\nand th...",'Prisoners' captures rage and despair of a vulnerable America at Toronto film fest
1,http://www.reuters.com/article/2013/09/07/cernobbio-saccomanni-idUSL5N0H30A920130907,http://www.reuters.com/article/2013/09/07/cernobbio-saccomanni-idUSL5N0H30A920130907,2013-09-07 13:50:56+00:00,http://feeds.reuters.com/reuters/bondsNews,2013-09-07 14:04:41.071000+00:00,"[{u'term': u'bondsNews', u'scheme': None, u'label': None}]","[* Warns of ""disastrous loss of credibility"" if deficit\ntarget missed, * Berlusconi party pledges ""responsibility"" ahead of key\nSenate vote, By Giancarlo Navach]",UPDATE 1-Minister says Italy will dodge political crisis
2,http://www.reuters.com/article/2013/09/07/telecomitalia-sawiris-idUSL5N0H30CG20130907,http://www.reuters.com/article/2013/09/07/telecomitalia-sawiris-idUSL5N0H30CG20130907,2013-09-07 13:11:33+00:00,http://feeds.reuters.com/reuters/cyclicalconsumergoodsNews,2013-09-07 13:22:29.240000+00:00,"[{u'term': u'cyclicalConsumerGoodsSector', u'scheme': None, u'label': None}]","[By Paola Arosio, MILAN, Sept 7 (Reuters) - Egyptian telecoms tycoon Naguib\nSawiris said on Saturday he was still interested in taking a\nstake in Telecom Italia but might be discouraged if\nthe Italian government was opposed., In an emailed statement to Reuters, Sawiris said he was\nconsidering a ""potential investment in Telecom Italia"" but might\nnot submit a proposal ""in view of (Italy's)... alleged\npreference for Telefonica reported in the Italian press.""]","UPDATE 1-Sawiris eyes Telecom Italia, but concerned about Rome"
3,http://www.reuters.com/article/2013/09/07/usa-philadelphia-schools-idUSL2N0GO1CY20130907,http://www.reuters.com/article/2013/09/07/usa-philadelphia-schools-idUSL2N0GO1CY20130907,2013-09-07 13:00:00+00:00,http://feeds.reuters.com/reuters/bondsNews,2013-09-07 13:20:11.633000+00:00,"[{u'term': u'bondsNews', u'scheme': None, u'label': None}]","[* City pensions compete with schools for extra cash, * Plans to sell shuttered schools, By Hilary Russ]",Fiscal crisis looms for Philadelphia schools as students return
4,http://www.reuters.com/article/2013/09/07/china-politics-ring-idUSL4N0H306B20130907,http://www.reuters.com/article/2013/09/07/china-politics-ring-idUSL4N0H306B20130907,2013-09-07 11:09:50+00:00,http://feeds.reuters.com/reuters/cyclicalconsumergoodsNews,2013-09-07 12:53:21.464000+00:00,"[{u'term': u'cyclicalConsumerGoodsSector', u'scheme': None, u'label': None}]","[The official Xinhua news agency put out a one-line report\nstating: ""Talk on the Internet about Xi Jinping wearing a\nwedding ring at the G20 summit is fake information."" It provided\nno further explanation., The denial comes as China embarks on yet another crackdown\non what it terms ""online rumours"", as the control-obsessed\ngovernment tries once again to rein in social media., Xinhua's report appeared to stem from a lingering shot\ncarried on state television earlier in the week of Xi's ...",Chinese left confused by bizarre missive on Xi's ring
5,http://www.reuters.com/article/2013/09/07/australia-election-idUSL4N0H307Q20130907,http://www.reuters.com/article/2013/09/07/australia-election-idUSL4N0H307Q20130907,2013-09-07 12:34:32+00:00,http://feeds.reuters.com/news/usmarkets,2013-09-07 12:53:04.493000+00:00,"[{u'term': u'marketsNews', u'scheme': None, u'label': None}]","[* Australian conservatives win power, * Defeated Rudd quits as Labor leader, * Voters punish Labor party for political disunity]",RPT-UPDATE 5-Conservative leader Abbott sweeps into power in Australian elections
6,http://www.reuters.com/article/2013/09/07/india-finmin-imports-idUSL4N0H306M20130907,http://www.reuters.com/article/2013/09/07/india-finmin-imports-idUSL4N0H306M20130907,2013-09-07 11:34:40+00:00,http://feeds.reuters.com/reuters/financialsNews,2013-09-07 12:12:56.359000+00:00,"[{u'term': u'financialsSector', u'scheme': None, u'label': None}]","[The government has already banned duty-free import of\nflat-screen televisions from Aug. 26, adding to a package of\nmeasures designed to prop up the rupee., ""I hope that all these measures taken together will have a\nbeneficial impact on inflation,"" Chidambaram told lawmakers., He added that no decision has been taken to raise fuel\nprices.]",India to unveil new measures to curb inessential imports
7,http://www.reuters.com/article/2013/09/07/us-weather-hurricanes-idUSBRE9860AY20130907,http://www.reuters.com/article/2013/09/07/us-weather-hurricanes-idUSBRE9860AY20130907,2013-09-07 12:04:04+00:00,http://feeds.reuters.com/reuters/environment,2013-09-07 12:12:56.359000+00:00,"[{u'term': u'environmentNews', u'scheme': None, u'label': None}]","[MIAMI - The 2013 Atlantic hurricane season, which forecasters had predicted would be more active than normal, has turned out to be something of a dud so far as an unusual calm hangs over the tropics., As the season heads into the historic peak for activity, it may even enter the record books as marking the quietest start to any Atlantic hurricane season in decades., ""It certainly looks like pretty much of a forecast bust,"" said Jeff Masters, a hurricane expert and director of meteorology a...",Atlantic hurricane season - a record-breaking dud?
8,http://www.reuters.com/article/2013/09/07/cernobbio-intesa-idUSL5N0H306K20130907,http://www.reuters.com/article/2013/09/07/cernobbio-intesa-idUSL5N0H306K20130907,2013-09-07 11:49:29+00:00,http://feeds.reuters.com/reuters/financialServicesrealEstateNews,2013-09-07 12:12:37.609000+00:00,"[{u'term': u'rbssFinancialServicesAndRealEstateNews', u'scheme': None, u'label': None}]","[By Lisa Jucca and Paola Arosio, CERNOBBIO, Italy, Sept 7 (Reuters) - The chief executive of\nIntesa Sanpaolo said on Saturday he expected a new\nphase of mergers in the banking sector, both inside Italy and\nacross Europe., Enrico Cucchiani also said his bank was not interested in\nincreasing its exposure to Italy's banking system.]",INTERVIEW-Intesa CEO expects bank mergers in Europe
9,http://www.reuters.com/article/2013/09/07/weather-hurricanes-idUSL2N0H21LR20130907,http://www.reuters.com/article/2013/09/07/weather-hurricanes-idUSL2N0H21LR20130907,2013-09-07 12:00:00+00:00,http://feeds.reuters.com/reuters/financialServicesrealEstateNews,2013-09-07 12:12:37.609000+00:00,"[{u'term': u'rbssFinancialServicesAndRealEstateNews', u'scheme': None, u'label': None}]","[MIAMI, Sept 7 - The 2013 Atlantic hurricane\nseason, which forecasters had predicted would be more active\nthan normal, has turned out to be something of a dud so far as\nan unusual calm hangs over the tropics., As the season heads into the historic peak for activity, it\nmay even enter the record books as marking the quietest start to\nany Atlantic hurricane season in decades., ""It certainly looks like pretty much of a forecast bust,""\nsaid Jeff Masters, a hurricane expert and director of...",Atlantic hurricane season - a record-breaking dud?


In [6]:
df_bloomberg.text[1]            #.text --> text = column header b/c of dot; indexing to first element

[u'Payrolls rose by 169,000 last month, a Labor Department\nreport showed yesterday in Washington , and the jobless rate fell\nto 7.3 percent as people left the workforce. Revisions to prior\nreports subtracted a total of 74,000 jobs to payrolls in the\nprevious two months, while hours worked and earnings rose.',
 u'The jobs report is the last one Fed policy makers will see\nbefore their Sept. 17-18 meeting, when they resume a debate on\nwhen to pare $85 billion in monthly bond purchases. The increase\nin employment was probably strong enough to convince central\nbankers to reduce so-called quantitative easing by about $10\nbillion a month, said Stuart Hoffman, chief economist at PNC\nFinancial Services Group in Pittsburgh.',
 u'\u201cMaybe they do this taper lite,\u201d said Hoffman, the top\nforecaster of private payroll growth for the past two years,\naccording to data compiled by Bloomberg. \u201cThe headcount\u2019s weak,\nbut the income earned from wages and longer hours is posit

In [7]:
df_reuters.text[0]

[u'TORONTO, Sept 7  - Forget for a minute the Hugh\nJackman of Broadway musicals  and "Les Miserables" and "X-Men"\nmovies, and envision the Australian actor as a Pennsylvania\nsurvivalist and desperate father who takes justice into his own\nhands.',
 u'As a carpenter without enough work who stockpiles supplies\nand doesn\'t trust government, he is the seething vigilante who\ndrives the dark thriller "Prisoners," one of the most\ntalked-about films at the Toronto International Film Festival\nand the subject of early Oscar buzz.',
 u'Playing opposite Jake Gyllenhaal\'s small town detective,\nJackman\'s Keller Dover embodies what French Canadian director\nDenis Villeneuve calls a "lack of confidence in the\ninstitutions."']

In [8]:
import nltk
from nltk.tag.stanford import StanfordTagger
from nltk.collocations import *
from text.blob import TextBlob

In [9]:
articles = df_bloomberg.text.tolist()     #.tolist() --> changes type from str to list
text = '  '.join(sentence for sentence in articles[0])  #join sentences into 1 string sep by dbl space
#text = text.encode("utf-8")

In [10]:
def find_entities(chunks):
    "given list of tagged parts of speech, returns unique named entities"

    def traverse(tree):
        "recursively traverses an nltk.tree.Tree to find named entities"
        entity_names = []
    
        if hasattr(tree, 'node') and tree.node:
            print tree.node
            if tree.node == 'GPE':
                entity_names.append(' '.join([child[0] for child in tree]))
            elif tree.node == 'PERSON':
                entity_names.append(' '.join([child[0] for child in tree]))
            else:
                for child in tree:
                    entity_names.extend(traverse(child))
    
        return entity_names
    
    named_entities = []
    
    for chunk in chunks:
        entities = sorted(list(set([word for tree in chunk
                            for word in traverse(tree)])))
        for e in entities:
            if e not in named_entities:
                named_entities.append(e)
    return named_entities

In [11]:
 ## takes text and associate w. POS tags (so can filter on basis of tags)

tokens = nltk.word_tokenize(text)            
sentences = nltk.sent_tokenize(text)
words     = (nltk.word_tokenize(sentence) for sentence in sentences)
tags       = [nltk.pos_tag(word) for word in words]

named_entity_chunks = nltk.batch_ne_chunk(tags)
find_entities(named_entity_chunks)

GPE
GPE
PERSON
PERSON
PERSON
PERSON


[u'Brooklyn', u'New York', u'Merrill', u'Cara E. Greene', u'Merrill Lynch']

In [12]:
# http://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity/12128777
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [14]:
sentences[:10]

[u'Lawyers representing female financial advisers and licensed\ntrainees filed a request for approval of the settlement\nyesterday in federal court in Brooklyn , New York .',
 u'The advisers\nalleged in the lawsuit that Merrill discriminated against women\nin those positions in compensation and business opportunities.',
 u'\u201cThis settlement helps ensure that Merrill Lynch is a\nplace where women can thrive and be successful,\u201d Cara E.\nGreene, a lawyer for the plaintiffs, said in a statement.',
 u'\u201cHopefully others will follow Merrill Lynch\u2019s example.\u201d  The accord, which is subject to approval by a federal\njudge, covers about 4,800 women who worked at the firm during\nthe period of Aug. 2, 2007 to Sept. 15, 2013, according to\ndocuments filed with the plaintiffs\u2019 motion.',
 u'Along with a cash fund, the settlement would require\nchanges at the firm to be overseen by an independent monitor,\naccording to the plaintiffs\u2019 attorneys.']

In [15]:
tfidf = TfidfVectorizer(ngram_range=(1,3), token_pattern=r'\b\w+\b', min_df=1).fit_transform(cleaned_text) # need to use character n-grams

NameError: name 'cleaned_text' is not defined

In [22]:
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()

related_docs_indices = cosine_similarities.argsort()[:-20:-1]
related_docs_indices

array([   0,    8, 3442, 2201,   63,  320, 1371, 1363, 3218, 4599, 3386,
       6512, 6515, 4816,  603, 5771,    9, 2687, 2142])

In [23]:
cosine_similarities[related_docs_indices]

array([ 1.        ,  0.43069066,  0.28236834,  0.17565869,  0.17336819,
        0.152429  ,  0.15153243,  0.15124149,  0.15110318,  0.14708165,
        0.14708165,  0.14627424,  0.14622723,  0.14495305,  0.13674133,
        0.1356015 ,  0.12915598,  0.1270037 ,  0.12179412])

In [24]:
for i in related_docs_indices:
    print cleaned_text[i]

AT_USER $aapl. apple's iphone has cracked.
apple's iphone has cracked the last two major airtime provider holdouts ~ URL $aapl
report: apple’s iphone has over 43% of the u.s. market $aapl URL
$aapl docomo to offer apple's iphone: nikkei URL
features of latest apple’s iphone leaked URL $aapl
apple's iphone holds 40% share of us smartphone market $aapl URL
apple's iphone 5c, a detonator in global smartphone market $aapl URL
AT_USER apple's iphone 5c, a detonator in global smartphone market $aapl URL
a shareholders guide to apple's iphone trade-in program $aapl URL
AT_USER $aapl
AT_USER $aapl
AT_USER apple's iphone grows to 43% share among us smartphones $aapl URL
apple's iphone grows to 43% share among us smartphones $aapl URL
AT_USER $aapl apple sep 11 event in beijing
$aapl apple’s low-cost iphones shipping to china mobile URL
latest parts leak may show apple's 'iphone 5s' fingerprint scanner $aapl URL
$aapl apple's worst kept secret: bigger iphones in 2014 URL
$aapl new high-res pics 

In [25]:
tfidf.data

array([ 0.3160494 ,  0.2486245 ,  0.12166814, ...,  0.2115839 ,
        0.20560519,  0.05370148])

In [26]:
vectorizer = CountVectorizer(min_df=1)

In [65]:
vectorizer.fit_transform([text[0]])

<1x6 sparse matrix of type '<type 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Column format>

In [66]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
#X_train = vectorizer.fit_transform(data_train.data)

In [58]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer as porter_stemmer

lemmatizer = WordNetLemmatizer()

In [60]:
def lemmatize_text(text):
	tokens = nltk.word_tokenize(text)
	sentences = nltk.sent_tokenize(text)
	words = (nltk.word_tokenize(sentence) for sentence in sentences)
	tag_tuples = [nltk.pos_tag(word) for word in words]

	new_sentence = []
	for tuples in tag_tuples:
		for word, tag in tuples:
			if tag.startswith('V'):
				new_sentence.append( lemmatizer.lemmatize(word, wn.VERB) )
				
			elif tag.startswith('J'):
				new_sentence.append( lemmatizer.lemmatize(word, wn.ADJ) )
				
			elif tag.startswith('N'):
				new_sentence.append( lemmatizer.lemmatize(word, wn.NOUN) )
				
			elif tag.startswith('R'):
				new_sentence.append( lemmatizer.lemmatize(word, wn.ADV) )
				
			else:
				new_sentence.append( word )

	print text
	print ' '.join(i for i in new_sentence)

In [64]:
lemmatize_text(cleaned_text[5])

will low cost iphones actually lower apple margins? $aapl URL
will low cost iphones actually low apple margin ? $ aapl URL


In [72]:
import CMUTweetTagger # will wrap this with a web-service

In [74]:
for text in cleaned_text[:10]:
    print text
    print CMUTweetTagger.runtagger_parse([text])
    print

AT_USER $aapl. apple's iphone has cracked.
[[('AT_USER', 'P', 0.5752), ('$aapl', '^', 0.7174), ('.', ',', 0.9668), ("apple's", 'Z', 0.6764), ('iphone', '^', 0.7309), ('has', 'V', 0.9833), ('cracked', 'V', 0.5413), ('.', ',', 0.9983)]]

$aapl is holding well in the bull flag. did you notice the golden cross on the daily? ;) URL
[[('$aapl', '^', 0.8645), ('is', 'V', 0.9961), ('holding', 'V', 0.9728), ('well', 'R', 0.8528), ('in', 'P', 0.9986), ('the', 'D', 0.9991), ('bull', 'N', 0.9745), ('flag', 'N', 0.9849), ('.', ',', 0.9979), ('did', 'V', 0.9994), ('you', 'O', 0.9957), ('notice', 'V', 0.9922), ('the', 'D', 0.999), ('golden', 'A', 0.4243), ('cross', 'N', 0.9899), ('on', 'P', 0.9987), ('the', 'D', 0.9991), ('daily', 'A', 0.5749), ('?', ',', 0.9897), (';)', 'E', 0.9774), ('URL', 'N', 0.4083)]]

$aapl if we get a pullback, load the house, don't listen to cnbc, they'll tell you to buy once it hits 560 and they exit
[[('$aapl', '^', 0.7156), ('if', 'P', 0.9991), ('we', 'O', 0.9988), ('get'