In [1]:
import os
import glob
import numpy as np
import pandas as pd
import nltk
pd.set_option('display.max_colwidth', 200) # show up to 200 characters per column when a DataFrame is displayed

In [2]:
# Load the raw news articles from the local pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
all_news = pd.read_pickle("./data/newspickle.pkl")
all_news.head()

Unnamed: 0,long_date_format,headline,text,url
0,"Mon Nov 13, 2006 3:16am EST","Hitachi, GE boost alliance in nuclear power business","TOKYO (Reuters) - Hitachi Ltd. ( 6501.T ) said on Monday it has agreed with General Electric Co. ( GE.N ) to expand their global alliance in the nuclear power business, aiming to strengthen thei...",http://www.reuters.com/article/2006/11/13/us-energy-hitachi-ge-idUSTKB00256820061113
1,"Mon Nov 13, 2006 8:45am EST","Volvo to cut 1,000 staff at Virginia plant","STOCKHOLM (Reuters) - Truck maker Volvo said on Monday it would cut about 1,000 staff at its Dublin, Virginia plant in the United States due to an expected decline in output. After years of stro...",http://www.reuters.com/article/2006/11/13/us-autos-volvo-us-idUSSAT00355920061113
2,"Mon Nov 13, 2006 3:15am EST",European banks hiding full pension obligations,"ZURICH, Nov 13 (Reuter) - West European banks are failing to disclose unfunded staff pension obligations running to billions of dollars, in contrast to U.S. banks which are required to show the f...",http://www.reuters.com/article/2006/11/13/us-financial-pensions-idUSL1083562820061113
3,"Mon Nov 13, 2006 7:13am EST","Hitachi, GE to form joint nuclear power ventures",TOKYO (Reuters) - Japan's Hitachi Ltd. and U.S. group General Electric Co. said on Monday they would set up joint ventures in Japan and the United States to combine their nuclear power operation...,http://www.reuters.com/article/2006/11/13/us-energy-hitachi-ge-nuclear-idUST3593020061113
4,"Mon Nov 13, 2006 7:29am EST",Eddie Bauer agrees to be bought for $286 million,(Reuters) - Eddie Bauer Holdings Inc. EBHI.O said and it has agreed to be acquired by a company owned by affiliates of Sun Capital Partners Inc. and Golden Gate Capital for about $286 million and...,http://www.reuters.com/article/2006/11/13/us-retail-eddiebauer-suncapitalpartners-idUSWNAS309120061113


In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Score every article with VADER once and cache the labeled frame on disk;
# later runs load the cached pickle instead of re-scoring the whole corpus.
if not os.path.exists('./data/labeled_news.pkl'):
    sia = SIA()

    # Iterate over the text column directly: iterrows() builds a Series for
    # every row, which is needlessly slow when only one column is read.
    results = [sia.polarity_scores(text) for text in all_news['text']]

    # Reuse all_news' index so the column-wise concat below pairs each score
    # with its own article even when the index is not the default 0..n-1.
    df = pd.DataFrame(results, index=all_news.index)

    # Bucket the compound score: above 0.2 is positive (1), below -0.2 is
    # negative (-1), everything in between stays neutral (0).
    df['label'] = 0
    df.loc[df['compound'] > 0.2, 'label'] = 1
    df.loc[df['compound'] < -0.2, 'label'] = -1

    all_news_labeled = pd.concat([all_news, df], axis=1)

    all_news_labeled.to_pickle("./data/labeled_news.pkl")
    print("News Saved!")

else:
    # NOTE(review): unpickling can execute arbitrary code -- this file is
    # expected to be the cache written by the branch above.
    all_news_labeled = pd.read_pickle("./data/labeled_news.pkl")
    print("News loaded")

News loaded


## Exploring the data set

In [4]:
# What is the shape of this dataset?
n_rows, n_cols = all_news_labeled.shape
print("Input data has {} rows and {} columns".format(n_rows, n_cols))

Input data has 106494 rows and 9 columns


In [5]:
# How many articles are negative, neutral and positive?

labels = all_news_labeled['label']
# Summing a boolean mask counts the rows that carry the given label.
n_negative, n_neutral, n_positive = ((labels == value).sum() for value in (-1, 0, 1))

print("Out of {} rows, {} are negative, {} are neutral, {} are positive".format(
    len(all_news_labeled), n_negative, n_neutral, n_positive))

Out of 106494 rows, 32597 are negative, 6642 are neutral, 67255 are positive


In [6]:
# Randomize the row order so later subsets and splits are not biased by the
# ordering of the source data. The fixed random_state makes the shuffle
# reproducible across kernel restarts.
data = all_news_labeled.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,long_date_format,headline,text,url,neg,neu,pos,compound,label
0,"Wed Dec 30, 2009 9:56pm EST",Time Warner Cable could OK arbitration with Fox,NEW YORK (Reuters) - Time Warner Cable Inc said it is willing to go to arbitration with News Corp's Fox Networks to avoid losing some of Fox's channels if the companies fail to agree on a new ca...,http://www.reuters.com/article/2009/12/31/us-timewarnercable-idUSTRE5BT3P920091231,0.028,0.892,0.08,0.9623,1
1,"Mon Sep 13, 2010 1:00am EDT",ECB's Liikanen says Basel III will not limit recovery,"BASEL, Switzerland (Reuters) - The new banking regulations will contribute to a more stable financial system and support economic growth, European Central bank Governing Council member Erkki Lii...",http://www.reuters.com/article/2010/09/13/us-ecb-liikanen-idUSTRE68C0U820100913,0.036,0.812,0.152,0.8748,1
2,"Mon Nov 22, 2010 3:50am EST",Ireland says EU/IMF unlikely to change 4-year debt plan,"DUBLIN (Reuters) - The European Union and International Monetary Fund have seen the outline of Ireland's four-year austerity plan and are unlikely to request significant changes, Finance Ministe...",http://www.reuters.com/article/2010/11/22/us-ireland-lenihan-debt-idUSTRE6AL1FV20101122,0.032,0.924,0.045,0.4019,1
3,"Fri Jan 29, 2010 10:51am EST",American Express CEO gets base pay bump,"NEW YORK (Reuters) - American Express Co ( AXP.N ) increased the base pay of Chief Executive Kenneth Chenault to $2 million from $1.25 million, the company reported on Friday in a regulatory fil...",http://www.reuters.com/article/2010/01/29/us-americanexpress-pay-idUSTRE60S43W20100129,0.065,0.805,0.13,0.975,1
4,"Thu Oct 7, 2010 2:32pm EDT","Soros says banking system remains ""too connected""","NEW YORK (Reuters) - Billionaire investor George Soros said on Thursday that the U.S. financial regulation bill does not address the problem of a banking system that is ""too connected to fail"". ...",http://www.reuters.com/article/2010/10/07/us-soros-regulation-idUSTRE6963NS20101007,0.086,0.839,0.075,-0.5749,-1


## Data Pre-processing

In this step we want to:
- remove punctuation
- transform text to lower case
- split the text into tokens (which are English words)
- remove stopwords
- stem the remaining tokens

In [7]:
import string
import re
stopwords = nltk.corpus.stopwords.words('english')
# Set copy of the stopword list: membership tests become O(1) instead of a
# linear scan of the list for every token.
_stopword_set = frozenset(stopwords)
ps = nltk.PorterStemmer()

def clean_text(text):
    """Normalize raw text into a list of stemmed tokens.

    Steps: drop every character found in ``string.punctuation`` and lowercase
    the rest, split on runs of non-word characters, discard English stopwords
    and empty tokens, then Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Iterating a string yields characters, so punctuation is filtered per character.
    text = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' at the edges when the text starts or ends with a
    # separator; `if word` keeps those empty strings out of the result.
    return [ps.stem(word) for word in tokens if word and word not in _stopword_set]

In [8]:
# Keep only the first 5000 rows of `data`.
# NOTE(review): presumably done to keep training time short -- confirm. This rebinds
# `data`, so the full frame is no longer reachable under that name afterwards.
data = data[:5000]

In [9]:
# Tokenize every article body. Iterating a DataFrame directly yields its
# column labels, not its rows, so loop over the 'text' column explicitly.
clean_data = [clean_text(article) for article in data['text']]

## Split the dataset into training and testing sets: 80% for training, 20% held out for testing

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and therefore the reported accuracy, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

In [11]:
# Sanity check: number of samples that ended up in the training split.
X_train.shape

(4000,)

## Vectorizing Data
- Transform the training and testing data into a numeric form that the machine learning algorithm can understand.
- First the TF-IDF vectorizer is fitted on the training data; it then transforms the text into a feature vector space that we can feed to our ML model.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Learn the vocabulary and IDF weights from the training texts only, capped
# at the 2000 most frequent terms, and vectorize the training set in the same
# pass; the test set is then projected with the already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=2000)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)

In [13]:
import pickle

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back to the vendored
# copy only on old installations that still ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted vectorizer so new text can be transformed later with the
# same vocabulary and IDF weights.
joblib.dump(vectorizer, 'tfidfVectorizer.pkl')



['tfidfVectorizer.pkl']

In [14]:
# Convert the sparse TF-IDF matrices into dense DataFrames for the classifier.
X_train_vect, X_test_vect = (
    pd.DataFrame(sparse_matrix.toarray()) for sparse_matrix in (tfidf_train, tfidf_test)
)

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# 150 fully grown trees trained on all CPU cores. The fixed random_state makes
# the fitted forest and the accuracies printed below reproducible.
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=42)
rf_model = rf.fit(X_train_vect, y_train)
y_train_pred = rf_model.predict(X_train_vect)
y_pred = rf_model.predict(X_test_vect)

# Train and Test Accuracy
# A large gap between the two figures means the forest is overfitting the training set.
print ("Train Accuracy : {}".format(accuracy_score(y_train, y_train_pred)))
print ("Test Accuracy  : {}".format(accuracy_score(y_test, y_pred)))

Train Accuracy : 1.0
Test Accuracy  : 0.747


In [16]:
import pickle

# Serialize the trained random forest to disk so it can be reused without retraining.
pkl_filename = "rf_model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

In [17]:
# Smoke test: classify one unseen article with the persisted vectorizer.
sample_texts = ['March 15 (Reuters) - Mobile phone chip supplier Qualcomm Inc on Friday won a court victory against iPhone maker Apple Inc, with a jury in federal court in San Diego finding that Apple infringed on three of Qualcomm’s patents, a Qualcomm spokeswoman told Reuters. (Reporting by Stephen Nellis; Editing by Richard Chang)']

# NOTE(review): joblib.load unpickles the file -- only load files this notebook wrote itself.
tfidfVectorizer = joblib.load('tfidfVectorizer.pkl')

# Use dedicated names here: rebinding `tfidf_train` would silently replace the
# training matrix with this one-row sample if earlier cells were re-run.
sample_tfidf = tfidfVectorizer.transform(sample_texts)
sample_features = pd.DataFrame(sample_tfidf.toarray())

print(rf_model.predict(sample_features))


[1]
