# Run Naive Bayes Classifier on the Text Data to Predict Share Price Direction

- As opposed to using the existing Naive Bayes Classifier trained on a movie review corpus, we attempted to fit the classifier directly to the articles and corresponding daily share price returns

In [280]:
import csv
from yahoo_historical import Fetcher
import numpy as np
import pandas as pd

# Naive Bayes Classifier 
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

## 1. Downlaod the article csv file

- Change the directory to where the csv file is saved
- Extract only relevant data 
- dates, abstract, headline, paragraph and snippet

In [222]:
with open('news_data.csv') as csvfile:
    
    readCSV = csv.reader(csvfile)
    
    dates = []
    abstracts = []
    headlines = []
    snippets = []
    paragraphs = []
    
    for row in readCSV:
        abstract = row[0]
        date = row[1]
        headline = row[4]
        paragraph = row[8]
        snippet = row[9]
        
        dates.append(date)
        abstracts.append(abstract)
        headlines.append(headline)
        snippets.append(snippet)
        paragraphs.append(paragraph)
        

In [239]:
# Change the news into data frame
news = {'date': dates, 'abstract': abstracts, 'headline': headlines, 'snippet': snippets, 'paragraph': paragraphs}
news_df = pd.DataFrame(news)
news_df = news_df.iloc[1:len(news_df['abstract']),]

# Group the data frame by date and get the aggregate articles per day
#news_df_grouped = news_df.groupby('date')['abstract'].sum()
news_df_grouped = news_df.groupby('date').agg({'abstract':'sum', 'headline':'sum', 'snippet':'sum', 'paragraph':'sum'})
news_df_grouped = pd.DataFrame(news_df_grouped.reset_index())

# Check
news_df_grouped.head()

Unnamed: 0,date,abstract,headline,snippet,paragraph
0,2014-01-07,Apple’s iCloud data service works best with Ap...,Taking Along iCloud Calendars,Apple’s iCloud data service works best with Ap...,Q. Our family life depends on Apple shared cal...
1,2014-01-09,"In March, Apple and Samsung Electronics are sc...",Apple and Samsung Chiefs to Meet a Mediator Ah...,"In March, Apple and Samsung Electronics are sc...","In March, Apple and Samsung Electronics are sc..."
2,2014-01-10,Last year was the first in which personal comp...,"For PC Makers, the Good News on 2013 Is That I...",Last year was the first in which personal comp...,The two leading analysis companies tracking th...
3,2014-01-13,To draw young buyers and increase its market s...,Cost of Cool in India? An iPhoneDaily Report: ...,To draw young buyers and increase its market s...,"BANGALORE, India — After deliberating for mont..."
4,2014-01-14,This is not the usual start-up acquisition: Th...,Google and Nest: Two Companies in the Business...,This is not the usual start-up acquisition: Th...,"Google has announced it is buying Nest Labs, m..."


## 2. Download Share Prices from Yahoo and Compute Log Differences

In [240]:
data = Fetcher("AAPL", [2014,1,1], [2019,4,30])
shareprice_df = data.getHistorical()

# Use Adjusted Close Price
close_price = shareprice_df.iloc[:, [0,5]]

# Covert the close price to np.array to get log differences
np_price = np.array(close_price.iloc[:,[1]])
log_diff_price = np.diff(np.log(np_price).reshape(1,len(np_price))).reshape(len(np_price)-1, 1)

# Create the binary response variable
binary_response = ['pos' if i > 0 else 'neg' for i in log_diff_price]

# Create the data frame of share price returns (log differences)
log_diff = close_price.iloc[1:len(np_price),[0]]
log_diff.insert(1, 'log_diff_price', log_diff_price)
log_diff.insert(2, 'binary_response', binary_response)
log_diff.rename(columns={'Date':'date', 'log_diff_price':'log_diff', 'binary_response':'binary_response'}, inplace=True)

# Check
log_diff.head()


Unnamed: 0,date,log_diff,binary_response
1,2014-01-03,-0.02221,neg
2,2014-01-06,0.005438,pos
3,2014-01-07,-0.007178,neg
4,2014-01-08,0.006313,pos
5,2014-01-09,-0.012852,neg


## 3. Create the join table 
- Since dates are different between the two data sets, we will use inner join to extract the data where we have both information

In [242]:
news_sp = pd.merge(log_diff, news_df_grouped, on = 'date', how='inner')
news_sp.head()

Unnamed: 0,date,log_diff,binary_response,abstract,headline,snippet,paragraph
0,2014-01-07,-0.007178,neg,Apple’s iCloud data service works best with Ap...,Taking Along iCloud Calendars,Apple’s iCloud data service works best with Ap...,Q. Our family life depends on Apple shared cal...
1,2014-01-09,-0.012852,neg,"In March, Apple and Samsung Electronics are sc...",Apple and Samsung Chiefs to Meet a Mediator Ah...,"In March, Apple and Samsung Electronics are sc...","In March, Apple and Samsung Electronics are sc..."
2,2014-01-10,-0.006695,neg,Last year was the first in which personal comp...,"For PC Makers, the Good News on 2013 Is That I...",Last year was the first in which personal comp...,The two leading analysis companies tracking th...
3,2014-01-13,0.005221,pos,To draw young buyers and increase its market s...,Cost of Cool in India? An iPhoneDaily Report: ...,To draw young buyers and increase its market s...,"BANGALORE, India — After deliberating for mont..."
4,2014-01-14,0.019703,pos,This is not the usual start-up acquisition: Th...,Google and Nest: Two Companies in the Business...,This is not the usual start-up acquisition: Th...,"Google has announced it is buying Nest Labs, m..."


## 4. Create a list  of training and test sets for Naive Bayes Classifier

- There are 785 data points in the data set
- We will use 700 data points as a training set which is ~90%
- We will use the rest as a test set 
- Choose which text to fit the classifier
- A classifier based on the Naive Bayes algorithm, as implemented in NLTK.

In [277]:
def nbc(text = 'abstract'):
    
    if text == 'abstract':
        no = 3
    elif text == 'headline':
        no = 4
    elif text == 'snippet':
        no = 5
    elif text == 'paragraph':
        no = 6
    
    df_list = [(news_sp.values[i,no], news_sp.values[i,2]) for i in range(len(news_sp))]
    train = df_list[0:700]
    test = df_list[701:len(df_list)+1]
    
    cl = NaiveBayesClassifier(train)
    accuracy = cl.accuracy(test)

    return [cl, accuracy]

**Use the function to create classifier**

In [275]:
cl_abstract = nbc('abstract')

**Use show_informative_features**

In [279]:
cl_abstract[0].show_informative_features(20)

Most Informative Features
            contains(we) = True              neg : pos    =      7.6 : 1.0
         contains(worth) = True              pos : neg    =      7.1 : 1.0
           contains(yet) = True              neg : pos    =      6.9 : 1.0
        contains(showed) = True              neg : pos    =      6.1 : 1.0
         contains(women) = True              neg : pos    =      5.4 : 1.0
  contains(improvements) = True              neg : pos    =      5.4 : 1.0
        contains(appeal) = True              neg : pos    =      5.4 : 1.0
        contains(Street) = True              pos : neg    =      5.2 : 1.0
       contains(lawyers) = True              neg : pos    =      4.7 : 1.0
      contains(pressure) = True              neg : pos    =      4.7 : 1.0
          contains(sign) = True              neg : pos    =      4.7 : 1.0
       contains(improve) = True              neg : pos    =      4.7 : 1.0
      contains(internet) = True              neg : pos    =      4.7 : 1.0

**Use classify function to classify the text**

In [291]:
cl_abstract[0].classify('women')

'pos'

**Compute Accuracy Rate on Test Set**

In [276]:
cl_abstract[1]

0.5833333333333334