In [134]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import re
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amank\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [255]:
#loading the dataset which we created earlier
ds0= pd.read_csv("C:\\Dev\\mycode\\Ds_Apple.csv")
ds1= pd.read_csv("C:\\Dev\\stock_news_data\\Ds_Apple_04-26-2020.csv")
ds2= pd.read_csv("C:\\Dev\\stock_news_data\\Ds_Apple_04-28-2020.csv")
frames=[ds0,ds1,ds2]
df=pd.concat(frames)

In [256]:
#checking the data
df.head()

Unnamed: 0.1,Unnamed: 0,Firm,Date,Headlines,Label,Description,Open,Close
0,0,Apple,11/20/2006,"Mac fans buzzing about expected Apple ""iPhone""",1,SAN FRANCISCO Reuters The longrumored arriv...,12.2,12.352858
1,1,Apple,12/13/2006,Apple's iTunes music sales collapses in H1: su...,1,AMSTERDAM Reuters Sales at Apples online mu...,12.564285,12.721429
2,2,Apple,12/13/2006,Piper Jaffray disputes report of weak iTunes s...,1,NEW YORK Reuters Digital music sales surged...,12.564285,12.721429
3,3,Apple,12/15/2006,Apple files to delay annual report for option ...,0,WASHINGTON Reuters Apple Computer Inc AAPL...,12.717143,12.531428
4,4,Apple,12/27/2006,Apple shares recover,1,BOSTON Reuters Apple Computer Inc shares fe...,11.164286,11.645715


In [257]:
#getting the details of data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1011 entries, 0 to 95
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   1011 non-null   int64  
 1   Firm         1011 non-null   object 
 2   Date         1011 non-null   object 
 3   Headlines    1011 non-null   object 
 4   Label        1011 non-null   int64  
 5   Description  1011 non-null   object 
 6   Open         1011 non-null   float64
 7   Close        1011 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 71.1+ KB


In [258]:
#defining the label
y=df['Label']
col_name='Headlines' #either 'Description' or Headlines

In [259]:
#processing the text of headline/Description

# Remove all the special characters
df[col_name]=df[col_name].map(lambda x: re.sub(r'\W+', ' ', x))

# remove all single characters
df[col_name]=df[col_name].map(lambda x: re.sub(r'\s+[a-zA-Z]\s+',' ',x))

# Remove single characters from the start
df[col_name]=df[col_name].map(lambda x: re.sub(r'\^[a-zA-Z]\s+',' ',x))

#Substituting multiple spaces with single space
df[col_name]=df[col_name].map(lambda x: re.sub(r'\s+', ' ',x,flags=re.I))

# Removing prefixed 'b'
df[col_name]=df[col_name].map(lambda x: re.sub(r'^b\s+', ' ',x))

In [260]:
#stemming

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(text)]

#df['Description']= df['Description'].apply(lemmatize_text)

In [261]:
X_train, X_test, y_train, y_test=train_test_split(df[col_name],y,test_size=0.33, random_state=53)

In [262]:
count_vectorizer = CountVectorizer(max_features=2500, min_df=7, max_df=0.8,stop_words='english',tokenizer=lemmatize_text)

In [263]:
print(count_vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=2500, min_df=7,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function lemmatize_text at 0x000001888F1D6948>,
                vocabulary=None)


In [264]:
count_train=count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

  'stop_words.' % sorted(inconsistent))


In [265]:
pd.DataFrame(count_train.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115,116,117,118,119,120,121,122,123,124
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
674,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [267]:
count_vectorizer.get_feature_names()

['1',
 '3',
 '4',
 '5',
 '9',
 'aapl',
 'ahead',
 'alpha',
 'analysis',
 'analyst',
 'analysts',
 'antitrust',
 'ban',
 'battle',
 'beat',
 'big',
 'board',
 'book',
 'buy',
 'case',
 'cash',
 'ceo',
 'china',
 'coinspeaker',
 'com',
 'company',
 'cook',
 'court',
 'cut',
 'data',
 'deal',
 'delay',
 'disappoint',
 'dividend',
 'drop',
 'earn',
 'einhorn',
 'eu',
 'event',
 'exclusive',
 'expect',
 'expectations',
 'eye',
 'face',
 'factbox',
 'fall',
 'fight',
 'finance',
 'fool',
 'forecast',
 'google',
 'growth',
 'health',
 'high',
 'hit',
 'instant',
 'investors',
 'investorsobserver',
 'ipad',
 'iphone',
 'iphones',
 'itunes',
 'job',
 'judge',
 'know',
 'launch',
 'look',
 'mac',
 'make',
 'market',
 'microsoft',
 'million',
 'mini',
 'mobile',
 'motley',
 'music',
 'nasdaq',
 'new',
 'news',
 'nokia',
 'offer',
 'options',
 'order',
 'outlook',
 'patent',
 'phone',
 'plan',
 'price',
 'profit',
 'quarter',
 'record',
 'report',
 'result',
 'return',
 'rise',
 'rule',
 's',
 'sa

In [268]:
pd.DataFrame(count_train.toarray(), columns=count_vectorizer.get_feature_names())

Unnamed: 0,1,3,4,5,9,aapl,ahead,alpha,analysis,analyst,...,tech,time,trade,trial,u,unveil,view,win,work,yahoo
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
674,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [269]:
nb_classifier = MultinomialNB()

In [270]:
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [271]:
pred = nb_classifier.predict(count_test)

In [272]:
metrics.accuracy_score(y_test, pred)

0.5449101796407185

In [273]:
metrics.confusion_matrix(y_test, pred, labels=[0,1])

array([[ 66,  93],
       [ 59, 116]], dtype=int64)

In [274]:
alphas = np.arange(0, 1, 0.1)
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(count_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(count_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

In [275]:
#Checking the performance with different alpha values
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Score:  0.5568862275449101

Alpha:  0.1
Score:  0.5568862275449101

Alpha:  0.2
Score:  0.5568862275449101

Alpha:  0.30000000000000004
Score:  0.5538922155688623

Alpha:  0.4
Score:  0.5568862275449101

Alpha:  0.5
Score:  0.5568862275449101

Alpha:  0.6000000000000001
Score:  0.5508982035928144

Alpha:  0.7000000000000001
Score:  0.5508982035928144

Alpha:  0.8
Score:  0.5508982035928144

Alpha:  0.9
Score:  0.5508982035928144



  'setting alpha = %.1e' % _ALPHA_MIN)


In [276]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = count_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])

0 [(-6.35350009728881, 'battle'), (-6.35350009728881, 'offer'), (-5.948034989180646, 'expectations'), (-5.948034989180646, 'eye'), (-5.948034989180646, 'forecast'), (-5.948034989180646, 'investorsobserver'), (-5.948034989180646, 'know'), (-5.948034989180646, 'outlook'), (-5.948034989180646, 'phone'), (-5.948034989180646, 'rule'), (-5.948034989180646, 'u'), (-5.660352916728865, '4'), (-5.660352916728865, '9'), (-5.660352916728865, 'court'), (-5.660352916728865, 'deal'), (-5.660352916728865, 'exclusive'), (-5.660352916728865, 'iphones'), (-5.660352916728865, 'nokia'), (-5.660352916728865, 'rise'), (-5.660352916728865, 'strong')]
1 [(-4.407589948233497, 'earn'), (-4.407589948233497, 'finance'), (-4.407589948233497, 's'), (-4.407589948233497, 'steve'), (-4.407589948233497, 'yahoo'), (-4.338597076746545, 'ceo'), (-4.338597076746545, 'china'), (-4.338597076746545, 'launch'), (-4.274058555608974, 'seek'), (-4.050915004294765, 'sales'), (-4.002124840125333, 'samsung'), (-3.9556048244904396, 'r

In [277]:
#with Tfidf
#tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words='english',use_idf=True)

tfidf_train=tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train,y_train)
pred = nb_classifier.predict(tfidf_test)
score = metrics.accuracy_score(y_test,pred)
print(score)
cm = metrics.confusion_matrix(y_test, pred, labels=[0,1])
print(cm)

0.5658682634730539
[[ 57 102]
 [ 43 132]]


In [278]:
alphas = np.arange(0, 1, 0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

In [279]:
# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Score:  0.5688622754491018

Alpha:  0.1
Score:  0.5658682634730539

Alpha:  0.2
Score:  0.5658682634730539

Alpha:  0.30000000000000004
Score:  0.562874251497006

Alpha:  0.4
Score:  0.5598802395209581

Alpha:  0.5
Score:  0.5598802395209581

Alpha:  0.6000000000000001
Score:  0.5598802395209581

Alpha:  0.7000000000000001
Score:  0.5598802395209581

Alpha:  0.8
Score:  0.5568862275449101

Alpha:  0.9
Score:  0.5538922155688623



  'setting alpha = %.1e' % _ALPHA_MIN)


In [280]:
first_vector_tfidfvectorizer=tfidf_train[2]
 
# place tf-idf values in a pandas data frame
df_temp = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df_temp=df_temp.sort_values(by=["tfidf"],ascending=False)
df_temp.head(10)

Unnamed: 0,tfidf
sources,0.614558
book,0.57858
market,0.536249
aapl,0.0
news,0.0
record,0.0
quarter,0.0
profit,0.0
price,0.0
phone,0.0


In [281]:
#inspecting the model

# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])

0 [(-5.704700368384229, 'sell'), (-5.679637171009755, 'outlook'), (-5.676576012462584, 'know'), (-5.669430227265, 'price'), (-5.624911186506288, 'expectations'), (-5.5771105276414845, 'phone'), (-5.530195277806578, 'investorsobserver'), (-5.5210099671894275, 'cuts'), (-5.503146711065383, 'time'), (-5.452596701207903, 'strong'), (-5.386945713502266, 'disappoint'), (-5.326976573079296, 'court'), (-5.310557468358194, 'eu'), (-5.310011694144957, 'return'), (-5.2833860945497895, 'deal'), (-5.27938928704308, 'nokia'), (-5.272123127398281, 'exclusive'), (-5.269868036132053, 'microsoft'), (-5.248841026163111, 'iphones'), (-5.223517516211487, 'music')]
1 [(-4.447813565524789, 'million'), (-4.444946508884515, 'finance'), (-4.401587157237346, 'launch'), (-4.39036488090326, 'alpha'), (-4.39036488090326, 'seeking'), (-4.258633211363701, 'earnings'), (-4.254551555549038, 'ceo'), (-4.138096122398471, 'steve'), (-4.119769628283791, 'china'), (-4.045649522779904, 'sales'), (-3.948080250243085, 'stock')

In [282]:
#Support Vector Machine
# https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python good example to implement it with lot of
#details

#import SVM
from sklearn import svm
# referring back to split dataset
#X_train, X_test, y_train, y_test=train_test_split(df[col_name],y,test_size=0.33, random_state=53)

#Create a svm Classifier
clf = svm.SVC(kernel='linear', C=1) # Linear Kernel

#Train the model using the training sets - here we took the training dataset which is in vecotrized format by countvectorizer
clf.fit(count_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(count_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.6167664670658682
Precision: 0.6
Recall: 0.8057142857142857


In [243]:
print(clf)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
