In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import gensim
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def removeB(text):
    result = text
    if (text.startswith("b ")):
        result = text[2:]
    return result

In [3]:
#This is common section for all classifiers/techniques

#Read CSV file containing news
news_data = pd.read_csv("Combined_News_DJIA.csv", encoding='ISO-8859-1')

#Train test split
train = news_data[news_data['Date'] < '20150101']
test = news_data[news_data['Date'] > '20141231']

##Preprocessing of training set
# Removing special characters
data=train.iloc[:,2:27]
data.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

# Renaming column names for better understanding and ease of access
list1= [i for i in range(25)]
new_index=[str(i) for i in list1]
data.columns= new_index
data.head(5)

for col in data.columns:
    data[col] = data[col].apply(lambda x: removeB(str(x))).str.lower()
    
##Combining all headlines columns and create a single list per row
headlines1 = []
for row in range(0,len(data.index)):
    headlines1.append(' '.join(str(x) for x in data.iloc[row, 0:25]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))

print(headlines)



In [4]:
#Type this command on powershell window if there is IO error
#jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [5]:
## implement TF IDF
tfidf = TfidfVectorizer()
traindataset = tfidf.fit_transform(headlines)

In [7]:
tfidf.vocabulary_

{'georgia': 10102,
 'down': 7300,
 'russian': 21716,
 'warplane': 27305,
 'country': 5466,
 'brink': 3218,
 'war': 27270,
 'breaking': 3138,
 'musharraf': 16334,
 'impeached': 12056,
 'russia': 21715,
 'today': 25429,
 'column': 4735,
 'troop': 25841,
 'roll': 21522,
 'south': 23521,
 'ossetia': 17861,
 'footage': 9450,
 'fighting': 9113,
 'youtube': 28078,
 'tank': 24863,
 'moving': 16192,
 'capital': 3699,
 'reportedly': 20984,
 'completely': 4886,
 'destroyed': 6578,
 'georgian': 10103,
 'artillery': 1431,
 'afghan': 468,
 'child': 4223,
 'raped': 20271,
 'impunity': 12145,
 'official': 17573,
 'say': 22034,
 'sick': 22906,
 'year': 27980,
 'old': 17622,
 'entered': 8134,
 'whilst': 27524,
 'shoot': 22796,
 'jet': 13116,
 'invades': 12744,
 'warned': 27298,
 'intervene': 12688,
 'enemy': 8038,
 'combatent': 4744,
 'trial': 25776,
 'sham': 22610,
 'salim': 21848,
 'haman': 10848,
 'sentenced': 22444,
 'kept': 13516,
 'longer': 14571,
 'feel': 9004,
 'like': 14362,
 'retreat': 21227,


In [8]:
sentiment_analysis = train[['Date', 'Label']]

from afinn import Afinn
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

afn = Afinn()
scores = [afn.score(headline) for headline in headlines]
sentiment = [1 if score >= 0 else 0 for score in scores]
sentiment_analysis['Afinn_result'] = sentiment

scores = [round(TextBlob(headline).sentiment.polarity, 3) for headline in headlines]
sentiment = [1 if score >= 0 else 0 for score in scores]
sentiment_analysis['textBlob_result'] = sentiment

sia = SentimentIntensityAnalyzer()
scores = [sia.polarity_scores(headline)['compound'] for headline in headlines]
sentiment = [1 if score >= 0 else 0 for score in scores]
sentiment_analysis['vader_result'] = sentiment

In [9]:
sentiment_analysis

Unnamed: 0,Date,Label,Afinn_result,textBlob_result,vader_result
0,2008-08-08,0,0,0,0
1,2008-08-11,1,0,1,0
2,2008-08-12,0,0,0,0
3,2008-08-13,0,0,1,0
4,2008-08-14,1,0,1,0
...,...,...,...,...,...
1858,2015-12-24,0,0,1,0
1859,2015-12-28,0,0,0,0
1860,2015-12-29,1,0,1,0
1861,2015-12-30,0,0,0,0


In [10]:
print(accuracy_score(sentiment_analysis.Label, sentiment_analysis.Afinn_result))
print(accuracy_score(sentiment_analysis.Label, sentiment_analysis.textBlob_result))
print(accuracy_score(sentiment_analysis.Label, sentiment_analysis.vader_result))

0.4659151905528717
0.5131508319914116
0.4659151905528717


In [11]:
## implement LogisticRegression Classifier
model = LogisticRegression()
model.fit(traindataset,train['Label'])

# Preprocessing of testing data
data_test=test.iloc[:,2:27]
data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

for col in data_test.columns:
    data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

## Predict for the Test Dataset
headlines1= []
for row in range(0,len(data_test.index)):
    headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))
    
test_dataset = tfidf.transform(headlines)

#Update this section for each classifier
predictions = model.predict(test_dataset)
        
# matrix = confusion_matrix(test['Label'],predictions)
# print(matrix)
score = accuracy_score(test['Label'],predictions)
print(score)
# report = classification_report(test['Label'],predictions)
# print(report)

0.8148148148148148


In [13]:
## implement RandomForest Classifier
randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')
randomclassifier.fit(traindataset,train['Label'])

# Preprocessing of testing data
data_test=test.iloc[:,2:27]
data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

for col in data_test.columns:
    data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

## Predict for the Test Dataset
headlines1= []
for row in range(0,len(data_test.index)):
    headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))
    
test_dataset = tfidf.transform(headlines)

#Update this section for each classifier
predictions = randomclassifier.predict(test_dataset)
        
# matrix = confusion_matrix(test['Label'],predictions)
# print(matrix)
score = accuracy_score(test['Label'],predictions)
print(score)
# report = classification_report(test['Label'],predictions)
# print(report)

0.8386243386243386


In [14]:
#Section for KNN
from sklearn.neighbors import KNeighborsClassifier

for knn_n in [5,10,20,30,40]:
    model_knn = KNeighborsClassifier(n_neighbors=knn_n)
    model_knn.fit(traindataset, train.Label)

    # Preprocessing of testing data
    data_test=test.iloc[:,2:27]
    data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

    for col in data_test.columns:
        data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

    ## Predict for the Test Dataset
    headlines1= []
    for row in range(0,len(data_test.index)):
        headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))

    #Removal of stop words
    headlines2=[]
    for headline in headlines1:
        headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

    #Stemming
    ps = PorterStemmer()
    headlines3 = [ps.stem(headline) for headline in headlines2]

    #Lemmatization
    lemmatizer = WordNetLemmatizer()
    headlines=[]
    for s in headlines3:
        stn = []
        for word in word_tokenize(s):
            stn.append(lemmatizer.lemmatize(word))
        headlines.append(' '.join(word for word in stn))

    test_dataset = tfidf.transform(headlines)

    #Update this section for each classifier
    predictions = model_knn.predict(test_dataset)

    #Print accuraccy
    score = accuracy_score(test['Label'],predictions)
#     print("Accuracy scores :")
    print("k={}:".format(knn_n), score)

k=5: 0.6349206349206349
k=10: 0.6031746031746031
k=20: 0.5582010582010583
k=30: 0.5608465608465608
k=40: 0.49206349206349204


In [15]:
#Section for Decision Tree
from sklearn.tree import DecisionTreeClassifier


model_dt = DecisionTreeClassifier()
model_dt.fit(traindataset, train.Label)

# Preprocessing of testing data
data_test=test.iloc[:,2:27]
data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

for col in data_test.columns:
    data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

## Predict for the Test Dataset
headlines1= []
for row in range(0,len(data_test.index)):
    headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))
    
test_dataset = tfidf.transform(headlines)

#Update this section for each classifier
predictions = model_dt.predict(test_dataset)

#Print accuraccy
score = accuracy_score(test['Label'],predictions)
#     print("Accuracy scores :")
print("Accuracy of the model is :", score)

Accuracy of the model is : 0.6851851851851852


In [16]:
#Section for Naive Bayes
from sklearn.naive_bayes import GaussianNB


model_nb = GaussianNB()
model_nb.fit(traindataset.toarray(), train.Label)

# Preprocessing of testing data
data_test=test.iloc[:,2:27]
data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

for col in data_test.columns:
    data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

## Predict for the Test Dataset
headlines1= []
for row in range(0,len(data_test.index)):
    headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))
    
test_dataset = tfidf.transform(headlines)

#Update this section for each classifier
predictions = model_nb.predict(test_dataset.toarray())

#Print accuraccy
score = accuracy_score(test['Label'],predictions)
#     print("Accuracy scores :")
print("Accuracy of the model is :", score)

Accuracy of the model is : 0.8465608465608465


In [17]:
#Section for AdaBoost
from sklearn.ensemble import AdaBoostClassifier

model_ab = AdaBoostClassifier()
model_ab.fit(traindataset, train.Label)

# Preprocessing of testing data
data_test=test.iloc[:,2:27]
data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

for col in data_test.columns:
    data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

## Predict for the Test Datasett
## Predict for the Test Dataset
headlines1= []
for row in range(0,len(data_test.index)):
    headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))
    
test_dataset = tfidf.transform(headlines)

#Update this section for each classifier
predictions = model_ab.predict(test_dataset)

#Print accuraccy
score = accuracy_score(test['Label'],predictions)
#     print("Accuracy scores :")
print("Accuracy of the model is :", score)

Accuracy of the model is : 0.58994708994709


In [18]:
#Section for XGBoost
from xgboost import XGBClassifier

model_xg = XGBClassifier()
model_xg.fit(traindataset, train.Label)

# Preprocessing of testing data
data_test=test.iloc[:,2:27]
data_test.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

for col in data_test.columns:
    data_test[col] = data_test[col].apply(lambda x: removeB(str(x))).str.lower()

## Predict for the Test Datasett
## Predict for the Test Dataset
headlines1= []
for row in range(0,len(data_test.index)):
    headlines1.append(' '.join(str(x) for x in data_test.iloc[row,2:27]))
    
#Removal of stop words
headlines2=[]
for headline in headlines1:
    headlines2.append(gensim.parsing.preprocessing.remove_stopwords(headline))

#Stemming
ps = PorterStemmer()
headlines3 = [ps.stem(headline) for headline in headlines2]

#Lemmatization
lemmatizer = WordNetLemmatizer()
headlines=[]
for s in headlines3:
    stn = []
    for word in word_tokenize(s):
        stn.append(lemmatizer.lemmatize(word))
    headlines.append(' '.join(word for word in stn))
    
test_dataset = tfidf.transform(headlines)

#Update this section for each classifier
predictions = model_xg.predict(test_dataset)

#Print accuraccy
score = accuracy_score(test['Label'],predictions)
#     print("Accuracy scores :")
print("Accuracy of the model is :", score)

Accuracy of the model is : 0.7936507936507936
