# **Stock market news feed sentiment analysis** *(Sentiment ~ Predict)*

Ebben összehasonlítom a napok híreinek szentimentjét / érzelmét és a labelt, azt vizsgálva, hogy a kettő között van-e korreláció.

https://github.com/cjhutto/vaderSentiment

VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text
(by C.J. Hutto and Eric Gilbert)
Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

In [None]:
pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 14.7MB/s eta 0:00:01[K     |█████▏                          | 20kB 19.5MB/s eta 0:00:01[K     |███████▉                        | 30kB 18.1MB/s eta 0:00:01[K     |██████████▍                     | 40kB 14.4MB/s eta 0:00:01[K     |█████████████                   | 51kB 15.6MB/s eta 0:00:01[K     |███████████████▋                | 61kB 17.0MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 11.5MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 12.3MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 11.7MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 11.0MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 11.0MB/s eta 0:00:01[K     |██████████████████████████

In [None]:
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Smoke-test VADER on a single example sentence.
analyzer = SentimentIntensityAnalyzer()
sentence = "Market prediction using Sentiment Analysis of news feed"

# polarity_scores() returns a dict with the keys 'neg', 'neu', 'pos' and
# 'compound'; leaving `vs` as the last expression makes the notebook show it.
vs = analyzer.polarity_scores(sentence)
vs

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [None]:
import pandas as pd
import pandas_datareader as web
import numpy as np
import string

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset is
# copied from there in the preprocessing cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Adathalmaz beolvasása és előkészítése**

In [None]:
print("Start of the preprocess\n")

# Copy the dataset from the mounted Drive into the working directory
# (IPython shell escape, only valid inside a notebook).
!cp "/content/drive/MyDrive/Combined_News_DJIA.csv" "Combined_News_DJIA.csv"

# The headline columns are NOT merged into one text per day: each headline is
# scored separately and the day's sentiment is decided by a vote later on.
# No stop-word removal is performed.
# Load the news dataset, using its "Date" column as the index
df_combined = pd.read_csv('Combined_News_DJIA.csv', index_col = "Date")

# Load the DJIA price history for the period covered by the news dataset
# NOTE(review): relies on the "yahoo" reader of pandas_datareader being
# available - confirm it still works in the target environment.
df_stock = web.DataReader("DJIA", data_source="yahoo", start="2008-08-08", 
                          end="2016-07-01")

# Reduce the stock index from timestamps to plain dates so that str() of each
# entry can be compared with the dates used as the news index.
temp_day = [timestamp.date() for timestamp in df_stock.index]
df_stock.index = temp_day

# Positions at which the news date and the stock date disagree.
difference = [
    day
    for day, (news_date, stock_date) in enumerate(
        zip(df_combined.index, df_stock.index)
    )
    if str(news_date) != str(stock_date)
]

# zip() stops at the shorter index. Surplus rows of the longer frame have no
# counterpart, so they are recorded as mismatches instead of raising an
# IndexError as indexing both frames up to the longer length would.
shorter, longer = sorted((len(df_combined), len(df_stock)))
difference.extend(range(shorter, longer))

if not difference:
    print("The dates matched!\n")
else:
    # Make a misalignment visible; the following steps compare rows by position.
    print("Date mismatch at " + str(len(difference)) + " position(s)!\n")

# Verify the dataset labels against the price data and repair wrong ones.
# Expected label: 1 when the adjusted close rose or stayed the same compared
# with the previous trading day, 0 when it fell.
difference = []

# Plain arrays give explicit positional access; integer keys on a Series with
# a non-integer index only work through a deprecated fallback.
adj_close = df_stock["Adj Close"].to_numpy()
labels = df_combined["Label"].to_numpy()

# Start at 1: the first day has no previous close, and index -1 would silently
# compare it with the LAST day of the series.
for day in range(1, len(df_stock)):
    today = adj_close[day]
    yesterday = adj_close[day - 1]

    # Compare the real prices. Truncating them to int treats any move inside
    # the same whole point as "unchanged" and flags correctly labelled falls.
    expected_label = 1 if today >= yesterday else 0

    if labels[day] != expected_label:
        difference.append(str(df_stock.index[day]))
        print("Problem at day " + str(df_stock.index[day]))
        print("Today: " + str(today) +"\t\tYesterday: " + str(yesterday) + "\t\tLabel: " + str(labels[day]) + "\n")

# correct the wrong labels (binary label, so a wrong one is simply flipped)
for row in difference:
    df_combined.loc[row, "Label"] = 1 if df_combined.loc[row, "Label"] == 0 else 0

print("All differences: " + str(len(difference)) + "\nFixed!\n") 

# Find the rows that contain at least one NaN cell (kept for inspection)
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

# Replace every NaN with a single space so that each headline cell holds a
# string for the text processing that follows
df_combined = df_combined.replace(np.nan, " ")

# Check the process: no row may contain a NaN any more
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

# Compare by value: `is 0` tests object identity, which is an implementation
# detail for integers and triggers a SyntaxWarning on Python 3.8+.
assert len(rows_with_NaN) == 0, "NaN cells remain after replacement"

def _clean_headline(news):
    """Return *news* normalised for sentiment scoring.

    Steps: drop a leading bytes-literal marker (b' or b"), replace every
    punctuation character and every digit with a space, collapse runs of
    whitespace into single spaces and convert the text to lower case.
    """
    # Only strip the "b" when a quote follows it. Testing just the first
    # character would also eat the first letter of a headline that really
    # starts with "b", and would fail on an empty string.
    if news.startswith(('b"', "b'")):
        news = news[1:]

    # One pass over the characters instead of three quadratic concatenations.
    without_symbols = "".join(
        " " if (char in string.punctuation or char.isdigit()) else char
        for char in news
    )

    # split()/join removes leading, trailing and repeated whitespace.
    return " ".join(without_symbols.split()).lower()


# Get the column names
combined_column_names = list(df_combined.columns)

# The first column is skipped; all remaining columns are cleaned in place.
for column_name in combined_column_names[1:]:
    # update
    df_combined[column_name] = [
        _clean_headline(news) for news in df_combined[column_name]
    ]

# Show the data frame
print(df_combined.head())
print()
print(df_stock.head())

# .iloc makes the positional access explicit; an integer key on the string
# date index only works through a deprecated fallback.
print("\nFirst full sentence:\n" + str(df_combined["Top1"].iloc[0]))
# The labels live in df_combined; `df_sum_news_labels` is not defined anywhere
# in this notebook and raised a NameError here.
print("\nFirst label:\n" + str(df_combined["Label"].iloc[0]))


Start of the preprocess

The dates matched!

Problem at day 2010-10-14
Today: 11096.919921875		Yesterday: 11096.080078125		Label: 0

Problem at day 2012-11-12
Today: 12815.080078125		Yesterday: 12815.3896484375		Label: 0

Problem at day 2012-11-15
Today: 12570.9501953125		Yesterday: 12570.9501953125		Label: 0

Problem at day 2013-04-12
Today: 14865.0595703125		Yesterday: 14865.1396484375		Label: 0

Problem at day 2014-04-24
Today: 16501.650390625		Yesterday: 16501.650390625		Label: 0

Problem at day 2015-08-12
Today: 17402.509765625		Yesterday: 17402.83984375		Label: 0

Problem at day 2015-11-27
Today: 17813.390625		Yesterday: 17813.390625		Label: 0

All differences: 7
Fixed!

            Label  ...                                              Top25
Date               ...                                                   
2008-08-08      0  ...              no help for mexico s kidnapping surge
2008-08-11      1  ...  so this is what it s come to trading sex for food
2008-08-12      0 

## **Szentiment meghatározása**

In [None]:
# Score every headline with VADER and overwrite the text with a class marker:
# "1" = positive, "0" = negative, "-" = neutral.
analyzer = SentimentIntensityAnalyzer()

# Get the column names; the first column is skipped below
combined_column_names = list(df_combined.columns)

for headline_column in combined_column_names[1:]:
    temp_sentiment = []

    for news in df_combined[headline_column]:
        vs = analyzer.polarity_scores(news)
        compound = float(vs.get("compound"))

        # Classify by the compound score with a +/-0.05 neutral band
        if compound >= 0.05:
            marker = "1"
        elif compound <= -0.05:
            marker = "0"
        else:
            marker = "-"
        temp_sentiment.append(marker)

    # update
    df_combined[headline_column] = temp_sentiment

In [None]:
df_combined

Unnamed: 0_level_0,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2008-08-08,0,0,-,0,0,0,-,0,1,0,0,0,0,0,-,0,0,-,0,1,-,0,0,0,0,0
2008-08-11,1,1,0,1,0,-,0,0,0,0,0,-,1,-,0,0,0,0,0,-,0,0,-,0,0,-
2008-08-12,0,-,-,0,0,-,0,0,0,1,0,0,0,0,-,1,-,-,1,0,0,0,1,1,-,-
2008-08-13,0,0,0,0,0,0,-,1,-,1,1,1,0,0,0,-,-,-,-,0,-,0,1,0,0,0
2008-08-14,1,1,0,1,0,0,0,1,1,0,0,-,-,1,1,-,0,-,0,0,-,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-06-27,0,0,1,0,0,1,0,1,0,0,-,1,0,-,-,0,0,1,0,0,1,1,0,0,1,1
2016-06-28,1,1,0,1,1,0,0,-,1,0,-,-,-,1,0,-,1,0,-,1,1,-,1,-,0,0
2016-06-29,1,-,0,1,0,0,-,0,1,1,1,1,0,0,0,0,1,-,0,0,1,0,0,1,0,-
2016-06-30,1,-,0,1,0,0,0,0,1,0,0,1,1,0,-,0,1,-,0,0,0,0,-,0,0,0


In [None]:
# Get the column names
combined_column_names = list(df_combined.columns)
sentiment_columns = combined_column_names[1:]

# Majority vote per day over all columns after the first one:
# "1" when positive markers outnumber negative ones, "0" for the opposite,
# "-" on a tie. Neutral markers do not take part in the vote.
sent_sum = []

for day_markers in df_combined[sentiment_columns].to_numpy():
    pos_counter = sum(str(marker) == "1" for marker in day_markers)
    neg_counter = sum(str(marker) == "0" for marker in day_markers)

    if pos_counter > neg_counter:
        verdict = "1"
    elif pos_counter < neg_counter:
        verdict = "0"
    else:
        verdict = "-"
    sent_sum.append(verdict)

# Keep only the first column and attach the voted sentiment next to it
df_combined.drop(columns=sentiment_columns, inplace=True)

df_combined["Sentiment"] = sent_sum

In [None]:
df_combined

Unnamed: 0_level_0,Label,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-08-08,0,0
2008-08-11,1,0
2008-08-12,0,0
2008-08-13,0,0
2008-08-14,1,0
...,...,...
2016-06-27,0,0
2016-06-28,1,1
2016-06-29,1,0
2016-06-30,1,0


In [None]:
# Mark each day with "1" when the label and the voted sentiment have the same
# string form, otherwise "0"; `good` is the number of matching days.
match = [
    "1" if str(label) == str(sentiment) else "0"
    for label, sentiment in zip(df_combined["Label"], df_combined["Sentiment"])
]
good = match.count("1")

df_combined["Match"] = match
df_combined

Unnamed: 0_level_0,Label,Sentiment,Match
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-08-08,0,0,1
2008-08-11,1,0,0
2008-08-12,0,0,1
2008-08-13,0,0,1
2008-08-14,1,0,0
...,...,...,...
2016-06-27,0,0,1
2016-06-28,1,1,1
2016-06-29,1,0,0
2016-06-30,1,0,0


In [None]:
# Fraction of all days on which the voted sentiment equals the label.
# A neutral ("-") sentiment can never equal a label, so such days count as misses.
good/len(df_combined)

0.4555052790346908

Neutrálisok nélkül.

In [None]:
# Count the days whose voted sentiment is neutral ("-").
# Nothing is appended to `match` here: the original `else` branch added one
# "0" per non-neutral day, growing the list beyond the number of rows.
neut = sum(str(sentiment) == "-" for sentiment in df_combined["Sentiment"])

# Accuracy over the days that received a non-neutral sentiment; NaN when there
# is no such day, instead of a ZeroDivisionError.
decided_days = len(df_combined) - neut
good / decided_days if decided_days else float("nan")

0.4596651445966514