# **Stock market news feed semantic analysis** *(Baseline dataset merge)*

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import pandas_datareader as web
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize  
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Random seed
RANDOM_SEED = 1234

# Numpy random seed
NP_SEED = 1234

# Max iteration for training
MAX_ITER = 100000

# Train size
TRAIN_SPLIT = 0.85

# Test size
TEST_SPLIT = 0.15

# Shuffle cycle number for the dataframe
SHUFFLE_CYCLE = 500

In [14]:
np.random.seed(NP_SEED)

## **Adathalmazok betöltése és előkészítése**

A Reddites, egyesével, preprocessel.

In [15]:
# Copy the dataset to the local environment
!cp "/content/drive/MyDrive/Combined_News_DJIA.csv" "Combined_News_DJIA.csv"

ROWS = 1

# Load the dataset 
df_combined = pd.read_csv('Combined_News_DJIA.csv', index_col = "Date")

# Load the stock data
df_stock = web.DataReader("DJIA", data_source="yahoo", start="2008-08-08", end="2016-07-01")

# Date convert
temp_day = []

for day in range(len(df_stock)):
    temp_day.append(str(df_stock.index[day].date()))

df_stock.index = temp_day

# Labels check
difference = []

if len(df_combined) == len(df_stock):
    print("The lengths are the same!")

for day in range(max(len(df_combined), len(df_stock))):
    if str(df_combined.index[day]) != str(df_stock.index[day]):
        print("There is difference at: " + str(day) + " index")
        print("News: " + str(df_combined.index[day]) + "\tStock: " + str(df_stock.index[day]))
        difference.append(day)

if len(difference) is 0:
    print("The dates matched!")

difference = []

for day in range(len(df_stock)):
    # label should be 1 -> rise
    if int(df_stock["Adj Close"][day]) >= int(df_stock["Adj Close"][day - 1]):
        if df_combined["Label"][day] != 1:
            difference.append(str(df_stock.index[day]))
            print("Problem at day " + str(df_stock.index[day]))
            print("Today: " + str(df_stock["Adj Close"][day]) +"\t\tYesterday: " + str(df_stock["Adj Close"][day - 1]) + "\t\tLabel: " + str(df_combined["Label"][day]) + "\n")

    # label should be 0 -> fall
    if int(df_stock["Adj Close"][day]) < int(df_stock["Adj Close"][day - 1]):
        if df_combined["Label"][day] != 0:
            difference.append(str(df_stock.index[day]))
            print("Problem at day " + str(df_stock.index[day]))
            print("Today: " + str(df_stock["Adj Close"][day]) +"\t\tYesterday: " + str(df_stock["Adj Close"][day - 1]) + "\t\tLabel: " + str(df_combined["Label"][day]) + "\n")

print("All differences: " + str(len(difference)))   

# correct the wrong labels
for row in difference:
    if df_combined.loc[row, "Label"] == 0:
        df_combined.loc[row, "Label"] = 1
    else:
        df_combined.loc[row, "Label"] = 0

# check them
for row in difference:
    print(str(row) + "\t\t" + str(df_combined.loc[row, "Label"]))  

# Find the cells with NaN and after the rows for them
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

# Replace them
df_combined = df_combined.replace(np.nan, " ")

# Check the process
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

assert len(rows_with_NaN) is 0 

# Get column names
combined_column_names = []
for column in df_combined.columns:
  combined_column_names.append(column)

# 2D array creation for the news based on macros
COLUMNS = len(df_combined)
news_sum = [[0 for i in range(COLUMNS)] for j in range(int((len(combined_column_names) - 1) / ROWS))]  

# Merge the news
for row in range(len(df_combined)):
  for column in range(int((len(combined_column_names) - 1) / ROWS)):
    temp = ""
    news = ""
    for word in range(ROWS):
      news = df_combined[combined_column_names[(column * ROWS) + (word + 1)]][row]
      # Remove the b character at the begining of the string
      if news[0] is "b":
        news = " " + news[1:]
      temp = temp + " " + news
    news_sum[column][row] = temp

# Drop the old columns
for column in range(len(combined_column_names) - 1):
  df_combined.drop(combined_column_names[column + 1], axis = 1, inplace = True)

# Create the new columns with the merged news
for column in range(int((len(combined_column_names) - 1) / ROWS)):
  colum_name = "News_" + str(column + 1)
  df_combined[colum_name] = news_sum[column]

# The label column 
LABEL_COLUMN = 0

news_sum = []
label_sum = []

# Get the column names
combined_column_names = []
for column in df_combined.columns:
  combined_column_names.append(column)

# Write out the column names 
print(combined_column_names)
print("\n")

# Connect the merged news with the labels
for column in range(len(df_combined)):
  for row in range(len(combined_column_names) - 1):
    news_sum.append(df_combined[combined_column_names[row + 1]][column])
    label_sum.append(df_combined[combined_column_names[LABEL_COLUMN]][column])

# Create the new DataFrame
df_sum_news_labels = pd.DataFrame(data = label_sum, index = None, columns = ["Label"])
df_sum_news_labels["News"] = news_sum

# Removing punctuations
temp_news = []
for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if word not in string.punctuation:
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

news_sum = temp_news
temp_news = []

# Remove numbers
for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if not word.isdigit():
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

# Remove space
for line in range(len(temp_news)):    
  temp_news[line] = " ".join(temp_news[line].split())

# Converting headlines to lower case
for line in range(len(temp_news)): 
    temp_news[line] = temp_news[line].lower()

# Update the data frame
df_sum_news_labels["News"] = temp_news

# Load the stop words
stop_words = set(stopwords.words('english'))

filtered_sentence = []
news_sum = df_sum_news_labels["News"]

# Remove stop words
for line in news_sum:
  word_tokens = word_tokenize(line)
  temp_attach = ""
  for word in word_tokens:
    temp = " "
    if not word in stop_words:
      temp = temp + word
    temp_attach = temp_attach + "".join(temp)
  filtered_sentence.append(temp_attach)

# Remove space
for line in range(len(filtered_sentence)):    
  filtered_sentence[line] = " ".join(filtered_sentence[line].split())

# Update the data frame
df_sum_news_labels["News"] = filtered_sentence

news_sum = df_sum_news_labels["News"]
null_indexes = []
index = 0

for line in news_sum:
  if line is "":
    null_indexes.append(index)
  index = index + 1

print(null_indexes)

for row in null_indexes:
  df_sum_news_labels = df_sum_news_labels.drop(row)

news_sum = df_sum_news_labels["News"]
null_indexes = []
index = 0

for line in news_sum:
  if line is "":
    null_indexes.append(index)
  index = index + 1
  
assert len(null_indexes) is 0

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  df_sum_news_labels = shuffle(df_sum_news_labels, random_state = RANDOM_SEED)

# Reset the index
df_sum_news_labels.reset_index(inplace=True, drop=True)

INPUT_SIZE = len(df_sum_news_labels)
TRAIN_SIZE = int(TRAIN_SPLIT * INPUT_SIZE) 
TEST_SIZE = int(TEST_SPLIT * INPUT_SIZE)

# Split the dataset
train = df_sum_news_labels[:TRAIN_SIZE] 
test = df_sum_news_labels[TRAIN_SIZE:]

# Print out the length
print("Train data set length: " + str(len(train)))
print("Test data set length: " + str(len(test)))
print("Split summa: " + str(len(train) + len(test)))
print("Dataset summa before split: " + str(len(df_sum_news_labels)))

# check
split_sum = len(train) + len(test)
sum = len(df_sum_news_labels)
assert split_sum == sum

The lengths are the same!
The dates matched!
Problem at day 2010-10-14
Today: 11096.919921875		Yesterday: 11096.080078125		Label: 0

Problem at day 2012-11-12
Today: 12815.080078125		Yesterday: 12815.3896484375		Label: 0

Problem at day 2012-11-15
Today: 12570.9501953125		Yesterday: 12570.9501953125		Label: 0

Problem at day 2013-04-12
Today: 14865.0595703125		Yesterday: 14865.1396484375		Label: 0

Problem at day 2014-04-24
Today: 16501.650390625		Yesterday: 16501.650390625		Label: 0

Problem at day 2015-08-12
Today: 17402.509765625		Yesterday: 17402.83984375		Label: 0

Problem at day 2015-11-27
Today: 17813.390625		Yesterday: 17813.390625		Label: 0

All differences: 7
2010-10-14		1
2012-11-12		1
2012-11-15		1
2013-04-12		1
2014-04-24		1
2015-08-12		1
2015-11-27		1
['Label', 'News_1', 'News_2', 'News_3', 'News_4', 'News_5', 'News_6', 'News_7', 'News_8', 'News_9', 'News_10', 'News_11', 'News_12', 'News_13', 'News_14', 'News_15', 'News_16', 'News_17', 'News_18', 'News_19', 'News_20', 'Ne

In [16]:
train

Unnamed: 0,Label,News
0,1,talk like stupid sharia law uk
1,0,afghan woman killed giving birth girl
2,1,isis suspect turkish intelligence agency mt he...
3,1,american citizen living ossetia blames u georg...
4,1,anyone using google maps smartphone working su...
...,...,...
42254,1,false flag one men arrested illegally buying m...
42255,1,least dead cairo peaceful christian protest at...
42256,0,dna profiles innocent kept years despite europ...
42257,0,afghanistan march big picture


In [17]:
test

Unnamed: 0,Label,News
42259,0,daily kos tunisians thank anonymous north afri...
42260,0,china blocks guardian
42261,1,tiny island nation controls vast area pacific ...
42262,1,greek austerity moves leave nation economic sa...
42263,0,spanish prostitutes ordered wear reflective ve...
...,...,...
49712,0,forget prism global cyberchiefs meeting israel...
49713,1,bahrain expels u teacher inciting hatred royal...
49714,1,israel warns brazil accept settler ambassador
49715,1,oktoberfest pics


Az ECO datasetekkel bővítem a train adathalmazt.

In [18]:
# Copy the dataset to the local environment
!cp "/content/drive/MyDrive/Economist/ECO_BSN_DF.csv" "ECO_BSN_DF.csv"
!cp "/content/drive/MyDrive/Economist/ECO_FNC_DF.csv" "ECO_FNC_DF.csv"
!cp "/content/drive/MyDrive/Economist/ECO_US_DF.csv" "ECO_US_DF.csv"

# Load the datasets 
df_bsn = pd.read_csv('ECO_BSN_DF.csv', index_col = "date")
df_fnc = pd.read_csv('ECO_FNC_DF.csv', index_col = "date")
df_us = pd.read_csv('ECO_US_DF.csv', index_col = "date")

# Load the stock data
df_stock = web.DataReader("DJIA", data_source="yahoo", start="2008-08-08", end="2016-07-01")

# Sort
df_bsn_inspect = df_bsn[df_bsn.index < '2016/07/02']
df_bsn_inspect = df_bsn_inspect[df_bsn_inspect.index > '2008/08/07']
df_bsn_inspect = df_bsn_inspect.drop_duplicates()

df_fnc_inspect = df_fnc[df_fnc.index < '2016/07/02']
df_fnc_inspect = df_fnc_inspect[df_fnc_inspect.index > '2008/08/07']
df_fnc_inspect = df_fnc_inspect.drop_duplicates()

df_us_inspect = df_us[df_us.index < '2016/07/02']
df_us_inspect = df_us_inspect[df_us_inspect.index > '2008/08/07']
df_us_inspect = df_us_inspect.drop_duplicates()

# Merge
df_eco_all = pd.concat([df_bsn_inspect, df_fnc_inspect, df_us_inspect])

df_eco_all = df_eco_all.drop_duplicates()

df_eco_all.sort_index(ascending=True, inplace=True)

days = []
stock_days = []
wrong_days = []

# Create dates and remove duplicates
for day in range(len(df_eco_all.index)):
    if day == 0:
        days.append(str(df_eco_all.index[day]))
    elif df_eco_all.index[day] != days[len(days) - 1]:
        days.append(str(df_eco_all.index[day]))

# Drop not needed days
for day in range(len(df_stock.index)):
    stock_days.append(str(df_stock.index[day])[0:10].replace("-","/"))

# Remove not relevant date
good_days = []
for day in days:
    try:
        if stock_days.index(day):
            good_days.append(str(day))
    except:
        wrong_days.append(str(day))

label_eco = []
date_label_eco =[]
title_label_eco = []

for day in range(len(good_days)):
    if day == 0:
        title_label_eco.append(df_eco_all["title"][good_days[day]])
        label_eco.append(0)
        date_label_eco.append(good_days[day])      
    # label should be 1 -> rise
    elif int(df_stock["Adj Close"][stock_days.index(good_days[day])]) >= int(df_stock["Adj Close"][stock_days.index(good_days[day]) - 1]):   
        if isinstance(df_eco_all["title"][good_days[day]], str) is False:
            for row in df_eco_all["title"][good_days[day]]:
                title_label_eco.append(row)
                label_eco.append(1)
                date_label_eco.append(good_days[day])
        else:
                title_label_eco.append(df_eco_all["title"][good_days[day]])
                label_eco.append(1)
                date_label_eco.append(good_days[day])

    # label should be 0 -> fall
    elif int(df_stock["Adj Close"][stock_days.index(good_days[day])]) < int(df_stock["Adj Close"][stock_days.index(good_days[day]) - 1]):   
        if isinstance(df_eco_all["title"][good_days[day]], str) is False:
            for row in df_eco_all["title"][good_days[day]]:
                title_label_eco.append(row)
                label_eco.append(0)
                date_label_eco.append(good_days[day])
        else:
                title_label_eco.append(df_eco_all["title"][good_days[day]])
                label_eco.append(0)
                date_label_eco.append(good_days[day])

# Create dataset
df_eco = pd.DataFrame()
df_eco["date"] = date_label_eco
df_eco["label"] = label_eco
df_eco["title"] = title_label_eco
df_eco.set_index("date", inplace=True)
df_eco.sort_index(ascending=True, inplace=True)
print(df_eco.head())
print(len(df_eco))

# drop duplicates
df_eco.drop_duplicates(subset="title", inplace=True)

# Removing punctuations
temp_news = []
news_sum = df_eco["title"]

for line in news_sum:
  temp_attach = ""
  try:
      for word in line:
        temp = " "
        if word not in string.punctuation:
          temp = word
        temp_attach = temp_attach + "".join(temp)
  except:
      temp = " "
      temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

news_sum = temp_news
temp_news = []

# Remove numbers
for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if not word.isdigit():
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

# Remove space
for line in range(len(temp_news)):    
  temp_news[line] = " ".join(temp_news[line].split())

# Converting headlines to lower case
for line in range(len(temp_news)): 
    temp_news[line] = temp_news[line].lower()

# Update the data frame
df_eco["title"] = temp_news

# Load the stop words
stop_words = set(stopwords.words('english'))

filtered_sentence = []
news_sum = df_eco["title"]

# Remove stop words
for line in news_sum:
  word_tokens = word_tokenize(line)
  temp_attach = ""
  for word in word_tokens:
    temp = " "
    if not word in stop_words:
      temp = temp + word
    temp_attach = temp_attach + "".join(temp)
  filtered_sentence.append(temp_attach)

# Remove space
for line in range(len(filtered_sentence)):    
  filtered_sentence[line] = " ".join(filtered_sentence[line].split())

# Update the data frame
df_eco["title"] = filtered_sentence

# Reset the index
df_eco.reset_index(inplace=True)

news_sum = df_eco["title"]
null_indexes = []
index = 0

for line in news_sum:
  if line is "":
    null_indexes.append(index)
  index = index + 1

print(null_indexes)

for row in range(len(null_indexes)):
  df_eco = df_eco.drop(df_eco.index[null_indexes[row] - row])

news_sum = df_eco["title"]
null_indexes = []
index = 0

for line in news_sum:
  if line is "":
    null_indexes.append(index)
  index = index + 1
  
assert len(null_indexes) is 0

# Drop the dates
df_eco_label_title = pd.DataFrame()
df_eco_label_title["label"] = df_eco["label"]
df_eco_label_title["title"] = df_eco["title"]
print("New dataset without the dates")
print(df_eco_label_title.head())
print(len(df_eco_label_title))

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  df_eco_label_title = shuffle(df_eco_label_title, random_state = RANDOM_SEED)

# Reset the index
df_eco_label_title.reset_index(inplace=True, drop=True)

            label                          title
date                                            
2008/08/12      0  Come fly the fee-filled skies
2008/08/14      1              Kicked in the ARS
2008/08/14      1          The next Billy Graham
2008/08/14      1     Another inconvenient truth
2008/08/14      1                 Phantom menace
5069
[79, 188, 722, 1417, 1983, 2716, 3105, 3214, 3855, 4094, 4335, 4950]
New dataset without the dates
   label                       title
0      0   come fly fee filled skies
1      1                  kicked ars
2      1           next billy graham
3      1  another inconvenient truth
4      1              phantom menace
5057


In [19]:
df_eco = pd.DataFrame()
df_eco["Label"] = df_eco_label_title["label"]
df_eco["News"] = df_eco_label_title["title"]
df_eco

Unnamed: 0,Label,News
0,0,close power
1,0,russian bears
2,1,money shot
3,1,stuck neutral
4,1,one listens jürgen grossmann
...,...,...
5052,1,called account
5053,0,revving bases
5054,0,unsolved
5055,1,china price


In [20]:
train_eco = pd.concat([train, df_eco])
train_eco

Unnamed: 0,Label,News
0,1,talk like stupid sharia law uk
1,0,afghan woman killed giving birth girl
2,1,isis suspect turkish intelligence agency mt he...
3,1,american citizen living ossetia blames u georg...
4,1,anyone using google maps smartphone working su...
...,...,...
5052,1,called account
5053,0,revving bases
5054,0,unsolved
5055,1,china price


A Benzingés data halmazzal bővítés.

In [24]:
# Copy the dataset to the local environment
!cp "/content/drive/MyDrive/Kaggle dataset/Benzinga news with ticker/KAG_BENZ_ANALYST_DF_1.csv" "KAG_BENZ_ANALYST_DF_1.csv"
!cp "/content/drive/MyDrive/Kaggle dataset/Benzinga news with ticker/KAG_BENZ_ANALYST_DF_2.csv" "KAG_BENZ_ANALYST_DF_2.csv"
!cp "/content/drive/MyDrive/Kaggle dataset/Benzinga news with ticker/KAG_BENZ_PARTNER_DF_1.csv" "KAG_BENZ_PARTNER_DF_1.csv"
!cp "/content/drive/MyDrive/Kaggle dataset/Benzinga news with ticker/KAG_BENZ_PARTNER_DF_2.csv" "KAG_BENZ_PARTNER_DF_2.csv"

# Load the datasets 
df_benz_1 = pd.read_csv('KAG_BENZ_ANALYST_DF_1.csv', index_col = "date")
df_benz_2 = pd.read_csv('KAG_BENZ_ANALYST_DF_2.csv', index_col = "date")    
df_partner_1 = pd.read_csv('KAG_BENZ_PARTNER_DF_1.csv', index_col = "date")
df_partner_2 = pd.read_csv('KAG_BENZ_PARTNER_DF_2.csv', index_col = "date")

# Load the stock data
df_stock = web.DataReader("DJIA", data_source="yahoo", start="2008-08-08", 
                          end="2016-07-01")

# Merge them
df_benz = pd.concat([df_benz_1, df_benz_2])
df_partner = pd.concat([df_partner_1, df_partner_2])

df_benz_inspect = df_benz[df_benz.index < '2016/07/02']
df_benz_inspect = df_benz_inspect[df_benz_inspect.index > '2008/08/07']
df_benz_inspect = df_benz_inspect.drop_duplicates()

df_partner_inspect = df_partner[df_partner.index < '2016/07/02']
df_partner_inspect = df_partner_inspect[df_partner_inspect.index > '2008/08/07']
df_partner_inspect = df_partner_inspect.drop_duplicates()

df_benz = pd.concat([df_benz_inspect, df_partner_inspect])

tickers = []
unique_count = []

# The stock tickers which is needed
stock_ticker = ["PG", "MMM", "IBM", "MRK", "AXP", "MCD", "BA", "KO", "CAT", "JPM",
                "DIS", "JNJ", "WMT", "HD", "INTC", "MSFT", "VZ", "CVX", "CSCO",
                "TRV", "UNH", "GS", "NKE", "V"]

df_benz_filtered = pd.DataFrame()

for stock in stock_ticker:
    df_temp = df_benz[(df_benz["stock"]) == stock].drop_duplicates()
    df_benz_filtered = pd.concat([df_benz_filtered, df_temp])

df_benz_filtered.sort_index(ascending=True, inplace=True)

df_benz = df_benz_filtered
df_benz.drop("stock", axis = 1, inplace = True)
df_benz.drop_duplicates(inplace=True)

# Removing punctuations
temp_news = []
news_sum = df_benz["headline"]

for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if word not in string.punctuation:
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

news_sum = temp_news
temp_news = []

# Remove numbers
for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if not word.isdigit():
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

# Remove space
for line in range(len(temp_news)):    
  temp_news[line] = " ".join(temp_news[line].split())

# Converting headlines to lower case
for line in range(len(temp_news)): 
    temp_news[line] = temp_news[line].lower()

# Update the data frame
df_benz["headline"] = temp_news

# Load the stop words
stop_words = set(stopwords.words('english'))

filtered_sentence = []
news_sum = df_benz["headline"]

# Remove stop words
for line in news_sum:
  word_tokens = word_tokenize(line)
  temp_attach = ""
  for word in word_tokens:
    temp = " "
    if not word in stop_words:
      temp = temp + word
    temp_attach = temp_attach + "".join(temp)
  filtered_sentence.append(temp_attach)

# Remove space
for line in range(len(filtered_sentence)):    
  filtered_sentence[line] = " ".join(filtered_sentence[line].split())

# Update the data frame
df_benz["headline"] = filtered_sentence

days = []
stock_days = []
wrong_days = []

# Create dates and remove duplicates
for day in range(len(df_benz.index)):
    temp = str(df_benz.index[day])[0:10].replace("-","/")
    if day == 0:
        days.append(temp)
    elif df_benz.index[day] != df_benz.index[day - 1]:
        days.append(temp)

# Update the dataframe date column
df_benz.reset_index(inplace=True)
temp_days = df_benz["date"]
days_to_update = []
for date in range(len(temp_days)):
    temp = str(temp_days[date])[0:10].replace("-","/")
    days_to_update.append(temp)

df_benz["date"] = days_to_update
df_benz.set_index("date", inplace=True, drop=True)    

# Drop not needed days
for day in range(len(df_stock.index)):
    stock_days.append(str(df_stock.index[day])[0:10].replace("-","/"))

# Remove not relevant date
good_days = []
for day in days:
    try:
        if stock_days.index(day):
            good_days.append(str(day))
    except:
        wrong_days.append(str(day))

label_benz = []
date_label_benz =[]
title_label_benz = []

for day in range(len(good_days)):
    if day == 0:
        title_label_benz.append(df_benz["headline"][good_days[day]])
        label_benz.append(0)
        date_label_benz.append(good_days[day])      
    # label should be 1 -> rise
    elif int(df_stock["Adj Close"][stock_days.index(good_days[day])]) >= int(df_stock["Adj Close"][stock_days.index(good_days[day]) - 1]):   
        if isinstance(df_benz["headline"][good_days[day]], str) is False:
            for row in df_benz["headline"][good_days[day]]:
                title_label_benz.append(row)
                label_benz.append(1)
                date_label_benz.append(good_days[day])
        else:
                title_label_benz.append(df_benz["headline"][good_days[day]])
                label_benz.append(1)
                date_label_benz.append(good_days[day])

    # label should be 0 -> fall
    elif int(df_stock["Adj Close"][stock_days.index(good_days[day])]) < int(df_stock["Adj Close"][stock_days.index(good_days[day]) - 1]):   
        if isinstance(df_benz["headline"][good_days[day]], str) is False:
            for row in df_benz["headline"][good_days[day]]:
                title_label_benz.append(row)
                label_benz.append(0)
                date_label_benz.append(good_days[day])
        else:
                title_label_benz.append(df_benz["headline"][good_days[day]])
                label_benz.append(0)
                date_label_benz.append(good_days[day])

df_benz_temp = pd.DataFrame()
df_benz_temp["date"] = date_label_benz
df_benz_temp["label"] = label_benz
df_benz_temp["title"] = title_label_benz
df_benz_temp.set_index("date", inplace=True)
df_benz_temp.sort_index(ascending=True, inplace=True)

# Drop the dates
df_benz = pd.DataFrame()
df_benz["Label"] = df_benz_temp["label"]
df_benz["News"] = df_benz_temp["title"]
# Reset the index
df_benz.reset_index(inplace=True, drop=True)

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  df_benz = shuffle(df_benz, random_state = RANDOM_SEED)

# Reset the index
df_benz.reset_index(inplace=True, drop=True)

train_benz = pd.concat([train, df_benz])

In [25]:
train_benz

Unnamed: 0,Label,News
0,1,talk like stupid sharia law uk
1,0,afghan woman killed giving birth girl
2,1,isis suspect turkish intelligence agency mt he...
3,1,american citizen living ossetia blames u georg...
4,1,anyone using google maps smartphone working su...
...,...,...
12916,1,update johnson johnson announces definitive ag...
12917,1,mid day market update green mountain coffee sh...
12918,1,building something construction etf
12919,1,positive surprises negative guidance earnings ...


A három egyesítése.

In [26]:
train_all = pd.concat([train, df_eco, df_benz])
train_all

Unnamed: 0,Label,News
0,1,talk like stupid sharia law uk
1,0,afghan woman killed giving birth girl
2,1,isis suspect turkish intelligence agency mt he...
3,1,american citizen living ossetia blames u georg...
4,1,anyone using google maps smartphone working su...
...,...,...
12916,1,update johnson johnson announces definitive ag...
12917,1,mid day market update green mountain coffee sh...
12918,1,building something construction etf
12919,1,positive surprises negative guidance earnings ...


Az adathalmazok megkeverése és a hírek összefűzése az eredmények összehasonlításához.

Először az ECO-val bővített esetében.

In [43]:
# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train_eco = shuffle(train_eco, random_state = RANDOM_SEED)

# Reset the index
train_eco.reset_index(inplace=True, drop=True)

pos_label_news = []
neg_label_news = []

for row in range(len(train_eco)):
    if str(train_eco["Label"][row]) == "0":
        neg_label_news.append(str(train_eco["News"][row]))
    elif str(train_eco["Label"][row]) == "1":
        pos_label_news.append(str(train_eco["News"][row]))
    else:
        pass

print(len(pos_label_news))
print(len(neg_label_news))

pos_train = pd.DataFrame()
neg_train = pd.DataFrame()

pos_labels = []
for row in range(len(pos_label_news)):
    pos_labels.append("1")

pos_train["Label"] = pos_labels 
pos_train["News"] = pos_label_news 


neg_labels = []
for row in range(len(neg_label_news)):
    neg_labels.append("0")

neg_train["Label"] = neg_labels 
neg_train["News"] = neg_label_news 

# Neg merge
neg_merged_news = []
neg_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(neg_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + neg_train["News"][i* 8 + j]
    neg_merged_news.append(temp_news)
    neg_merged_labels.append(0)

neg_merged_df = pd.DataFrame()
neg_merged_df["Label"] = neg_merged_labels
neg_merged_df["News"] = neg_merged_news

# Pos merge
pos_merged_news = []
pos_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(pos_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + pos_train["News"][i* 8 + j]
    pos_merged_news.append(temp_news)
    pos_merged_labels.append(1)

pos_merged_df = pd.DataFrame()
pos_merged_df["Label"] = pos_merged_labels
pos_merged_df["News"] = pos_merged_news

# All merge
train_eco = pd.concat([pos_merged_df, neg_merged_df])

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train_eco = shuffle(train_eco, random_state = RANDOM_SEED)

# Reset the index
train_eco.reset_index(inplace=True, drop=True)

# Show the data frame
train_eco

3202
2712


Unnamed: 0,Label,News
0,1,slaves next door domestic slavery alive well...
1,1,archaeologist says prehistoric town unearthe...
2,1,smother invention syria ceasefire start febr...
3,0,kerry war ukraine want diplomatic solution c...
4,1,enters palestinian camp damascus jordan clos...
...,...,...
734,0,end compassionate conservatism david cameron...
735,1,year old german woman celebrates birthday co...
736,0,wikileaks cable leak guardian han chinese ba...
737,1,end lies george bush tell canadian hitman ad...


In [35]:
train_eco.groupby(["Label"]).describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2712,2712,international criminal court recognises rape ...,1
1,3202,3202,billion inhabitants india find anyone hangman...,1


Benzingás.

In [44]:
# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train_benz = shuffle(train_benz, random_state = RANDOM_SEED)

# Reset the index
train_benz.reset_index(inplace=True, drop=True)

pos_label_news = []
neg_label_news = []

for row in range(len(train_benz)):
    if str(train_benz["Label"][row]) == "0":
        neg_label_news.append(str(train_benz["News"][row]))
    elif str(train_benz["Label"][row]) == "1":
        pos_label_news.append(str(train_benz["News"][row]))
    else:
        pass

print(len(pos_label_news))
print(len(neg_label_news))

pos_train = pd.DataFrame()
neg_train = pd.DataFrame()

pos_labels = []
for row in range(len(pos_label_news)):
    pos_labels.append("1")

pos_train["Label"] = pos_labels 
pos_train["News"] = pos_label_news 


neg_labels = []
for row in range(len(neg_label_news)):
    neg_labels.append("0")

neg_train["Label"] = neg_labels 
neg_train["News"] = neg_label_news 

# Neg merge
neg_merged_news = []
neg_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(neg_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + neg_train["News"][i* 8 + j]
    neg_merged_news.append(temp_news)
    neg_merged_labels.append(0)

neg_merged_df = pd.DataFrame()
neg_merged_df["Label"] = neg_merged_labels
neg_merged_df["News"] = neg_merged_news

# Pos merge
pos_merged_news = []
pos_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(pos_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + pos_train["News"][i* 8 + j]
    pos_merged_news.append(temp_news)
    pos_merged_labels.append(1)

pos_merged_df = pd.DataFrame()
pos_merged_df["Label"] = pos_merged_labels
pos_merged_df["News"] = pos_merged_news

# All merge
train_benz = pd.concat([pos_merged_df, neg_merged_df])

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train_benz = shuffle(train_benz, random_state = RANDOM_SEED)

# Reset the index
train_benz.reset_index(inplace=True, drop=True)

# Show the data frame
train_benz

3721
3175


Unnamed: 0,Label,News
0,1,chinese banks hiding mother debt bombs china...
1,1,burning fireball seen coast canadian maritim...
2,0,israeli navy attacking civilian mercy ship u...
3,1,russia bend standoff west putin says rotherh...
4,0,piper jaffray discontinues coverage isph ato...
...,...,...
856,0,british police commissioner resigns due phon...
857,1,un confirms hezbollah fighting assad syria a...
858,1,oppenheimer prospects still bright home depo...
859,1,frequent flyer taliban senior leader mansour...


In [45]:
train_benz.groupby(["Label"]).describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,396,396,devote life spewing anti islamic hatred surp...,1
1,465,465,copenhagen climate change talks must fail sa...,1


Az összesített.

In [46]:
# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train_all = shuffle(train_all, random_state = RANDOM_SEED)

# Reset the index
train_all.reset_index(inplace=True, drop=True)

pos_label_news = []
neg_label_news = []

for row in range(len(train_all)):
    if str(train_all["Label"][row]) == "0":
        neg_label_news.append(str(train_all["News"][row]))
    elif str(train_all["Label"][row]) == "1":
        pos_label_news.append(str(train_all["News"][row]))
    else:
        pass

print(len(pos_label_news))
print(len(neg_label_news))

pos_train = pd.DataFrame()
neg_train = pd.DataFrame()

pos_labels = []
for row in range(len(pos_label_news)):
    pos_labels.append("1")

pos_train["News"] = pos_label_news 
pos_train["Label"] = pos_labels 


neg_labels = []
for row in range(len(neg_label_news)):
    neg_labels.append("0")

neg_train["Label"] = neg_labels 
neg_train["News"] = neg_label_news 

# Neg merge
neg_merged_news = []
neg_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(neg_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + neg_train["News"][i* 8 + j]
    neg_merged_news.append(temp_news)
    neg_merged_labels.append(0)

neg_merged_df = pd.DataFrame()
neg_merged_df["Label"] = neg_merged_labels
neg_merged_df["News"] = neg_merged_news

# Pos merge
pos_merged_news = []
pos_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(pos_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + pos_train["News"][i* 8 + j]
    pos_merged_news.append(temp_news)
    pos_merged_labels.append(1)

pos_merged_df = pd.DataFrame()
pos_merged_df["Label"] = pos_merged_labels
pos_merged_df["News"] = pos_merged_news

# All merge
train_all = pd.concat([pos_merged_df, neg_merged_df])

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train_all = shuffle(train_all, random_state = RANDOM_SEED)

# Reset the index
train_all.reset_index(inplace=True, drop=True)

# Show the data frame
train_all

4077
3452


Unnamed: 0,Label,News
0,1,popping question nigeria cease exist decembe...
1,1,update drexel hamilton raises travelers pt r...
2,1,appear sam bacile man behind anti islam film...
3,1,fed keeps interest rates zero afraid unless ...
4,1,mohamed elbaradei says ready assume power eg...
...,...,...
935,1,isis militants blow ancient arch triumph pal...
936,1,update russian regulator says conducting saf...
937,0,ukraine crisis us recognise crimea referendu...
938,1,volcano erupts glacier iceland hundreds evac...


In [47]:
train_all.groupby(["Label"]).describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,431,431,europe top human rights court ruled poland v...,1
1,509,509,iran plane black boxes damaged p g vice chai...,1


Az alap halmaz összefűzése.

In [48]:
# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train = shuffle(train, random_state = RANDOM_SEED)

# Reset the index
train.reset_index(inplace=True, drop=True)

pos_label_news = []
neg_label_news = []

for row in range(len(train)):
    if str(train["Label"][row]) == "0":
        neg_label_news.append(str(train["News"][row]))
    elif str(train["Label"][row]) == "1":
        pos_label_news.append(str(train["News"][row]))
    else:
        pass

print(len(pos_label_news))
print(len(neg_label_news))

pos_train = pd.DataFrame()
neg_train = pd.DataFrame()

pos_labels = []
for row in range(len(pos_label_news)):
    pos_labels.append("1")

pos_train["Label"] = pos_labels 
pos_train["News"] = pos_label_news 


neg_labels = []
for row in range(len(neg_label_news)):
    neg_labels.append("0")

neg_train["Label"] = neg_labels 
neg_train["News"] = neg_label_news 

# Neg merge
neg_merged_news = []
neg_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(neg_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + neg_train["News"][i* 8 + j]
    neg_merged_news.append(temp_news)
    neg_merged_labels.append(0)

neg_merged_df = pd.DataFrame()
neg_merged_df["Label"] = neg_merged_labels
neg_merged_df["News"] = neg_merged_news

# Pos merge
pos_merged_news = []
pos_merged_labels = []
in_rows_counter = 0
merged_counter = 0

for i in range(int(len(pos_train) / 8)):
    temp_news = ""
    for j in range(8): #0,1...7
        temp_news = temp_news + " " + pos_train["News"][i* 8 + j]
    pos_merged_news.append(temp_news)
    pos_merged_labels.append(1)

pos_merged_df = pd.DataFrame()
pos_merged_df["Label"] = pos_merged_labels
pos_merged_df["News"] = pos_merged_news

# All merge
train = pd.concat([pos_merged_df, neg_merged_df])

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  train = shuffle(train, random_state = RANDOM_SEED)

# Reset the index
train.reset_index(inplace=True, drop=True)

# Show the data frame
train

2846
2435


Unnamed: 0,Label,News
0,0,petraeus suggests afghans burned children ex...
1,1,giant buddha found afghan site uk denies jul...
2,0,doctor helped track osama jailed physician h...
3,1,north korea scraps south korea military safe...
4,1,first unlooted royal tomb kind unearthed per...
...,...,...
654,0,egypt women brandish knives sex assault prot...
655,1,india joins u effort stifle iran trade south...
656,0,official banks europe may seize deposits cov...
657,0,french authorities investigate alleged posio...


In [49]:
train.groupby(["Label"]).describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,304,304,iran says longer afford ahmadinejad cash han...,1
1,355,355,u accuses soccer officials decades rampant s...,1


##**A modellek futtatása**

In [102]:
datasets = [train, train_eco, train_benz, train_all]
index = ["basic", "w Eco", "w Benz", "w All"]
result_accuraccy = []
result_coeff = []

for dataset_index in range(len(index)):
    MODEL_TYPE = str("5,5")
    print(str(index[dataset_index]) + "\t" + str(MODEL_TYPE) +  " n-gram logreg model\n")

    train_headlines = []
    test_headlines = []

    for row in range(0, len(datasets[dataset_index].index)):
        train_headlines.append(datasets[dataset_index].iloc[row, 1])

    for row in range(0,len(test.index)):
        test_headlines.append(test.iloc[row, 1])

    # show the first
    print(train_headlines[0] + "\n")

    _gram_vectorizer_ = CountVectorizer(ngram_range=(int(MODEL_TYPE[0]),int(MODEL_TYPE[2])))
    _train_vectorizer_ = _gram_vectorizer_.fit_transform(train_headlines)

    print("The shape is: " + str(_train_vectorizer_.shape) + "\n")

    _gram_model_ = LogisticRegression(random_state=RANDOM_SEED, max_iter=MAX_ITER)
    _gram_model_ = _gram_model_.fit(_train_vectorizer_, datasets[dataset_index]["Label"])

    _gram_test_ = _gram_vectorizer_.transform(test_headlines)
    _gram_predictions_ = _gram_model_.predict(_gram_test_)

    result_accuraccy.append(accuracy_score(test["Label"], _gram_predictions_))

    _gram_words_best_ = _gram_vectorizer_.get_feature_names()
    _gram_coeffs_best_ = _gram_model_.coef_.tolist()[0]

    coeffdf = pd.DataFrame({'Word' : _gram_words_best_, 
                            'Coefficient' : _gram_coeffs_best_})

    coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])

    result_coeff.append(coeffdf)

basic	5,5 n-gram logreg model

  petraeus suggests afghans burned children exaggerate civilian casualty claims nasa building largest rocket time launch turkish police started arresting people tweeting facebooking protests local news cnn turk needs translation indigenous groups keystone xl pipeline cross lands native american communities promising fierce resistance stop transcanada building amp president barack obama permitting northern leg keystone japan nuclear body says radioactive water fukushima emergency food riots break algeria china trying failing censor citizens want answers tianjin explosion china three gorges dam close maximum capacity heavy rains persist chinese officials years boasted dam could withstand floods severe come every years  india gets new view us collection agents call centers hear tales woe land whose lifestyles idealized afghan police marksmanship ak always poor u contractors failed train adjust sights missile parts mh crash site gazans still without tap water

Az eredmények kiértékelése.

In [103]:
for dataset_index in range(len(index)):
    print(str(index[dataset_index]) + ":\t\t\t" + str(result_accuraccy[dataset_index]) + "\n")

basic:			0.5386162510056315

w Eco:			0.5386162510056315

w Benz:			0.5386162510056315

w All:			0.5386162510056315



In [65]:
for dataset_index in range(len(index)):
    print(str(index[dataset_index]) + ":\n" + 
          str(result_coeff[dataset_index].head(10)) + "\n" + 
          str(result_coeff[dataset_index].tail(10))+ "\n\n")

basic:
             Word  Coefficient
8525        egypt     0.314796
639           air     0.311879
22304      rebels     0.251282
14329        jews     0.248128
20890  population     0.247687
18869      number     0.243741
17711     mubarak     0.239704
25141       since     0.229664
16867         men     0.228331
28611          uk     0.227752
            Word  Coefficient
15988        low    -0.219907
1866   authority    -0.221489
30340    without    -0.224357
12775      hours    -0.232240
23910  sanctions    -0.237713
13092    illegal    -0.238293
2531     beijing    -0.249492
14127    israeli    -0.253480
23685        run    -0.268893
11812    hacking    -0.283535


w Eco:
             Word  Coefficient
19402         non     0.318194
18469     mubarak     0.291355
8906        egypt     0.281462
16935        make     0.279366
26212       since     0.267088
30224  university     0.263639
562      agencies     0.257238
1536     arrested     0.241431
31255  washington     0.237456
166