In [1]:
#Import the libraries we’ll be using throughout our notebook:
import pandas as pd
import numpy as np
import csv
import math
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords
from gensim.models.hdpmodel import HdpModel
from gensim.corpora import Dictionary
import re
import plotly.express as px
import gensim.matutils as matutils
from sklearn.decomposition import SparsePCA
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier



In [2]:
#Text Cleaning and Preprocessing
def preprocessing(text):
    stops = nltk_stopwords.words('english')

    text = text.lower()

    # remove emails
    text = re.sub('\S*@\S*\s?', ' ', text)

    # remove numbers and dates
    text = re.sub('\$?[0-9]+[\.]?[0-9]*s?%?\$?\s?', ' ', text)

    # remove hastags
    text = re.sub('#\S*\s?', ' ', text)

    # remove https
    text = re.sub('https://\S*\s?', ' ', text)

    # remove http
    text = re.sub('http://\S*\s?', ' ', text)

    for x in [",", ":", "!", "?", ";", "[", "]",
              "(", ")", "\"", "\'", ".", "\"",
              "#", "@", "&", "`", "'", "’", "-",
              "+", "=", "_", "<", ">", "\\",
              "|", "}", "{", "/", "—", "$", "“", "”"]:
        text = text.replace(x, "")
    text = text.split()
    cleaned_text = []
    for word in text:
        if not (word in stops):
            cleaned_text.append(word)
    text = cleaned_text
    return text

In [24]:
#Read and Inspect the Data
news = pd.read_csv('yahoo_finance_marked_df.csv')

In [25]:
news

Unnamed: 0,mark,date,news
0,0,2021-10-01,"TAIPEI, Oct 1 (Reuters) - Resolving the global..."
1,0,2021-10-01,(Bloomberg) -- Asian stocks and U.S. futures s...
2,0,2021-10-01,"TOKYO, Oct 1 (Reuters) - Japanese shares tumbl..."
3,0,2021-10-01,By Sonali PaulMELBOURNE (Reuters) - Oil prices...
4,0,2021-10-01,By Kevin BucklandTOKYO (Reuters) - Asian equit...
...,...,...,...
1377,0,2021-09-17,"Anastasia Amoroso, iCapital Network Chief Inve..."
1378,0,2021-09-15,The Bank of England expects inflation to hit 4...
1379,0,2021-09-14,The major Asia-Pacific stock indexes traded mi...
1380,0,2021-09-14,(Bloomberg) -- Japanese stocks advanced for a ...


In [26]:
df = news.copy()
df['news'] = df['news'].apply(lambda text: preprocessing(text))
print(df.head())

   mark        date                                               news
0     0  2021-10-01  [taipei, oct, reuters, resolving, global, shor...
1     0  2021-10-01  [bloomberg, asian, stocks, us, futures, starte...
2     0  2021-10-01  [tokyo, oct, reuters, japanese, shares, tumble...
3     0  2021-10-01  [sonali, paulmelbourne, reuters, oil, prices, ...
4     0  2021-10-01  [kevin, bucklandtokyo, reuters, asian, equitie...


In [27]:
df['news']

0       [taipei, oct, reuters, resolving, global, shor...
1       [bloomberg, asian, stocks, us, futures, starte...
2       [tokyo, oct, reuters, japanese, shares, tumble...
3       [sonali, paulmelbourne, reuters, oil, prices, ...
4       [kevin, bucklandtokyo, reuters, asian, equitie...
                              ...                        
1377    [anastasia, amoroso, icapital, network, chief,...
1378    [bank, england, expects, inflation, hit, end, ...
1379    [major, asiapacific, stock, indexes, traded, m...
1380    [bloomberg, japanese, stocks, advanced, third,...
1381    [financial, stock, mining, firms, leading, gai...
Name: news, Length: 1382, dtype: object

In [28]:
NER = pd.read_csv('yahoo_NER.csv')

In [29]:
NER['name']

0              TAIPEI
1            Malaysia
2            COVID-19
3              Taiwan
4        Wang Mei-hua
             ...     
69468           Oanda
69469           Asian
69470     Wall Street
69471         Chanthu
69472        Shanghai
Name: name, Length: 69473, dtype: object

In [30]:
NERnorm = set(NER['name'])

In [31]:
NERnorm

{'Naeem Aslam',
 'Sorare',
 'Aston Martin DB4 GT Zagato Coupe',
 'Jacob Frey.',
 'Bruces',
 'Avastin',
 'Naked Brand Group',
 'Secret Service',
 'Transportation Security Administration',
 'Craig-Hallum',
 'BNB Cash',
 'October.Mark Wahlberg',
 'LCpl Hunter Clark',
 'Q & A go',
 'level.Exxon Mobil',
 'Esther Choo',
 'Bremmer',
 'Do Kwon',
 "dollars'Healthcare",
 'Kristoffer Kjær Lomholt',
 'Valencia',
 'Build it Back Better',
 'Chipotle Mexican Grill',
 "Glimmer of hope'Grayscale",
 'Kelton J. Cochran',
 'John McCain',
 'Stemmle',
 'Switzerland-based',
 'Qdoba',
 'Ammann',
 'Ashish Jha',
 'rules.Sheila Bair',
 'Department for Education',
 'Natasha Bhuyan.',
 'SBEC',
 'Sears Home Services',
 'Oliver Daemen',
 'Daily Beast',
 'Sens',
 'Coinbase Cloud',
 'service.Sheila Bair',
 'Salvador',
 'FTSE 250',
 'LabCorp',
 'Canadian',
 'Del Rio',
 'Jovan Vavic',
 'LUNAtics',
 'Bowery Hotel',
 'Romney',
 'Wharton',
 'Toby Chopra',
 'Ipsos',
 'Jared',
 'Gumbinger',
 'Jason Redmond',
 'Shiba Inu',
 '

In [32]:
NERlist = list(NERnorm)

In [11]:
ner = []

In [34]:
with open("yahoo_finance_marked_df.csv","w") as file:
    writer = csv.writer(file)
    writer.writerow(NERlist)

In [35]:
x = pd.read_csv('yahoo_finance_marked_df.csv')

In [36]:
x

Unnamed: 0,Naeem Aslam,Sorare,Aston Martin DB4 GT Zagato Coupe,Jacob Frey.,Bruces,Avastin,Naked Brand Group,Secret Service,Transportation Security Administration,Craig-Hallum,...,Noah Kerner,Ed Bastian,Diane Yentel,Dania Maxwell,BTC Media.Padraic,Honda,Democrat-controlled,Gina McCarthy,Social Security and Medicare trust,Senate Banking Committee


In [39]:
array = []

In [40]:
#binary embedding
with open("yahoo_finance_marked_df.csv","a") as file:
        for j in range(len(df['news'])):
            for word in NERlist:
                a = 0
                for line in df['news'][j]:
                    if word in line:
                        a = (a + 1)/(a + 1)
                    else:
                        a =a + 0
                array.append(a)
            writer = csv.writer(file)
            writer.writerow(array)
            #print(array)   
            array.clear()