# Research Question 2 - Adam 

##### import libraries + read file

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import date

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from collections import Counter

df = pd.read_json('evaluate_news.json')
print(df.shape)
df.head()

(303893, 4)


Unnamed: 0,text,title,pub_time,labels
0,"PLANO, Texas, Dec. 8, 2020 /PRNewswire/ --Euro...",European Wax Center Welcomes Jennifer Vanderve...,2020-12-08 09:00:00-05:00,"{'ticker': 'MIK', 'start_time': '2020-12-08 09..."
1,"CHARLOTTE, N.C., Oct. 1, 2020 /PRNewswire/ --D...","Duke Energy to host virtual Environmental, Soc...",2020-10-01 12:11:00-04:00,"{'ticker': 'DUK', 'start_time': '2020-10-01 12..."
2,"SOUTH SAN FRANCISCO, Calif., Oct. 5, 2020 /PRN...",Pliant Therapeutics Appoints Mike Ouimette as ...,2020-10-05 08:00:00-04:00,"{'ticker': 'PLRX', 'start_time': '2020-10-05 0..."
3,"NEW YORK, Jan. 18, 2021 /PRNewswire/ -- With a...",Radiofrequency Ablation Devices Market Revenue...,2021-01-18 02:30:00-05:00,{'ticker': ''}
4,"ENGLEWOOD, Colo., Feb. 22, 2021 /PRNewswire/ -...","DISH Network reports fourth quarter, year-end ...",2021-02-22 06:05:00-05:00,"{'ticker': 'DISH', 'start_time': '2021-02-22 0..."


## Part 1: finding the most important words 
We will find the most important words for positive and negative outcomes based on stocks. 

We will use 3 methods of feature engineering: TF-IDF via Transformer, TF-IDF via Vectorizer, and a bag-of-words model.

In [19]:

#Get the labels, normalize them, and concatenate to the dataframe. Then, add a new column for the overall baseline change from start day 1 to end of day 3.
labels = df["labels"]
labels = pd.json_normalize(labels)
data = pd.concat([df,labels],axis=1)
data.dropna()
data["baselinePctChng3"] = ((data["end_price_3day"] - data["start_price_open"]) / data["start_price_open"])*100


In [20]:
data.head()

Unnamed: 0,text,title,pub_time,labels,ticker,start_time,start_price_open,start_price_close,end_price_1day,end_price_2day,...,highest_time_1day,highest_time_2day,highest_time_3day,lowest_price_1day,lowest_price_2day,lowest_price_3day,lowest_time_1day,lowest_time_2day,lowest_time_3day,baselinePctChng3
0,"PLANO, Texas, Dec. 8, 2020 /PRNewswire/ --Euro...",European Wax Center Welcomes Jennifer Vanderve...,2020-12-08 09:00:00-05:00,"{'ticker': 'MIK', 'start_time': '2020-12-08 09...",MIK,2020-12-08 09:00:00-05:00,12.07,12.07,12.8,12.4899,...,2020-12-08 10:12:00-05:00,2020-12-08 10:12:00-05:00,2020-12-08 10:12:00-05:00,11.98,11.98,11.98,2020-12-08 09:13:00-05:00,2020-12-08 09:13:00-05:00,2020-12-08 09:13:00-05:00,7.705054
1,"CHARLOTTE, N.C., Oct. 1, 2020 /PRNewswire/ --D...","Duke Energy to host virtual Environmental, Soc...",2020-10-01 12:11:00-04:00,"{'ticker': 'DUK', 'start_time': '2020-10-01 12...",DUK,2020-10-01 12:11:00-04:00,89.74,89.78,90.05,91.0,...,2020-10-01 15:21:00-04:00,2020-10-02 15:32:00-04:00,2020-10-02 15:32:00-04:00,89.18,88.65,88.65,2020-10-01 14:03:00-04:00,2020-10-02 09:29:00-04:00,2020-10-02 09:29:00-04:00,2.518386
2,"SOUTH SAN FRANCISCO, Calif., Oct. 5, 2020 /PRN...",Pliant Therapeutics Appoints Mike Ouimette as ...,2020-10-05 08:00:00-04:00,"{'ticker': 'PLRX', 'start_time': '2020-10-05 0...",PLRX,2020-10-05 09:29:00-04:00,20.0,20.0,21.43,21.92,...,2020-10-05 15:50:00-04:00,2020-10-06 09:35:00-04:00,2020-10-07 15:28:00-04:00,19.73,19.73,19.73,2020-10-05 09:57:00-04:00,2020-10-05 09:57:00-04:00,2020-10-05 09:57:00-04:00,21.75
3,"NEW YORK, Jan. 18, 2021 /PRNewswire/ -- With a...",Radiofrequency Ablation Devices Market Revenue...,2021-01-18 02:30:00-05:00,{'ticker': ''},,,,,,,...,,,,,,,,,,
4,"ENGLEWOOD, Colo., Feb. 22, 2021 /PRNewswire/ -...","DISH Network reports fourth quarter, year-end ...",2021-02-22 06:05:00-05:00,"{'ticker': 'DISH', 'start_time': '2021-02-22 0...",DISH,2021-02-22 06:09:00-05:00,34.25,34.25,32.0,30.85,...,2021-02-22 06:37:00-05:00,2021-02-22 06:37:00-05:00,2021-02-22 06:37:00-05:00,31.25,29.71,29.71,2021-02-22 18:54:00-05:00,2021-02-23 12:09:00-05:00,2021-02-23 12:09:00-05:00,-8.321168


In [21]:
data.shape

(303893, 27)

In [59]:
#random sample
dataShuffled = data.sample(frac=1)
dataPos = dataShuffled[dataShuffled["baselinePctChng3"] > 0]
dataNeg = dataShuffled[dataShuffled["baselinePctChng3"] < 0]

posX = dataPos.sample(frac = 10000/57031, random_state=1)
negX = dataNeg.sample(frac = 10000/48955,random_state=1)

In [51]:
#TF-IDF with transformer

sentencesPositive = []
wordsPositive = []

for index, row in posX.iterrows():
    sentence = row['title']
    sentencesPositive.append(sentence)

print(len(sentencesPositive))

countVector = CountVectorizer()
wordCountVector = countVector.fit_transform(sentencesPositive)

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(wordCountVector)

matrix = countVector.transform(sentencesPositive)
tf_idf_vector=tfidf_transformer.transform(matrix)

feature_names = countVector.get_feature_names_out() 
first_document_vector=tf_idf_vector[0] 

df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

10000


Unnamed: 0,tfidf
technological,0.333493
advancement,0.330038
innovations,0.303571
model,0.283734
construction,0.280454
...,...
engagement,0.000000
engaged,0.000000
engage20,0.000000
engage,0.000000


In [52]:
posDict = dict(zip(df.index, df['tfidf']))

In [53]:
#TF-IDF with vectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 

#transform, send in
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(sentencesPositive)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 

#place in dataframe, sort by highest weight.
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names_out(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
technological,0.333493
advancement,0.330038
innovations,0.303571
model,0.283734
construction,0.280454
...,...
engagement,0.000000
engaged,0.000000
engage20,0.000000
engage,0.000000


# ------------

In [60]:
#TF-IDF with transformer
sentencesNegative = []
wordsPositive = []

for index, row in negX.iterrows():
    sentence = row['title']
    sentencesNegative.append(sentence)

print(len(sentencesNegative))

countVector = CountVectorizer()
wordCountVector = countVector.fit_transform(sentencesNegative)

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(wordCountVector)

matrix = countVector.transform(sentencesNegative)
tf_idf_vector=tfidf_transformer.transform(matrix)

feature_names = countVector.get_feature_names_out() 
first_document_vector=tf_idf_vector[0] 

df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

10000


Unnamed: 0,tfidf
clarus,0.420881
jury,0.402951
infringement,0.380360
lipocine,0.372298
patent,0.304175
...,...
empowerment,0.000000
empowering,0.000000
empowered,0.000000
empower,0.000000


In [61]:
negDict = dict(zip(df.index, df['tfidf']))

In [62]:
#TF-IDF with vectorizer

# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 

# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(sentencesNegative)
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 

# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names_out(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
clarus,0.420881
jury,0.402951
infringement,0.380360
lipocine,0.372298
patent,0.304175
...,...
empowerment,0.000000
empowering,0.000000
empowered,0.000000
empower,0.000000


# ------------

In [29]:
# #most important words without TF-IDF
# nltk.download('stopwords')
# stopWords = set(stopwords.words('english'))

# cleanSentences = []
# importantWords = []
# for sentence in sentencesPositive: #SWAP FOR SENTENCESNEGATIVE.
#     words = sentence.split()
#     filteredWords = [word for word in words if word.lower() not in stopWords]
#     cleanSentence = ' '.join(filteredWords)
#     cleanSentences.append(cleanSentence)


# wordCounter = {}
# for sentence in cleanSentences:
#     words = sentence.split()
#     word_scores = {}
#     for word in words:
#         # Calculate word score as the product of its frequency and length
#         score = len(word) * words.count(word)
#         word_scores[word] = score
#     most_important_word = max(word_scores, key=word_scores.get)
#     if most_important_word in wordCounter:
#         wordCounter[most_important_word] += 1
#     else:
#         wordCounter[most_important_word] = 1

# for sentence, count in wordCounter.items():
#     print(f"{sentence}: {count}")


# # Remove key-value pairings where the key contains the substring ".com"
# to_delete = []
# for key in wordCounter.keys():
#     if ".com" in key or "https:" in key:
#         to_delete.append(key)
# for key in to_delete:
#     del wordCounter[key]


# top_words = sorted(wordCounter.items(), key=lambda x: x[1], reverse=True)[:20]

# print(top_words)

The Bag of words model was producing values that were not valid / comparable, as such, it has been commented out.

In [63]:
wordsInBoth = []
for key in posDict.keys():
    if key in negDict.keys() and posDict[key] > 0.2 and negDict[key] > 0.1:
        wordsInBoth.append(key)

In [64]:
print(wordsInBoth)

[]


Across multiple runs, only Therapeutics and 1 other word appears on both with a threshold of 0.1 / 1.0 . A score of 0.1 is very small, so this means there isn't a very clear cut answer to what is positive or negative based on purely features and the TF-IDF scores on both + / - results.

One major consideration is that each word comes from a different industry; one could extrapolate the causation when referring back to the original media and seeing what type of the 11 reports it was.

# ------------

## Part 2 - validating results using a model

We will be using an SVM model to determine whether a simple model can determine whether a word's presence is likely to be positive or negative, or is correlated with outside factors. 

In [32]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.stem import SnowballStemmer

import gensim.downloader as api
import re

Note. the strings are too long to be directly converted to floats, so the gensim library will be used 

In [33]:
def cleaning(string):
    string = string.lower()

    # remove punctuation using regex
    string = re.sub(r'[^\w\s]', '', string)
    
    # tokenize the string
    tokens = string.split()

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # join the filtered stuff back into a string
    return " ".join(filtered_tokens)

In [34]:
#determine number of samples of + and -
numSamples = 5000
X = dataPos.sample(frac = numSamples/57031, random_state=1)
Y = dataNeg.sample(frac = numSamples/48955,random_state=1)

#we create labels for + / - outcomes as a binary 0/1.
XLabel = np.ones((numSamples,), dtype=int)
YLabel = np.zeros((numSamples,), dtype=int)

XLabel = XLabel.tolist()
YLabel = YLabel.tolist()

labels = XLabel + YLabel


modelSentences = []

#here we pre-process, cleaning the titles and appending back to a list.
for index, row in X.iterrows():
    tempsent = row['title']
    tempsent = cleaning(tempsent)
    modelSentences.append(tempsent)

for index, row in Y.iterrows():
    tempsent = row['title']
    tempsent = cleaning(tempsent)
    modelSentences.append(tempsent)


model = api.load("glove-wiki-gigaword-100")

# Feature extraction
embedding_dim = 100
Z = np.zeros((len(modelSentences), embedding_dim))

for i, title in enumerate(modelSentences):
    vectors = [model.get_vector(word) for word in title if word in model.key_to_index]
    if vectors:
        Z[i, :] = np.mean(vectors, axis=0)
        
print("About to Split!")

#actual modelling
X_train, X_test, y_train, y_test = train_test_split(Z, labels, test_size=0.2, random_state=42)

svm = LinearSVC()
svm.fit(X_train, y_train)

# Evaluation
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

# vectorizer = CountVectorizer()
# modelVectorizer = vectorizer.fit_transform(modelSentences)
# X_train, X_test, y_train, y_test = train_test_split(modelSentences, labels, test_size=0.2, random_state=42)
# clf = SVC(kernel='linear')
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred))


About to Split!
Accuracy: 0.5225
Precision: 0.5285857572718154
Recall: 0.5207509881422925
F1 score: 0.5246391239422598


In [66]:
X_train, X_test, y_train, y_test = train_test_split(Z, labels, test_size=0.2, random_state=42)

svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

# Evaluation
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.4945
Precision: 0.5028901734104047
Recall: 0.08596837944664032
F1 score: 0.1468354430379747
