# Import Libraries, Read in .json

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import date

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer


# Load the raw news records; later cells read the nested "labels" column from this frame.
df = pd.read_json(path_or_buf='evaluate_news.json')
print(df.shape)  # (rows, columns) as loaded
df.head()

# Preprocessing, Developing New Columns
Get the labels, find the change from start price day 1 to end of day 3. 
Drop unavailable ticker data.

In [None]:

# Flatten the nested "labels" records into their own columns and attach them to the raw frame.
labels = pd.json_normalize(df["labels"])
data = pd.concat([df, labels], axis=1)

# Baseline 3-day move: percent change from the day-1 open to the day-3 close.
data["baselinePctChng3"] = (
    (data["end_price_3day"] - data["start_price_open"]) / data["start_price_open"]
) * 100

# DataFrame.dropna() returns a new frame; the bare call used previously discarded it,
# so no rows were ever removed. Keep the result, and only drop rows whose price change
# could not be computed (missing open or 3-day close).
data = data.dropna(subset=["baselinePctChng3"])

In [None]:
# Preview the first rows of the combined frame (raw columns, flattened labels, baselinePctChng3).
data.head()

In [None]:
# (rows, columns) of the combined frame.
data.shape

# Develop the Corpus
Entries: 57,031 rows have a positive 3-day price change and 48,955 rows have a negative 3-day price change.


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer 


In [None]:
# Shuffle with a fixed seed. The shuffle was previously unseeded, so the row order fed
# into the seeded draws below (and therefore the sampled titles) changed on every run.
dataShuffled = data.sample(frac=1, random_state=1)

# Split by the sign of the 3-day move; rows with exactly zero (or missing) change fall in neither set.
dataPos = dataShuffled[dataShuffled["baselinePctChng3"] > 0]
dataNeg = dataShuffled[dataShuffled["baselinePctChng3"] < 0]

# Draw the same number of titles from each class. The size is given directly instead of
# as a fraction built from hard-coded row counts (10000/57031, 10000/48955), which would
# silently yield a different sample size if the input data changed.
sampleSize = 10000
posX = dataPos.sample(n=min(sampleSize, len(dataPos)), random_state=1)
negX = dataNeg.sample(n=min(sampleSize, len(dataNeg)), random_state=1)

# Determine most important words using TF-IDF w/transformer, TF-IDF w/vectorizer, and purely based on frequency (no model, just math)

Typically, if you need the term-frequency (term count) vectors for other tasks as well, use `CountVectorizer` followed by `TfidfTransformer`. If you only need to compute TF-IDF scores on documents within your “training” dataset, use `TfidfVectorizer`.

Note: both approaches produced the same top words.

### Positives

In [None]:
#TF-IDF with transformer

# Titles of the sampled positive rows, read straight from the column instead of looping over rows.
sentencesPositive = posX["title"].tolist()
wordsPositive = []

print(len(sentencesPositive))

# Term counts for every positive title.
countVector = CountVectorizer()
wordCountVector = countVector.fit_transform(sentencesPositive)

# fit_transform above already produced the count matrix for this corpus, so it is reused
# here rather than vectorizing the same titles a second time.
matrix = wordCountVector
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tf_idf_vector = tfidf_transformer.fit_transform(matrix)

feature_names = countVector.get_feature_names_out()
first_document_vector = tf_idf_vector[0]

# NOTE(review): only row 0 (the first sampled title) is ranked here, so every term that
# does not occur in that one title gets a score of 0. Confirm whether a corpus-wide
# aggregate was intended.
# NOTE(review): this rebinds `df`, which previously held the raw frame from read_json.
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

In [None]:
# Map each vocabulary term to its TF-IDF score from the frame built in the previous cell.
posDict = df["tfidf"].to_dict()

In [None]:
# TF-IDF with vectorizer (counting and weighting done by one estimator)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# Any CountVectorizer-style settings would be passed here.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on the positive titles, then keep the weighted row for the first title only.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(sentencesPositive)
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# One row per vocabulary term, displayed from highest to lowest weight.
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=vocabulary_terms,
    columns=["tfidf"],
)
df.sort_values(by="tfidf", ascending=False)

### Negatives

In [None]:
#TF-IDF with transformer

# Titles of the sampled negative rows, read straight from the column instead of looping over rows.
sentencesNegative = negX["title"].tolist()
# The original cell reset `wordsPositive` here, which looks like a copy-paste slip from
# the positive cell. The negative counterpart is defined; the reset is kept so any code
# relying on it behaves as before.
wordsNegative = []
wordsPositive = []

print(len(sentencesNegative))

# Term counts for every negative title.
countVector = CountVectorizer()
wordCountVector = countVector.fit_transform(sentencesNegative)

# fit_transform above already produced the count matrix for this corpus, so it is reused
# here rather than vectorizing the same titles a second time.
matrix = wordCountVector
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tf_idf_vector = tfidf_transformer.fit_transform(matrix)

feature_names = countVector.get_feature_names_out()
first_document_vector = tf_idf_vector[0]

# NOTE(review): only row 0 (the first sampled title) is ranked here, so every term that
# does not occur in that one title gets a score of 0. Confirm whether a corpus-wide
# aggregate was intended.
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

In [None]:
# Map each vocabulary term to its TF-IDF score from the frame built in the previous cell.
negDict = df["tfidf"].to_dict()

In [None]:
# TF-IDF with vectorizer (counting and weighting done by one estimator)

# Any CountVectorizer-style settings would be passed here.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on the negative titles, then keep the weighted row for the first title only.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(sentencesNegative)
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# One row per vocabulary term, displayed from highest to lowest weight.
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=vocabulary_terms,
    columns=["tfidf"],
)
df.sort_values(by="tfidf", ascending=False)

We now have 2 dictionaries with word - TF-IDF pairings. TF-IDF scores range from 0 --> 1, where 1 is a heavy importance weighting and 0 is less important.

Now that we have the top words for both positive and negative titles, we check for overlap using a TF-IDF threshold of 0.1. Words that clear the threshold in both sets are important to both a stock's rise and its drop, so their meaning depends on the surrounding words. Such words would be passed to a context-aware model such as BERT in future work.

In [None]:
# Collect the words that score above the same TF-IDF cut-off in both dictionaries.
# The previous check used 0.2 for the positive score and 0.1 for the negative score,
# which did not match the single 0.1 threshold described in the accompanying text.
TFIDF_OVERLAP_THRESHOLD = 0.1
wordsInBoth = [
    word
    for word, posScore in posDict.items()
    # A word missing from negDict is treated as score 0 and therefore never passes.
    if posScore > TFIDF_OVERLAP_THRESHOLD
    and negDict.get(word, 0.0) > TFIDF_OVERLAP_THRESHOLD
]

In [None]:
# Show the words that passed the TF-IDF cut-offs in both the positive and negative dictionaries.
print(wordsInBoth)

Only "Therapeutics" appears in both sets at a threshold of 0.1, which suggests a fairly clear division between the words used in a title and the associated stock price change.

One major consideration is that each word comes from a different industry; one could extrapolate the causation when referring back to the original media and seeing what type of the 11 reports it was.

### Preliminary work

- first cell identifies the most important word in each sentence.
- second cell identifies a 'non' TF-IDF method of determining word frequency and most important words. It was determined not as powerful as the TF-IDF models.

In [None]:
#OTHER:
# # Identify the most important word in each sentence
# for sentence in cleanSentences:
#     words = sentence.split()
#     word_scores = {}
#     for word in words:
#         # Calculate word score as the product of its frequency and length
#         score = len(word) * words.count(word)
#         word_scores[word] = score
#     mostImportantWordInSentence = max(word_scores, key=word_scores.get)
#     importantWords.append(mostImportantWordInSentence)
#     print(mostImportantWordInSentence)

In [None]:
#most important words without TF-IDF
# nltk.download('stopwords')
# stopWords = set(stopwords.words('english'))

# cleanSentences = []
# importantWords = []
# for sentence in sentencesPositive: #SWAP FOR SENTENCESNEGATIVE.
#     words = sentence.split()
#     filteredWords = [word for word in words if word.lower() not in stopWords]
#     cleanSentence = ' '.join(filteredWords)
#     cleanSentences.append(cleanSentence)


# wordCounter = {}
# for sentence in cleanSentences:
#     words = sentence.split()
#     word_scores = {}
#     for word in words:
#         # Calculate word score as the product of its frequency and length
#         score = len(word) * words.count(word)
#         word_scores[word] = score
#     most_important_word = max(word_scores, key=word_scores.get)
#     if most_important_word in wordCounter:
#         wordCounter[most_important_word] += 1
#     else:
#         wordCounter[most_important_word] = 1

# for sentence, count in wordCounter.items():
#     print(f"{sentence}: {count}")


# # Remove key-value pairings where the key contains the substring ".com"
# to_delete = []
# for key in wordCounter.keys():
#     if ".com" in key or "https:" in key:
#         to_delete.append(key)
# for key in to_delete:
#     del wordCounter[key]


# top_words = sorted(wordCounter.items(), key=lambda x: x[1], reverse=True)[:20]

# print(top_words)