In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer 
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from joblib import dump, load
import ast

# set of stopwords from NLTK 
stops = set(stopwords.words('english'))

In [9]:
def preprocess(text):
    
    sentence_list = []
    for word in text.lower().split():
        if word not in stops:

            word_list = []
            for char in word:
                if char.isalpha():
                    word_list.append(char)
            if len(word_list) != 0:
                sentence_list.append(''.join(word_list))
    return ' '.join(sentence_list)

In [23]:
# unit test
assert preprocess("A true random number generator (TRNG), also known as a hardware random number generator (HRNG), does not use a computer algorithm. Instead, it uses an external unpredictable physical variable such as radioactive decay of isotopes or airwave static to generate random numbers.") == 'true random number generator trng also known hardware random number generator hrng use computer algorithm instead uses external unpredictable physical variable radioactive decay isotopes airwave static generate random numbers'
assert preprocess("This IS A test !!!! I hope this makes SENSE") == 'test hope makes sense'

In [10]:
df1 = pd.read_csv('paragraph-1.csv')

# drop irrelevant columns and nulls
df = df1.drop(columns=['Unnamed: 0', 'ticker', 'link'])
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# apply preprocessing and create boolean columns with numerical val 
df['relevant'] = df['relevant'].apply(lambda x: 1 if x == True else 0)
# df['paragraph'] = df['paragraph'].apply(lambda x: preprocess(x))
df

Unnamed: 0,paragraph,relevant
0,The commercial real estate industry continues ...,1
1,Elevated interest rates and a widespread trend...,1
2,"On an earnings call on Monday, Goldman Sachs G...",1
3,The sector has become especially sensitive to ...,1
4,"Just days later, S&P Global Inc (NYSE:SPGI) de...",1
...,...,...
28336,"""Distributing apps directly from a website req...",1
28337,"Side-loading, which will come into effect from...",1
28338,The smartphone maker further notified that app...,1
28339,"Additionally, Apple insisted on a core technol...",1


In [11]:
vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(df.paragraph)
y = df.relevant

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

param_grid = {
    'C': np.logspace(-4, 4, 50),  # Regularization strength
    'penalty': ['l1', 'l2']  # Types of regularization
}

# Initialize the model
model = LogisticRegression()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, verbose=True, n_jobs=-1)

# Fit model on training data
model = grid_search.fit(X_train_smote, y_train_smote);

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


500 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Rachel/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Rachel/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/Rachel/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/Rachel/Library/Python/3.9/lib/python/site

In [12]:
print(np.bincount(y_train))
print(np.bincount(y_train_smote))

[ 3684 16154]
[16154 16154]


In [13]:
y_pred = model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1557
           1       0.99      0.99      0.99      6946

    accuracy                           0.99      8503
   macro avg       0.98      0.98      0.98      8503
weighted avg       0.99      0.99      0.99      8503

Accuracy Score: 0.9896507115135834


In [14]:
def verify_logreg(model, lst, vectorizer):

    preprocessed_list = [preprocess(para) for para in lst]
    
    # Transform all preprocessed paragraphs using the already fitted vectorizer
    X = vectorizer.transform(preprocessed_list)

    # Predict using the logistic regression model
    y_pred = model.predict(X)
    
    # Append paragraphs where the prediction is 0
    new = [para for idx, para in enumerate(lst) if y_pred[idx] != 0]
    
    # Return the combined text of the filtered paragraphs
    return ' '.join(new)

In [26]:
# unit tests for verify_logreg
preprocLst1 = ["hello want", "hello kitty", "talk about real life"]
assert verify_logreg(model, preprocLst1, vectorizer) == 'hello want talk about real life'
preprocLst2 = ["world burn alive hot", "environmental change"]
assert verify_logreg(model, preprocLst2, vectorizer) == 'environmental change'

In [15]:
df_final = pd.read_csv('stockNews-1.csv')
df_final['paragraphList'] = df_final['paragraphList'].apply(lambda x: ast.literal_eval(x))
df_final['articleInfo'] = df_final['paragraphList'].apply(lambda x: verify_logreg(model, x, vectorizer))

In [16]:
df_final.to_csv('FinalStockNews.csv')