In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer 
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from joblib import dump, load
import ast

# set of stopwords from NLTK 
stops = set(stopwords.words('english'))

In [6]:
def preprocess(text):
    '''
    preprocess the provided text by removing stopwords and non-alphabetical characters.
    
    parameters:
    text (str): the input text string that needs to be cleaned and processed.
    
    returns:
    str: the processed text with stopwords removed and only alphabetical characters retained.
    '''
    
    sentence_list = []  # initialize an empty list to hold the processed words

    # convert text to lower case and split into words
    for word in text.lower().split():
        # check if the word is not a stopword
        if word not in stops:

            word_list = []  

            # iterate over each character in the word
            for char in word:
                # check if the character is an alphabetical character
                if char.isalpha():
                    word_list.append(char)  

            # if word_list is not empty after removing non-alphabetical characters
            if len(word_list) != 0:
                sentence_list.append(''.join(word_list)) 

    # join all processed words to form the processed sentence
    return ' '.join(sentence_list)


In [4]:
df_final = pd.read_csv('stockNews-1.csv')


In [8]:
df1 = pd.read_csv('paragraph-train.csv')

# drop irrelevant columns and nulls
df = df1.drop(columns=['Unnamed: 0', 'ticker', 'link'])
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# apply preprocessing and create boolean columns with numerical val 
df['relevant'] = df['relevant'].apply(lambda x: 1 if x == True else 0)
# df['paragraph'] = df['paragraph'].apply(lambda x: preprocess(x))
df

Unnamed: 0,paragraph,relevant
0,"In a report released yesterday, Joshua Tilton ...",1
1,"According to TipRanks, Tilton is a 4-star anal...",1
2,"Currently, the analyst consensus on Salesforce...",1
3,The company has a one-year high of $318.30 an...,1
4,Based on the recent corporate insider activity...,1
...,...,...
5066,The bank noted that the valuation of TTWO stoc...,1
5067,The enterprise/EBITDA ratio of TTWO is a rathe...,1
5068,"On the date of publication, Larry Ramer did no...",0
5069,Larry Ramer has conducted research and written...,0


In [9]:
# Transform the text data into TF-IDF vectors, excluding English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df.paragraph)
y = df.relevant

# Split the data into training and test subsets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to the training data to handle class imbalance.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define the parameters for GridSearchCV.
param_grid = {
    'C': np.logspace(-4, 4, 50),
    'penalty': ['l1', 'l2']
}

# Initialize and configure the logistic regression model with GridSearchCV.
model = LogisticRegression()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, verbose=True, n_jobs=-1)

# Train the model with the balanced dataset.
model = grid_search.fit(X_train_smote, y_train_smote)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [10]:
# predict the test set results
y_pred = model.predict(X_test)

# print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# print the accuracy of the model on the test set
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       307
           1       0.99      0.99      0.99      1215

    accuracy                           0.98      1522
   macro avg       0.97      0.97      0.97      1522
weighted avg       0.98      0.98      0.98      1522

Accuracy Score: 0.980946123521682


In [11]:
def verify_logreg(model, lst, vectorizer):
    """
    processes and predicts classes for a list of text paragraphs using a trained logistic regression model,
    returning a concatenated string of paragraphs predicted as non-zero by the model.

    parameters:
    model (logisticregression): the trained logistic regression model.
    lst (list of str): the list of text paragraphs to process and predict.
    vectorizer (tfidfvectorizer): the vectorizer used to transform text data into the appropriate format for the model.

    returns:
    str: a single string composed of all paragraphs that the model predicts as class non-zero, concatenated together.
    """

    # preprocess the list of paragraphs
    preprocessed_list = [preprocess(para) for para in lst]
    
    # transform the preprocessed text data into vectors
    X = vectorizer.transform(preprocessed_list)

    # predict the classes of the vectorized paragraphs
    y_pred = model.predict(X)
    
    # filter and concatenate paragraphs that are predicted as non-zero
    new = [para for idx, para in enumerate(lst) if y_pred[idx] != 0]
    
    return ' '.join(new)


In [12]:
df_final = pd.read_csv('stockNews-1.csv')

# convert the string representation of list back into a list using ast.literal_eval
df_final['paragraphList'] = df_final['paragraphList'].apply(lambda x: ast.literal_eval(x))

# apply the verify_logreg function to each list of paragraphs
df_final['articleInfo'] = df_final['paragraphList'].apply(lambda x: verify_logreg(model, x, vectorizer))


In [17]:
# convert df to csv 
df_final.to_csv('FinalStockNews.csv')