In [1]:
import numpy as np
import pandas as pd
from src.data_utils import NewsProcessor
from src.embeddings import Word2VecEmbeddingGenerator


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = "data/raw/signalmedia-1m_challenge_dataset/signal-1m-nasa.jsonl"

In [3]:
data_processor = NewsProcessor()

news_df = data_processor.load_data(file_path=file_path)
news_df = data_processor.preprocess_text(data=news_df)

news_df = data_processor.filter_by_keyword(news_df, target_keywords=["nasa", "space", "rocket"])
print('no. of filtered articles : ', len(news_df))

no. of filtered articles :  3881


In [4]:
news_df = data_processor.generate_polarity(news_df)
news_df = data_processor.generate_labels(news_df)

In [5]:
print('no of articles with positive sentiment : ', np.where(news_df.label == 2)[0].shape)
print('no of articles with negative sentiment : ', np.where(news_df.label == 1)[0].shape)
print('no of articles with neutral sentiment : ', np.where(news_df.label == 0)[0].shape)

no of articles with positive sentiment :  (23,)
no of articles with negative sentiment :  (16,)
no of articles with neutral sentiment :  (3842,)


In [6]:
embedding_generator = Word2VecEmbeddingGenerator(model_path = "data/models/models--fse--word2vec-google-news-300/snapshots/528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model")

Loading pre-trained Word2Vec model : data/models/models--fse--word2vec-google-news-300/snapshots/528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model
Pre-trained Word2Vec model loaded successfully!


In [7]:
tokenized_texts = [content.split() for content in news_df["content"]]

In [8]:
embeddings = embedding_generator.generate_embeddings(tokenized_texts)

In [9]:
labels = news_df.get("label", [0] * len(news_df))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42, stratify=labels)

In [11]:
np.unique(y_train)

array([0, 1, 2])

In [12]:
np.unique(y_test)

array([0, 1, 2])

In [13]:
model = RandomForestClassifier().fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)
print("Random Forest Evaluation:")
print(classification_report(y_test, y_pred, target_names=['Neutral', 'Negative', 'Positive']))


Random Forest Evaluation:
              precision    recall  f1-score   support

     Neutral       0.99      1.00      1.00       769
    Negative       0.00      0.00      0.00         3
    Positive       1.00      0.20      0.33         5

    accuracy                           0.99       777
   macro avg       0.66      0.40      0.44       777
weighted avg       0.99      0.99      0.99       777



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
from src.bert import BERTClassifier

In [7]:

# Train and Evaluate BERT model
bert_classifier = BERTClassifier()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
bert_trainer = bert_classifier.train(news_df["content"].tolist(), labels.tolist())



Epoch,Training Loss,Validation Loss
