In [4]:
import sys
import os
from pathlib import Path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)
from news_project.dataset import *
import matplotlib.pyplot as plt
import seaborn as sns
from news_project.process import * 

In [5]:
# Define the path to the dataset
file_path = Path("../data/raw/news.csv")
df = load_data(file_path)
df.head()

[32m2025-02-25 15:22:46.165[0m | [1mINFO    [0m | [36mnews_project.dataset[0m:[36mload_data[0m:[36m15[0m - [1mAttempting to load data from: ..\data\raw\news.csv[0m
[32m2025-02-25 15:22:47.449[0m | [1mINFO    [0m | [36mnews_project.dataset[0m:[36mload_data[0m:[36m19[0m - [1mSuccessfully loaded dataset with 427482 rows and 7 columns.[0m
[32m2025-02-25 15:22:47.449[0m | [1mINFO    [0m | [36mnews_project.dataset[0m:[36mload_data[0m:[36m20[0m - [1mColumns in dataset: ['date', 'news', 'neg', 'neu', 'pos', 'compound', 'sentiment'][0m


Unnamed: 0,date,news,neg,neu,pos,compound,sentiment
0,2007-07-07,It was a long antipodean night. While there’s ...,0.059,0.878,0.064,0.0516,POSITIVE
1,2007-07-07,In Mexico there are no licensing or registrati...,0.044,0.956,0.0,-0.296,NEGATIVE
2,2007-07-07,The government has until Monday to protect the...,0.0,0.894,0.106,0.3818,POSITIVE
3,2007-07-07,A record-breaking heat wave in the Southwest i...,0.197505,0.66149,0.141005,0.997491,POSITIVE
4,2007-07-07,England started its Live Earth concert at Wemb...,0.033,0.945,0.022,-0.1779,NEGATIVE


In [6]:
df_cleaned = remove_duplicates(df,"news")

[32m2025-02-25 15:22:47.554[0m | [1mINFO    [0m | [36mnews_project.process[0m:[36mremove_duplicates[0m:[36m18[0m - [1mNumber of duplicate rows in 'news' before removal: 12762[0m
[32m2025-02-25 15:22:47.646[0m | [1mINFO    [0m | [36mnews_project.process[0m:[36mremove_duplicates[0m:[36m24[0m - [1mDataset shape after duplicate removal: (414720, 7)[0m


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 427482 entries, 0 to 427481
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   date       427482 non-null  object 
 1   news       427482 non-null  object 
 2   neg        427482 non-null  float64
 3   neu        427482 non-null  float64
 4   pos        427482 non-null  float64
 5   compound   427482 non-null  float64
 6   sentiment  427482 non-null  object 
dtypes: float64(4), object(3)
memory usage: 22.8+ MB


In [8]:
null_counts = df.isnull().sum()
print(null_counts)


date         0
news         0
neg          0
neu          0
pos          0
compound     0
sentiment    0
dtype: int64


In [9]:
num_duplicates = df.duplicated().sum()
print(num_duplicates)


0


In [10]:
numeric_col=df.select_dtypes(include=['int64','float64']).columns.tolist()
categoric_col=df.select_dtypes(include=['object']).columns.tolist()

In [11]:
class_counts = df[categoric_col].nunique()  # Count unique values per column
print(class_counts)

date           5892
news         414720
sentiment         2
dtype: int64


In [12]:
class_counts_numeric = df[numeric_col].nunique()  # Count unique values per column
print(class_counts_numeric)

neg         111977
neu         111309
pos         111826
compound     98818
dtype: int64


In [13]:
sentiment_mapping = {"NEGATIVE": 1, "POSITIVE": 0}
df_cleaned["sentiment_encoded"] = df_cleaned["sentiment"].map(sentiment_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["sentiment_encoded"] = df_cleaned["sentiment"].map(sentiment_mapping)


In [14]:
df_cleaned.head()

Unnamed: 0,date,news,neg,neu,pos,compound,sentiment,sentiment_encoded
0,2007-07-07,It was a long antipodean night. While there’s ...,0.059,0.878,0.064,0.0516,POSITIVE,0
1,2007-07-07,In Mexico there are no licensing or registrati...,0.044,0.956,0.0,-0.296,NEGATIVE,1
2,2007-07-07,The government has until Monday to protect the...,0.0,0.894,0.106,0.3818,POSITIVE,0
3,2007-07-07,A record-breaking heat wave in the Southwest i...,0.197505,0.66149,0.141005,0.997491,POSITIVE,0
4,2007-07-07,England started its Live Earth concert at Wemb...,0.033,0.945,0.022,-0.1779,NEGATIVE,1


In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed

# ✅ 1. Initialize & Fit TF-IDF Vectorizer (Remove n_jobs)
vectorizer = TfidfVectorizer(max_features=20000)

# ✅ 2. Fit TF-IDF only on the "news" column
vectorizer.fit(df_cleaned["news"])

# ✅ 3. Parallel Processing (Transform Large Dataset Faster)
def process_text(text):
    return vectorizer.transform([text])

news_tfidf_parallel = Parallel(n_jobs=-1)(delayed(process_text)(text) for text in df_cleaned["news"])

# Convert the parallel-processed sparse matrix into a single matrix
import scipy.sparse
news_tfidf = scipy.sparse.vstack(news_tfidf_parallel)  # Stack transformed rows

# ✅ 4. Convert to DataFrame
df_tfidf = pd.DataFrame(news_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# ✅ 5. Add the Target Column
df_tfidf["sentiment_encoded"] = df_cleaned["sentiment_encoded"]
 
# ✅ 6. Compute Correlation
correlation = df_tfidf.corr()["sentiment_encoded"].sort_values(ascending=False)

# ✅ 7. Print Results
print("Top correlated words with sentiment:")
print(correlation.head(10))  # Show top 10 correlated words

# ✅ 8. Display TF-IDF Transformed Data
print(df_tfidf.head())


KeyboardInterrupt: 

In [None]:
import joblib

# ✅ Save the TF-IDF vectorizer
joblib.dump(vectorizer, "news_project/models/vectorizer_models/tfidf_vectorizer.pkl")
print("TF-IDF Vectorizer saved successfully!")
# ✅ Save the LabelEncoder

# ✅ Save the TF-IDF Transformed Data for future use
df_tfidf.to_csv("data/interim/tfidf_transformed_data.csv", index=False)
print("TF-IDF transformed data saved successfully!")

# ✅ Save the transformed sparse matrix (alternative to CSV for large datasets)
joblib.dump(news_tfidf, "news_project/models/tfidf_transformed/tfidf_transformed_matrix.pkl")
print("Sparse TF-IDF matrix saved successfully!")

In [None]:
data = df_tfidf.drop(["sentiment_encoded"], axis=1)
target = df_tfidf["sentiment_encoded"]

In [None]:
from news_project.modeling.train import *

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [None]:
models_df = pd.DataFrame()
trained_model = {}

In [None]:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from xgboost import XGBRegressor
import joblib


In [None]:
# Test different models
models_df,trained_models = train_and_evaluate(LinearRegression(), X_train, y_train, X_test, y_test, models_df,trained_model)
models_df,trained_models = train_and_evaluate(Ridge(alpha=1.0), X_train, y_train, X_test, y_test, models_df,trained_model)
models_df,trained_models = train_and_evaluate(RandomForestRegressor(n_estimators=100), X_train, y_train, X_test, y_test, models_df,trained_model)
models_df,trained_models = train_and_evaluate(ElasticNet(alpha=1.0, l1_ratio=0.5), X_train, y_train, X_test, y_test, models_df,trained_model)
models_df,trained_models = train_and_evaluate(Lasso(alpha=1.0), X_train, y_train, X_test, y_test, models_df,trained_model)
models_df,trained_models = train_and_evaluate(SVR(kernel="rbf", C=1.0, epsilon=0.1), X_train, y_train, X_test, y_test, models_df,trained_model)
# models_df,trained_models = train_and_evaluate(XGBRegressor(n_estimators=100, learning_rate=0.1), X_train, y_train, X_test, y_test, models_df,trained_model)
display(models_df)  # Shows the DataFrame in Jupyter Notebook

In [None]:
from news_project.process import *

In [None]:
def clean_news_text(text):
    text = remove_byte_prefix(text)
    text = normalize_quotes(text)
    text = remove_special_chars(text)
    text = remove_urls(text)
    text = to_lowercase(text)
    # Finally, tokenize + remove stopwords + lemmatize
    text = tokenize_lemmatize(text)
    return text
df_cleaned["cleaned_news"] = df_cleaned["news"].apply(clean_news_text)
df_cleaned = df_cleaned[df_cleaned["cleaned_news"].str.strip().astype(bool)]
