# Clasificación de Noticias - NLP

## Preprocesamiento, Vectorización, Modelado, Sentimiento y Predicción

In [2]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pickle

# Descargar recursos
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/alberto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Cargar datasets
train_df = pd.read_csv('dataset/training_data.csv', sep='\t', header=None, names=["label", "headline"])
test_df = pd.read_csv('dataset/testing_data.csv', sep='\t', header=None, names=["label", "headline"])

print(train_df.head())
print(test_df.head())

   label                                           headline
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...
  label                                           headline
0     2  copycat muslim terrorist arrested with assault...
1     2  wow! chicago protester caught on camera admits...
2     2   germany's fdp look to fill schaeuble's big shoes
4     2  u.n. seeks 'massive' aid boost amid rohingya '...


In [4]:
print(train_df.columns)
print(test_df.columns)

Index(['label', 'headline'], dtype='object')
Index(['label', 'headline'], dtype='object')


In [5]:
# Preprocesamiento
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

train_df['clean_headline'] = train_df['headline'].apply(preprocess_text)
test_df['clean_headline'] = test_df['headline'].apply(preprocess_text)

In [6]:
# Vectorización
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_df['clean_headline'])
X_test = vectorizer.transform(test_df['clean_headline'])
y = train_df['label']

In [None]:
# Entrenamiento de modelos
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_lr))

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))

model = lr  # Seleccionamos el mejor

Logistic Regression Accuracy: 0.9319279754062363


In [None]:
import os
import pickle

# Crear la carpeta models si no existe
os.makedirs("models", exist_ok=True)

# Guardar modelo
model_path = os.path.join("models", "model.pkl")
with open(model_path, "wb") as f:
    pickle.dump(model, f)

# Guardar vectorizador
vectorizer_path = os.path.join("models", "vectorizer.pkl")
with open(vectorizer_path, "wb") as f:
    pickle.dump(vectorizer, f)

print(f"✅ Modelo guardado en {model_path}")
print(f"✅ Vectorizador guardado en {vectorizer_path}")

In [None]:
# Análisis de sentimiento
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    return analyzer.polarity_scores(text)['compound']

train_df['sentiment_score'] = train_df['headline'].apply(analyze_sentiment)
train_df.head()

In [None]:
# Predicción sobre test
predictions = model.predict(X_test)

# Añadir las predicciones al dataframe
test_df['label'] = predictions

# Crear carpeta dataset si no existe
import os
os.makedirs("dataset", exist_ok=True)

# Guardar el archivo en la carpeta dataset
output_path = os.path.join("dataset", "testing_data_predictions.csv")
test_df[['headline', 'label']].to_csv(output_path, index=False)

print(f"✅ Archivo de predicciones generado en {output_path}")