## Training GBV labeled dataset

In [12]:
# Used libraries
import pandas as pd
import unicodedata
import numpy as np
import openpyxl
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [5]:
# Load the labeled data
file_path = 'gbv_df.xlsx'
gbv_df = pd.read_excel(file_path)

# Inspect the data
print(gbv_df.head())

                                                link state  \
0  https://web.archive.org/web/20200901174745/htt...  CHIH   
1  https://web.archive.org/web/20200721132743/htt...  CHIH   
2    http://laopcion.com.mx/noticia/98812?archivo=si  CHIH   
3  https://web.archive.org/web/20200901181614/htt...  CHIH   
4  https://web.archive.org/web/20200901184921/htt...  CHIH   

                                               title      frame  
0  Imparte fiscalía pláticas preventivas a emplea...   Temático  
1  La atropella su pareja y la deja lesionada al ...  Episódico  
2  Detienen a chofer de camión urbano por hostiga...  Episódico  
3  Inaugura Duarte Centro de Salud y Albergue Cie...   Temático  
4  Presentan la conferencia La grandeza de ser mu...   Temático  


In [6]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the 'title' column
gbv_df['cleaned_title'] = gbv_df['title'].apply(preprocess_text)

# Display the cleaned text
print(gbv_df[['title', 'cleaned_title']].head())

                                               title  \
0  Imparte fiscalía pláticas preventivas a emplea...   
1  La atropella su pareja y la deja lesionada al ...   
2  Detienen a chofer de camión urbano por hostiga...   
3  Inaugura Duarte Centro de Salud y Albergue Cie...   
4  Presentan la conferencia La grandeza de ser mu...   

                                       cleaned_title  
0  imparte fiscala plticas preventivas a empleado...  
1  la atropella su pareja y la deja lesionada al ...  
2  detienen a chofer de camin urbano por hostigam...  
3  inaugura duarte centro de salud y albergue cie...  
4  presentan la conferencia la grandeza de ser mu...  


In [13]:
# Function to clean frame labels and remove accents
def clean_labels(label):
    # Normalize the text to decompose accents
    label = unicodedata.normalize('NFD', label)
    # Remove diacritics (accents) by filtering characters
    label = ''.join(char for char in label if unicodedata.category(char) != 'Mn')
    # Convert to lowercase and strip whitespace
    label = label.lower().strip()
    return label

# Apply the cleaning function to the 'frame' column
gbv_df['frames'] = gbv_df['frame'].apply(clean_labels)

In [21]:
# Define features (X) and labels (y)
X = gbv_df['cleaned_title']
y = gbv_df['frames']  

# Encode labels if necessary
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Converts to numerical values
print(label_encoder.classes_)  # Check the mapping

['episodico' 'tematico']


In [22]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

Training samples: 777, Test samples: 195


In [23]:
# Use TF-IDF for feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit the vectorizer on training data and transform both training and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF matrix shape (training): {X_train_tfidf.shape}")

TF-IDF matrix shape (training): (777, 1775)


In [24]:
# Train a logistic regression model
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.9128205128205128
Classification Report:
               precision    recall  f1-score   support

   episodico       0.91      0.93      0.92       103
    tematico       0.92      0.89      0.91        92

    accuracy                           0.91       195
   macro avg       0.91      0.91      0.91       195
weighted avg       0.91      0.91      0.91       195



## Scraping for more news titles

### 8 Columnas

In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL and keywords
base_url = "https://8columnas.com.mx/page/{}/?s={}"
keywords = ["mujer", "niña", "violencia de genero", "feminicidio"]

In [27]:
# Function to extract titles from a single page
def extract_titles(page_url):
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"Failed to fetch page: {page_url}")
        return []
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Assuming article titles are within <h2> or similar tags
    # Adjust the selector as per the website's structure
    titles = soup.select('h2.entry-title a')  # Example: change this selector if needed
    return [title.get_text(strip=True) for title in titles]

# Iterate through pages and keywords
all_data = []
for keyword in keywords:
    for page in range(1, 11):  # First 10 pages
        url = base_url.format(page, keyword)
        print(f"Fetching: {url}")
        titles = extract_titles(url)
        
        # Append results with keyword information
        for title in titles:
            all_data.append({"keyword": keyword, "title": title})

# Convert the results to a DataFrame and save to a CSV
df = pd.DataFrame(all_data)
df.to_csv('8c_titles.csv', index=False)
print("Scraping completed. Titles saved to extracted_titles.csv.")

Fetching: https://8columnas.com.mx/page/1/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/1/?s=mujer
Fetching: https://8columnas.com.mx/page/2/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/2/?s=mujer
Fetching: https://8columnas.com.mx/page/3/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/3/?s=mujer
Fetching: https://8columnas.com.mx/page/4/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/4/?s=mujer
Fetching: https://8columnas.com.mx/page/5/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/5/?s=mujer
Fetching: https://8columnas.com.mx/page/6/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/6/?s=mujer
Fetching: https://8columnas.com.mx/page/7/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/7/?s=mujer
Fetching: https://8columnas.com.mx/page/8/?s=mujer
Failed to fetch page: https://8columnas.com.mx/page/8/?s=mujer
Fetching: https://8columnas.com.mx/page/9/?s=mujer
Failed to fetch page: https://8column

### La Opción de Chihuahua