# Training a GBV classifier from a previously labeled dataset

In [12]:
# Used libraries
import pandas as pd
import unicodedata
import numpy as np
import openpyxl
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [5]:
# Load the labeled data
file_path = 'gbv_df.xlsx'
gbv_df = pd.read_excel(file_path)

# Inspect the data
print(gbv_df.head())

                                                link state  \
0  https://web.archive.org/web/20200901174745/htt...  CHIH   
1  https://web.archive.org/web/20200721132743/htt...  CHIH   
2    http://laopcion.com.mx/noticia/98812?archivo=si  CHIH   
3  https://web.archive.org/web/20200901181614/htt...  CHIH   
4  https://web.archive.org/web/20200901184921/htt...  CHIH   

                                               title      frame  
0  Imparte fiscalía pláticas preventivas a emplea...   Temático  
1  La atropella su pareja y la deja lesionada al ...  Episódico  
2  Detienen a chofer de camión urbano por hostiga...  Episódico  
3  Inaugura Duarte Centro de Salud y Albergue Cie...   Temático  
4  Presentan la conferencia La grandeza de ser mu...   Temático  


In [6]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the 'title' column
gbv_df['cleaned_title'] = gbv_df['title'].apply(preprocess_text)

# Display the cleaned text
print(gbv_df[['title', 'cleaned_title']].head())

                                               title  \
0  Imparte fiscalía pláticas preventivas a emplea...   
1  La atropella su pareja y la deja lesionada al ...   
2  Detienen a chofer de camión urbano por hostiga...   
3  Inaugura Duarte Centro de Salud y Albergue Cie...   
4  Presentan la conferencia La grandeza de ser mu...   

                                       cleaned_title  
0  imparte fiscala plticas preventivas a empleado...  
1  la atropella su pareja y la deja lesionada al ...  
2  detienen a chofer de camin urbano por hostigam...  
3  inaugura duarte centro de salud y albergue cie...  
4  presentan la conferencia la grandeza de ser mu...  


In [13]:
# Function to clean frame labels and remove accents
def clean_labels(label):
    # Normalize the text to decompose accents
    label = unicodedata.normalize('NFD', label)
    # Remove diacritics (accents) by filtering characters
    label = ''.join(char for char in label if unicodedata.category(char) != 'Mn')
    # Convert to lowercase and strip whitespace
    label = label.lower().strip()
    return label

# Apply the cleaning function to the 'frame' column
gbv_df['frames'] = gbv_df['frame'].apply(clean_labels)

In [21]:
# Define features (X) and labels (y)
X = gbv_df['cleaned_title']
y = gbv_df['frames']  

# Encode labels if necessary
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Converts to numerical values
print(label_encoder.classes_)  # Check the mapping

['episodico' 'tematico']


In [22]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

Training samples: 777, Test samples: 195


In [23]:
# Use TF-IDF for feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit the vectorizer on training data and transform both training and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF matrix shape (training): {X_train_tfidf.shape}")

TF-IDF matrix shape (training): (777, 1775)


In [24]:
# Train a logistic regression model
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.9128205128205128
Classification Report:
               precision    recall  f1-score   support

   episodico       0.91      0.93      0.92       103
    tematico       0.92      0.89      0.91        92

    accuracy                           0.91       195
   macro avg       0.91      0.91      0.91       195
weighted avg       0.91      0.91      0.91       195



In [94]:
import joblib

# Save the trained model and vectorizer
joblib.dump(logreg, 'logreg_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# Scraping for more news titles

# 8 Columnas

In [44]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [56]:
## Successfull scraping

import requests
from bs4 import BeautifulSoup

def extract_titles_and_links(base_url, max_pages=10):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    results = []

    # Iterate through pages
    for page_number in range(1, max_pages + 1):
        # Construct URL based on page number
        if page_number == 1:
            url = base_url
        else:
            url = f"https://8columnas.com.mx/page/{page_number}/?s=mujer"  # Adjust for pagination
        
        print(f"Fetching: {url}")
        response = requests.get(url, headers=headers)
        
        # Check for response errors
        if response.status_code != 200:
            print(f"Failed to fetch page: {url}, Status Code: {response.status_code}")
            continue
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("h3", class_="entry-title")
        
        # Extract titles and links
        for article in articles:
            title_tag = article.find("a", href=True)
            if title_tag:
                title = title_tag.get_text(strip=True)
                link = title_tag["href"]
                results.append({"link": link, "title": title, "state": "edo"})
    
    return results

# Save data to CSV
def save_to_csv(data, filename="8c_titles_mujer.csv"):
    import csv
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["link", "title", "state"])
        writer.writeheader()
        writer.writerows(data)

# Main Function
if __name__ == "__main__":
    # Base URL includes the keyword
    base_url = "https://8columnas.com.mx/?s=mujer"
    articles = extract_titles_and_links(base_url, max_pages=10)  # Adjust number of pages as needed
    save_to_csv(articles)
    print("Scraping completed and data saved to '8c_titles_mujer.csv'.")


Fetching: https://8columnas.com.mx/?s=mujer
Fetching: https://8columnas.com.mx/page/2/?s=mujer
Fetching: https://8columnas.com.mx/page/3/?s=mujer
Fetching: https://8columnas.com.mx/page/4/?s=mujer
Fetching: https://8columnas.com.mx/page/5/?s=mujer
Fetching: https://8columnas.com.mx/page/6/?s=mujer
Fetching: https://8columnas.com.mx/page/7/?s=mujer
Fetching: https://8columnas.com.mx/page/8/?s=mujer
Fetching: https://8columnas.com.mx/page/9/?s=mujer
Fetching: https://8columnas.com.mx/page/10/?s=mujer
Scraping completed and data saved to '8c_titles_mujer.csv'.


In [57]:
## Trying with more keywords

import requests
from bs4 import BeautifulSoup
import csv

def extract_titles_and_links(base_url, keyword, max_pages=10):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    results = []

    # Iterate through pages
    for page_number in range(1, max_pages + 1):
        # Construct URL for each page
        if page_number == 1:
            url = f"{base_url}?s={keyword}"
        else:
            url = f"{base_url}/page/{page_number}/?s={keyword}"
        
        print(f"Fetching: {url}")
        response = requests.get(url, headers=headers)
        
        # Check for response errors
        if response.status_code != 200:
            print(f"Failed to fetch page: {url}, Status Code: {response.status_code}")
            continue
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("h3", class_="entry-title")
        
        # Extract titles and links
        for article in articles:
            title_tag = article.find("a", href=True)
            if title_tag:
                title = title_tag.get_text(strip=True)
                link = title_tag["href"]
                results.append({"link": link, "title": title, "keyword": keyword})
    
    return results

# Save data to CSV
def save_to_csv(data, filename="8c_titles.csv"):
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["link", "title", "keyword"])
        writer.writeheader()
        writer.writerows(data)

# Main Function
if __name__ == "__main__":
    # Base URL and keywords
    base_url = "https://8columnas.com.mx"
    keywords = ["mujer", "niña", "feminicidio", "violencia+de+género"] 
    all_results = []

    for keyword in keywords:
        print(f"Fetching articles for keyword: {keyword}")
        results = extract_titles_and_links(base_url, keyword, max_pages=10)
        all_results.extend(results)  # Combine results for all keywords
    
    # Save all results into a single CSV
    save_to_csv(all_results, filename="8c_titles.csv")
    print("Scraping completed and data saved to '8c_titles.csv'.")

Fetching articles for keyword: mujer
Fetching: https://8columnas.com.mx?s=mujer
Fetching: https://8columnas.com.mx/page/2/?s=mujer
Fetching: https://8columnas.com.mx/page/3/?s=mujer
Fetching: https://8columnas.com.mx/page/4/?s=mujer
Fetching: https://8columnas.com.mx/page/5/?s=mujer
Fetching: https://8columnas.com.mx/page/6/?s=mujer
Fetching: https://8columnas.com.mx/page/7/?s=mujer
Fetching: https://8columnas.com.mx/page/8/?s=mujer
Fetching: https://8columnas.com.mx/page/9/?s=mujer
Fetching: https://8columnas.com.mx/page/10/?s=mujer
Fetching articles for keyword: niña
Fetching: https://8columnas.com.mx?s=niña
Fetching: https://8columnas.com.mx/page/2/?s=niña
Fetching: https://8columnas.com.mx/page/3/?s=niña
Fetching: https://8columnas.com.mx/page/4/?s=niña
Fetching: https://8columnas.com.mx/page/5/?s=niña
Fetching: https://8columnas.com.mx/page/6/?s=niña
Fetching: https://8columnas.com.mx/page/7/?s=niña
Fetching: https://8columnas.com.mx/page/8/?s=niña
Fetching: https://8columnas.com.

In [58]:
### Removing "Top read news" that also got fetched articles

import requests
from bs4 import BeautifulSoup
import csv

def extract_titles_and_links(base_url, keyword, max_pages=10):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    results = []

    # Iterate through pages
    for page_number in range(1, max_pages + 1):
        # Construct URL for each page
        if page_number == 1:
            url = f"{base_url}?s={keyword}"
        else:
            url = f"{base_url}/page/{page_number}/?s={keyword}"
        
        print(f"Fetching: {url}")
        response = requests.get(url, headers=headers)
        
        # Check for response errors
        if response.status_code != 200:
            print(f"Failed to fetch page: {url}, Status Code: {response.status_code}")
            continue
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("h3", class_="entry-title")
        
        # Extract titles and links
        for article in articles:
            title_tag = article.find("a", href=True)
            if title_tag:
                title = title_tag.get_text(strip=True)
                link = title_tag["href"]
                results.append({"link": link, "title": title, "keyword": keyword})
    
    return results

# Save data to CSV
def save_to_csv(data, filename="8c_titles.csv"):
    # To remove titles that appear on the side as "most popular"
    excluded_titles = [
        "Rinde protesta como presidente de Almoloya de Juárez, Adolfo Solís",
        "Se mantiene en Edoméx tendencia a la baja en homicidio doloso",
        "Petro venderá la residencia del embajador en México para mejorar servicios consulares",
        "TikTok enfrenta investigación de la UE por presunta interferencia en elecciones presidenciales de Rumanía"
    ]

    # Filter out excluded titles
    filtered_data = [row for row in data if row["title"] not in excluded_titles]

    # Write the filtered data to CSV
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["link", "title", "keyword"])
        writer.writeheader()
        writer.writerows(filtered_data)

# Main Function
if __name__ == "__main__":
    # Base URL and keywords
    base_url = "https://8columnas.com.mx"
    keywords = ["mujer", "niña", "feminicidio", "violencia+de+género"]
    all_results = []

    for keyword in keywords:
        print(f"Fetching articles for keyword: {keyword}")
        results = extract_titles_and_links(base_url, keyword, max_pages=10)
        all_results.extend(results)  # Combine results for all keywords
    
    # Save filtered results to CSV
    save_to_csv(all_results, filename="8c_titles.csv")
    print("Scraping completed, filtered data saved to '8c_titles.csv'.")

Fetching articles for keyword: mujer
Fetching: https://8columnas.com.mx?s=mujer
Fetching: https://8columnas.com.mx/page/2/?s=mujer
Fetching: https://8columnas.com.mx/page/3/?s=mujer
Fetching: https://8columnas.com.mx/page/4/?s=mujer
Fetching: https://8columnas.com.mx/page/5/?s=mujer
Fetching: https://8columnas.com.mx/page/6/?s=mujer
Fetching: https://8columnas.com.mx/page/7/?s=mujer
Fetching: https://8columnas.com.mx/page/8/?s=mujer
Fetching: https://8columnas.com.mx/page/9/?s=mujer
Fetching: https://8columnas.com.mx/page/10/?s=mujer
Fetching articles for keyword: niña
Fetching: https://8columnas.com.mx?s=niña
Fetching: https://8columnas.com.mx/page/2/?s=niña
Fetching: https://8columnas.com.mx/page/3/?s=niña
Fetching: https://8columnas.com.mx/page/4/?s=niña
Fetching: https://8columnas.com.mx/page/5/?s=niña
Fetching: https://8columnas.com.mx/page/6/?s=niña
Fetching: https://8columnas.com.mx/page/7/?s=niña
Fetching: https://8columnas.com.mx/page/8/?s=niña
Fetching: https://8columnas.com.

In [92]:
# Read the existing CSV file
df = pd.read_csv('8c_titles.csv')

# Add the new 'observations' column with the value 'edo' for all rows
df['state'] = 'edo'

# Save the updated DataFrame to the same CSV file
df.to_csv('8c_titles.csv', index=False)

print("Added 'state' column with value 'edo' to the CSV.")

Added 'state' column with value 'edo' to the CSV.


# La Opción de Chihuahua

## Key word: "mujer"

In [75]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

In [None]:
# Function to fetch titles
def fetch_titles_with_selenium(base_url, keyword, max_pages=10):
    # Set up WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    # Construct the initial URL with the keyword
    url = f"{base_url}?q={keyword}&cx=&ie=UTF-8"
    print(f"Fetching: {url}")
    driver.get(url)
    
    titles = []
    current_page = 1

    while current_page <= max_pages:
        try:
            # Wait for the page navigation and content to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.gs-title"))
            )
            time.sleep(2)  # Additional wait for safety
            
            # Find all title elements on the current page
            articles = driver.find_elements(By.CSS_SELECTOR, "a.gs-title")
            for article in articles:
                title = article.text
                link = article.get_attribute("href")
                if title and link:  # Ensure both title and link are valid
                    titles.append({"link": link, "title": title, "keyword": keyword, "state": "chih"})
            
            print(f"Fetched titles from page {current_page}.")
            
            # Try to find the next page button dynamically
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, f"div.gsc-cursor-page[aria-label='Página {current_page + 1}']")
                ActionChains(driver).move_to_element(next_button).click().perform()
                current_page += 1
                time.sleep(2)  # Allow time for the next page to load
            except Exception:
                print(f"No more pages available after page {current_page}.")
                break
        
        except Exception as e:
            print(f"Error during scraping on page {current_page}: {e}")
            break
    
    driver.quit()
    return titles

# Save data to CSV
def save_to_csv(data, filename="titles.csv"):
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["link", "title", "keyword", "state"])
        writer.writeheader()
        writer.writerows(data)

# Successfully save the 100 notes for the keyword "mujer"
if __name__ == "__main__":
    base_url = "https://laopcion.com.mx/buscar.html"
    keyword = "mujer"  # Replace with the desired keyword
    max_pages = 10
    titles = fetch_titles_with_selenium(base_url, keyword, max_pages)
    
    if titles:
        print(f"Fetched {len(titles)} titles.")
        save_to_csv(titles, f"{keyword}_titles.csv")
        print(f"Data saved to '{keyword}_titles.csv'.")
    else:
        print("No titles were fetched.")

## Key words: "niña" and "Violencia de género"

In [85]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from fake_useragent import UserAgent

In [90]:
# Function to fetch titles
def fetch_titles_with_selenium(base_url, keyword, max_pages=10):
    # Set up WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    # Construct the initial URL with the keyword
    url = f"{base_url}?q={keyword}&cx=&ie=UTF-8"
    print(f"Fetching: {url}")
    driver.get(url)
    
    titles = []
    current_page = 1

    while current_page <= max_pages:
        try:
            # Wait for the page navigation and content to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.gs-title"))
            )
            time.sleep(2)  # Additional wait for safety
            
            # Find all title elements on the current page
            articles = driver.find_elements(By.CSS_SELECTOR, "a.gs-title")
            for article in articles:
                title = article.text
                link = article.get_attribute("href")
                if title and link:  # Ensure both title and link are valid
                    titles.append({"link": link, "title": title, "keyword": keyword, "state": "chih"})
            
            print(f"Fetched titles from page {current_page}.")
            
            # Try to find the next page button dynamically
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, f"div.gsc-cursor-page[aria-label='Página {current_page + 1}']")
                ActionChains(driver).move_to_element(next_button).click().perform()
                current_page += 1
                time.sleep(2)  # Allow time for the next page to load
            except Exception:
                print(f"No more pages available after page {current_page}.")
                break
        
        except Exception as e:
            print(f"Error during scraping on page {current_page}: {e}")
            break
    
    driver.quit()
    return titles

# Save data to CSV
def save_to_csv(data, filename="titles.csv"):
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["link", "title", "keyword", "state"])
        writer.writeheader()
        writer.writerows(data)

# Main function to fetch for multiple keywords
if __name__ == "__main__":
    base_url = "https://laopcion.com.mx/buscar.html"
    keywords = ["niña", "violencia+de+género"]  # The desired keywords
    max_pages = 10
    
    all_titles = []
    
    for keyword in keywords:
        print(f"Fetching titles for keyword: {keyword}")
        titles = fetch_titles_with_selenium(base_url, keyword, max_pages)
        
        if titles:
            print(f"Fetched {len(titles)} titles for keyword '{keyword}'.")
            all_titles.extend(titles)
        else:
            print(f"No titles found for keyword '{keyword}'.")
    
    # Save all fetched titles to CSV
    if all_titles:
        save_to_csv(all_titles, "niña_vdg_titles.csv")
        print("Data saved to 'niña_vdg_titles.csv'.")
    else:
        print("No data to save.")

Fetching titles for keyword: niña
Fetching: https://laopcion.com.mx/buscar.html?q=niña&cx=&ie=UTF-8
Fetched titles from page 1.
Fetched titles from page 2.
Fetched titles from page 3.
Fetched titles from page 4.
Fetched titles from page 5.
Fetched titles from page 6.
Fetched titles from page 7.
Fetched titles from page 8.
Fetched titles from page 9.
Fetched titles from page 10.
No more pages available after page 10.
Fetched 100 titles for keyword 'niña'.
Fetching titles for keyword: violencia+de+género
Fetching: https://laopcion.com.mx/buscar.html?q=violencia+de+género&cx=&ie=UTF-8
Fetched titles from page 1.
Fetched titles from page 2.
Fetched titles from page 3.
Fetched titles from page 4.
Fetched titles from page 5.
Fetched titles from page 6.
Fetched titles from page 7.
Fetched titles from page 8.
Fetched titles from page 9.
Fetched titles from page 10.
No more pages available after page 10.
Fetched 100 titles for keyword 'violencia+de+género'.
Data saved to 'niña_vdg_titles.csv'.


## Keyword: "feminicidio"

In [91]:
# Function to fetch titles
def fetch_titles_with_selenium(base_url, keyword, max_pages=10):
    # Set up WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    # Construct the initial URL with the keyword
    url = f"{base_url}?q={keyword}&cx=&ie=UTF-8"
    print(f"Fetching: {url}")
    driver.get(url)
    
    titles = []
    current_page = 1

    while current_page <= max_pages:
        try:
            # Wait for the page navigation and content to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.gs-title"))
            )
            time.sleep(2)  # Additional wait for safety
            
            # Find all title elements on the current page
            articles = driver.find_elements(By.CSS_SELECTOR, "a.gs-title")
            for article in articles:
                title = article.text
                link = article.get_attribute("href")
                if title and link:  # Ensure both title and link are valid
                    titles.append({"link": link, "title": title, "keyword": keyword, "state": "chih"})
            
            print(f"Fetched titles from page {current_page}.")
            
            # Try to find the next page button dynamically
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, f"div.gsc-cursor-page[aria-label='Página {current_page + 1}']")
                ActionChains(driver).move_to_element(next_button).click().perform()
                current_page += 1
                time.sleep(2)  # Allow time for the next page to load
            except Exception:
                print(f"No more pages available after page {current_page}.")
                break
        
        except Exception as e:
            print(f"Error during scraping on page {current_page}: {e}")
            break
    
    driver.quit()
    return titles

# Save data to CSV
def save_to_csv(data, filename="titles.csv"):
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["link", "title", "keyword", "state"])
        writer.writeheader()
        writer.writerows(data)

# Main function to scrape "feminicidio" titles
if __name__ == "__main__":
    base_url = "https://laopcion.com.mx/buscar.html"
    keyword = "feminicidio"  # Fetch for "feminicidio"
    max_pages = 10
    titles = fetch_titles_with_selenium(base_url, keyword, max_pages)
    
    if titles:
        print(f"Fetched {len(titles)} titles.")
        save_to_csv(titles, f"{keyword}_titles.csv")
        print(f"Data saved to '{keyword}_titles.csv'.")
    else:
        print("No titles were fetched.")

Fetching: https://laopcion.com.mx/buscar.html?q=feminicidio&cx=&ie=UTF-8
Fetched titles from page 1.
Fetched titles from page 2.
Fetched titles from page 3.
Fetched titles from page 4.
Fetched titles from page 5.
Fetched titles from page 6.
Fetched titles from page 7.
Fetched titles from page 8.
Fetched titles from page 9.
Fetched titles from page 10.
No more pages available after page 10.
Fetched 100 titles.
Data saved to 'feminicidio_titles.csv'.


## Merging the new titles from "8 Columnas" and "La Opción de Chihuahua"

In [93]:
import pandas as pd

# List of CSV files to combine
csv_files = ["feminicidio_titles.csv", "mujer_titles.csv", "niña_vdg_titles.csv"]

# List to store DataFrames
dfs = []

# Read and append each CSV file into the list of DataFrames
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv("loc_titles.csv", index=False)

print("All titles have been combined into 'loc_titles.csv'.")

All titles have been combined into 'loc_titles.csv'.
