In [62]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import pandas as pd

# Open the webpage with Selenium (Safari driver)
driver = webdriver.Safari()

columnist_name = 'bercan-tutar'

# The URL of the webpage where the articles are listed
url = f'https://www.sabah.com.tr/yazarlar/{columnist_name}/arsiv/getall'
driver.get(url)

# Scroll down until all articles are loaded
SCROLL_PAUSE_TIME = 2
last_height = driver.execute_script("return document.body.scrollHeight")

MAX_SCROLLS = 500

for scroll in range(MAX_SCROLLS):
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the new articles to load
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get the page source after all articles are loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Close the browser
driver.quit()

# Create lists to store article titles and dates
article_titles = []
article_dates = []

# Find all div elements that contain article information (based on your screenshot)
for article in soup.find_all('div', class_='col-sm-12'):
    # Extract the title from the 'strong' tag with the class 'postCaption'
    title_tag = article.find('strong', class_='postCaption')
    if title_tag:
        title = title_tag.text.strip()
        article_titles.append(title)
    
    # Extract the date from the 'span' tag with the class 'postTime'
    date_tag = article.find('span', class_='postTime')
    if date_tag:
        date = date_tag.text.strip()
        article_dates.append(date)

# Create a DataFrame to organize the data
df = pd.DataFrame({
    'Article Title': article_titles,
    'Date': article_dates
})

# Save the data to a CSV file
df.to_csv(f'../columnist_data/full_articles/{columnist_name}_articles.csv', index=False)
print("CSV file saved successfully.")


CSV file saved successfully.


In [63]:
import requests
import re

# Extended dictionary for replacing Turkish characters and accented characters with English equivalents
turkish_to_english_map = str.maketrans({
    'ı': 'i',
    'ü': 'u',
    'ö': 'o',
    'ç': 'c',
    'ğ': 'g',
    'ş': 's',
    'İ': 'i',
    'Ü': 'u',
    'Ö': 'o',
    'Ç': 'c',
    'Ğ': 'g',
    'Ş': 's',
    'Û': 'u',  # Special case for Û
    'â': 'a',
    'î': 'i',
    'ô': 'o',
    'û': 'u',
    'é': 'e'
})

# Function to generate the article URL
def generate_url(title, date):
    # Convert the title to lowercase, replace Turkish and accented characters, spaces with hyphens, and remove special characters
    title = title.lower().translate(turkish_to_english_map)
    title = re.sub(r'[^a-z0-9\s-]', '', title)  # Remove any remaining special characters
    title = title.replace(' ', '-')  # Replace spaces with hyphens
    
    # Convert the date to numeric format (YYYY/MM/DD)
    date_parts = date.split()
    month_map = {
        'Ocak': '01', 'Şubat': '02', 'Mart': '03', 'Nisan': '04', 'Mayıs': '05', 'Haziran': '06',
        'Temmuz': '07', 'Ağustos': '08', 'Eylül': '09', 'Ekim': '10', 'Kasım': '11', 'Aralık': '12'
    }
    
    day = date_parts[0]
    month = month_map[date_parts[1]]
    year = date_parts[2]
    
    # Construct the URL
    url = f'https://www.sabah.com.tr/yazarlar/{columnist_name}/{year}/{month}/{day}/{title}'
    return url

# Example usage with the rest of your code unchanged
# Load the existing CSV with article titles and dates
df = pd.read_csv(f'../columnist_data/full_articles/{columnist_name}_articles.csv')

# List to store full article content
article_urls = []
article_contents = []

start_from = 0
current = 0
# Iterate over each article in the CSV
for row in df.itertuples():
    current += 1
    if current < start_from:
        pass
    else:
        article_title = row[1]
        article_date = row[2]
        
        # Generate the article URL
        url = generate_url(article_title, article_date)
        article_urls.append(url)
        
        # Send request to the article page
        response = requests.get(url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the article content from the div with class 'newsBox'
            article_body = soup.find('div', class_='newsBox')

            if article_body:
                article_text = article_body.get_text(separator='\n', strip=True)
                article_contents.append(article_text)
                print(f"Successfully retrieved the article: {article_title}")
            else:
                article_contents.append('Content not found')
            
        else:
            print(f"Failed to retrieve the article: {url}")
            article_contents.append('Content not found')

        # Be polite and avoid overwhelming the server by adding a short delay
        time.sleep(0.1)

# Add the URLs and article content to the DataFrame
df['URL'] = article_urls
df['Article Content'] = article_contents

# Save the DataFrame with full article data to a new CSV file
df.to_csv(f'../columnist_data/full_articles/{columnist_name}_full_articles.csv', index=False)
print("Full article content collected and saved to CSV.")


Successfully retrieved the article: İsrail ‘istenmeyen devlet’ ilan edilmeli
Successfully retrieved the article: İsrail ‘istenmeyen devlet’ ilan edilmeli
Successfully retrieved the article: İsrail ‘istenmeyen devlet’ ilan edilmeli
Successfully retrieved the article: Siyonistlere okkalı Osmanlı tokadı
Successfully retrieved the article: ABD’deki ‘İsrail krizi’
Successfully retrieved the article: Batı’nın yeni laneti
Successfully retrieved the article: İsrail’i kara kara düşündüren antisiyonizm dalgası
Successfully retrieved the article: ABD, İsrail’i terk ediyor
Successfully retrieved the article: Netanyahu da Bush’un hezimetiyle karşılaşacak
Successfully retrieved the article: Çin’in ‘İsrail sorunu’
Successfully retrieved the article: İşte İsrail’in bölgesel savaştaki 5 hedefi
Successfully retrieved the article: Rusya taşın altına elini koyacak mı?
Successfully retrieved the article: İsrail’in açmazları
Successfully retrieved the article: Dost kazığı
Successfully retrieved the article:

In [55]:
len(article_contents)
len(article_urls)

360