**Downloading Data from the Database with links from Webscraper**

In [ ]:
import pandas as pd
import requests
import os

# Function to download files from the links in the Excel file
def download_files_from_excel(excel_file, output_folder, url_column='Links-href', timeout=30):
    """Download every file linked in an Excel sheet into ``output_folder``.

    Each row's URL is fetched with ``requests`` and stored under the last path
    component of the URL. A failure on one link is reported and the loop
    continues with the next one.

    Args:
        excel_file: Path to the ``.xlsx`` file containing the links.
        output_folder: Directory the downloads are written to; created if
            it does not exist.
        url_column: Name of the column holding the URLs.
        timeout: Seconds to wait for the server before giving up on a link.

    Raises:
        KeyError: If ``url_column`` is not a column of the sheet.
    """
    df = pd.read_excel(excel_file, engine='openpyxl')

    # Create the destination up front so the first write cannot fail on a
    # missing folder. An empty string means "current directory".
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for file_url in df[url_column]:
        # Empty cells are read as NaN (a float); skip anything that is not a
        # usable URL string instead of crashing in os.path.basename.
        if not isinstance(file_url, str) or not file_url.strip():
            print(f"Skipped row without a link: {file_url!r}")
            continue
        file_url = file_url.strip()

        file_name = os.path.basename(file_url)  # Last path component of the URL
        if not file_name:
            # A URL ending in "/" has no file name; writing would target the folder itself.
            print(f"Skipped (no file name in URL): {file_url}")
            continue
        file_path = os.path.join(output_folder, file_name)

        try:
            # Without a timeout a single stalled server blocks the whole batch.
            response = requests.get(file_url, timeout=timeout)
            response.raise_for_status()  # Turn HTTP error statuses into exceptions

            with open(file_path, 'wb') as f:
                f.write(response.content)
        except requests.exceptions.RequestException as e:
            # Must come before OSError: RequestException is a subclass of it.
            print(f"Failed to download {file_name}: {e}")
        except OSError as e:
            print(f"Failed to save {file_name}: {e}")
        else:
            print(f"Downloaded: {file_name}")

def add_pdf_extension_to_files(folder_path):
    """Append ``.pdf`` to every extension-less file directly inside ``folder_path``.

    A directory entry is renamed only if it is a regular file and its name
    contains no ``.``. Sub-directories and files that already contain a dot
    are reported as skipped. A file is also skipped when ``<name>.pdf``
    already exists, so an existing PDF is never overwritten.

    Args:
        folder_path: Directory whose entries are inspected (not recursive).
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Only plain files without any dot in the name are candidates.
        if not os.path.isfile(file_path) or '.' in filename:
            print(f"Skipped: {filename}")
            continue

        new_file_path = f"{file_path}.pdf"

        # os.rename silently replaces an existing target on POSIX and raises
        # on Windows; refuse explicitly so behaviour is the same everywhere
        # and no data is lost.
        if os.path.exists(new_file_path):
            print(f"Skipped (target already exists): {filename}")
            continue

        os.rename(file_path, new_file_path)
        print(f"Renamed: {filename} -> {os.path.basename(new_file_path)}")

# Input/output locations used by the two helpers above:
#   excel_file    - spreadsheet passed to download_files_from_excel
#   output_folder - destination passed to download_files_from_excel
#   folder_path   - directory passed to add_pdf_extension_to_files
excel_file = r"C:\Users\zdrop\UNI\masters\Thesis_files\AD_Links_320.xlsx"
output_folder = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320"
folder_path = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A350"

# Uncomment the step you want to run
#download_files_from_excel(excel_file, output_folder)
#add_pdf_extension_to_files(folder_path)

**Removing non-English files**
- move all non-English files into a separate directory

In [None]:
import os
import shutil
from langdetect import detect
import fitz  # PyMuPDF

# === CONFIGURATION ===
# Folder that is scanned for PDF files.
source_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\A320"
# Folder that receives the PDFs detected as non-English. It is an absolute
# path, so it is assigned directly: wrapping it in os.path.join(source_dir, ...)
# only discarded source_dir on Windows and produced a bogus nested path elsewhere.
non_english_dir = r"C:\Users\zdrop\OneDrive - TU Wien\MASTER THESIS\ADs\non_english"
char_threshold = 1000  # Number of characters to use for detection

# Create target folder if it doesn't exist
os.makedirs(non_english_dir, exist_ok=True)

# === FUNCTION TO EXTRACT TEXT FROM PDF ===
def extract_text(pdf_path, max_chars=1000):
    """Return up to ``max_chars`` characters of text from the start of a PDF.

    Pages are read in order and reading stops as soon as enough text has been
    collected, so long documents are not parsed in full.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return.

    Returns:
        The leading text of the document, truncated to ``max_chars``; an empty
        string if the file could not be opened or read (the error is printed).
    """
    try:
        # The context manager closes the document on every exit path; an open
        # handle would otherwise linger and can block moving the file later.
        with fitz.open(pdf_path) as doc:
            chunks = []
            collected = 0
            for page in doc:
                page_text = page.get_text()
                chunks.append(page_text)
                collected += len(page_text)
                if collected >= max_chars:
                    break
        return "".join(chunks)[:max_chars]
    except Exception as e:
        # Best effort: an unreadable PDF is reported and treated as empty.
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === MAIN LOOP ===
# Scan source_dir for PDFs, detect the language of a text sample from each one
# and move every file that is not detected as English into non_english_dir.

# langdetect is non-deterministic unless seeded; fix the seed so the same PDF
# always gets the same verdict (the verdict decides whether a file is moved).
from langdetect import DetectorFactory
DetectorFactory.seed = 0

for filename in os.listdir(source_dir):
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(source_dir, filename)
    # Honour the configured sample size instead of the function's default.
    sample_text = extract_text(file_path, max_chars=char_threshold)

    if not sample_text.strip():
        print(f"Skipping (empty or unreadable): {filename}")
        continue

    # Keep detection and moving in separate try blocks so a failed move is
    # not misreported as a language-detection failure.
    try:
        lang = detect(sample_text)
    except Exception as e:
        print(f"Language detection failed for {filename}: {e}")
        continue

    if lang == "en":
        print(f"Detected English – keeping: {filename}")
        continue

    print(f"Detected {lang} – moving: {filename}")
    try:
        shutil.move(file_path, os.path.join(non_english_dir, filename))
    except OSError as e:
        # shutil.Error is a subclass of OSError, so this covers both.
        print(f"Could not move {filename}: {e}")