| **Author**          | **Roll No**   | **Version** |
|---------------------|----------------|--------------|
| Vinayak Rana        | 24210114       | 1.0          |

#### Introduction
In this notebook we will scrape lecture transcripts from the official NPTEL website (https://nptel.ac.in/courses).

In [1]:
import requests
import pdfplumber
import os

In [2]:
def download_file_from_google_drive(file_id, destination='Data/Nptel_data/pdfs'):
    """Download a Google Drive file by its ID and write it to disk.

    Args:
        file_id: Google Drive file identifier; it is inserted into the
            ``uc?export=download`` URL.
        destination: Target path. When it names an existing directory, or
            ends with a path separator, the file is stored inside it as
            ``<file_id>.pdf``; otherwise it is used as the file path itself.

    Returns:
        The path the file was written to, or ``None`` when the server did
        not answer with HTTP 200.
    """
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(download_url, timeout=30)

    if response.status_code != 200:
        print(f'Failure for link {download_url}')
        return None

    # The default destination looks like a folder, and open() on a folder
    # raises, so resolve a folder to a concrete file inside it.
    if os.path.isdir(destination) or destination.endswith(('/', os.sep)):
        destination = os.path.join(destination, f"{file_id}.pdf")

    # Make sure the parent folder exists before writing.
    parent_folder = os.path.dirname(destination)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    with open(destination, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded successfully: {destination}")
    return destination

def extract_text_with_pdfplumber(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path (or file object) accepted by ``pdfplumber.open``.

    Returns:
        A single string with the text of all pages joined without a
        separator. Pages that yield no text contribute nothing.

    Raises:
        Whatever ``pdfplumber.open`` raises for unreadable input; this
        function does not catch it.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Some pdfplumber releases return None for a page without text;
        # "or ''" keeps the concatenation from raising TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)

In [3]:
def download_book(file_id, destination_folder, file_name):
    """Download one Google Drive file and save it as ``<file_name>.pdf``.

    Args:
        file_id: Google Drive file identifier; it is inserted into the
            ``uc?export=download`` URL.
        destination_folder: Folder to save into; created if it is missing.
        file_name: Base name of the saved file, without extension.

    Returns:
        The path of the saved file, or ``None`` when the server did not
        answer with HTTP 200.
    """
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # A timeout keeps one stalled connection from blocking a long batch run.
    response = requests.get(download_url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to download file from {download_url}")
        return None

    # Callers may pass a sub-folder that nobody created yet.
    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)
    destination = os.path.join(destination_folder, f"{file_name}.pdf")

    # NOTE(review): the recorded run shows files reported as downloaded that
    # then fail to parse, so a 200 response is apparently not always a valid
    # PDF. Consider validating the payload before saving.
    with open(destination, 'wb') as file:
        file.write(response.content)

    print(f"File downloaded successfully: {destination}")
    return destination

def extract_text(pdf_file):
    """Best-effort text extraction: return the PDF's text, or "" on failure.

    Args:
        pdf_file: Path (or file object) accepted by ``pdfplumber.open``.

    Returns:
        The text of all pages joined without a separator. If the file
        cannot be opened or parsed, a message is printed and an empty
        string is returned.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Some pdfplumber releases return None for a page without text;
            # treat that as empty instead of failing the whole document.
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as exc:
        # Catch Exception, not everything: Ctrl-C / kernel interrupt must
        # still be able to stop a long batch run. The cause is reported so
        # a bad download can be told apart from a parser problem.
        print(f'Corrupt or wrong pdf: {pdf_file} ({exc})')
        return ""

def book_scrap(file_ids, destination_folder='Data/Nptel_data/', start_index=1004):
    """Download each Drive file as a PDF and save its extracted text.

    PDFs go to ``<destination_folder>/Pdfs`` and text files to
    ``<destination_folder>/Pdf text``. Files are named by a running number.

    Args:
        file_ids: Iterable of Google Drive file identifiers.
        destination_folder: Root output folder, with or without a trailing
            slash.
        start_index: Number used to name the first file; later files count
            up from it. Defaults to 1004, the offset that was previously
            hard-coded.

    Returns:
        None. Progress is reported via ``print``.
    """
    # os.path.join works whether or not the root has a trailing slash.
    pdf_folder = os.path.join(destination_folder, 'Pdfs')
    text_folder = os.path.join(destination_folder, 'Pdf text')
    # Both sub-folders must exist before anything is written into them.
    for folder in (pdf_folder, text_folder):
        os.makedirs(folder, exist_ok=True)

    for file_number, file_id in enumerate(file_ids, start=start_index):
        file_name = str(file_number)

        pdf_path = download_book(file_id, pdf_folder, file_name)
        if not pdf_path:
            # Download failed; the number is still consumed so that names
            # keep matching positions in file_ids.
            continue

        extracted_text = extract_text(pdf_path)

        txt_file_path = os.path.join(text_folder, f"{file_name}.txt")
        with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(extracted_text)

        print(f"Text extracted and saved to: {txt_file_path}")



In [4]:
def find_id(links_path='Data/Nptel_data/links.txt'):
    """Read share links from a text file and return their Drive file IDs.

    Each non-blank line is expected to hold one link of the form
    ``https://drive.google.com/file/d/<ID>/view``.

    Args:
        links_path: Text file with one link per line. Defaults to the path
            that was previously hard-coded.

    Returns:
        A list with one entry per non-blank line, in file order.
    """
    file_id = []
    with open(links_path, 'r') as f:
        for line in f:
            link = line.strip()
            if not link:
                continue
            if '/d/' in link:
                # Take the path segment right after "/d/". Unlike a fixed
                # slice, this survives suffixes such as "/view?usp=sharing"
                # or "/preview".
                tail = link.split('/d/', 1)[1]
                file_id.append(tail.split('/', 1)[0].split('?', 1)[0])
            else:
                # Unknown link shape: keep the original fixed-offset slice
                # so the list keeps one entry per line; callers name files
                # by position.
                file_id.append(link[32:-5])
    return file_id

In [5]:
# Collect the Drive file IDs from the links listed in Data/Nptel_data/links.txt.
file_ids = find_id()

In [6]:
# Process only the entries from position 1004 onwards. book_scrap numbers its
# output files starting at 1004, so file names line up with positions in
# file_ids; change the slice and that offset together.
book_scrap(file_ids[1004:])

File downloaded successfully: Data/Nptel_data/Pdfs/1004.pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1004.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1005.pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1005.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1006.pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1006.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1007.pdf
Corrupt or wrong pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1007.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1008.pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1008.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1009.pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1009.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1010.pdf
Text extracted and saved to: Data/Nptel_data/Pdf text/1010.txt
File downloaded successfully: Data/Nptel_data/Pdfs/1011.pdf
Corrupt or wrong pdf
Text extracted and saved to: Data/Npt

In [7]:
# NOTE(review): bare tuple with no explanation — presumably a scratch note
# (possibly file numbers to re-check); confirm its purpose or remove the cell.
372,373,374

(372, 373, 374)