Import Libraries

In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os

Data Scrapping : Source - indiabix

In [27]:
def retrieve_content(url):
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        questions = soup.find_all('div', class_='bix-div-container')
        data = []
        
        for question in questions:
            q_number = question.find('div', class_='bix-td-qno').text.strip()
            q_text = question.find('div', class_='bix-td-qtxt').text.strip()
            options_div = question.find('div', class_='bix-tbl-options')
            
            options = []
            if options_div:
                for option in options_div.find_all('div', class_='bix-opt-row'):
                    option_text = option.text.strip()
                    if option_text:
                        options.append(option_text)
            
            answer = question.find('input', class_='jq-hdnakq')['value']
            explanation = question.find('div', class_='bix-div-answer').text.strip() if question.find('div', class_='bix-div-answer') else 'No explanation provided'
            
            data.append({
                'Question Number': q_number,
                'Question Text': q_text,
                'Options': '; '.join(options),
                'Answer': answer,
                'Explanation': explanation
            })
        
        return data
    except requests.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return []

In [28]:
def create_or_append_xls_file(data, filename):
    try:
        if os.path.exists(filename):
            df_existing = pd.read_excel(filename)
            df_new = pd.DataFrame(data)
            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
            df_combined.to_excel(filename, index=False)
        else:
            df_new = pd.DataFrame(data)
            df_new.to_excel(filename, index=False)
        
        #print(f"Excel file '{filename}' updated successfully.")
    except Exception as e:
        print(f"Error creating or appending Excel file '{filename}': {e}")

In [29]:
def handle_pages(base_url, total_pages, folder_name):
    # Create the new directory if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        #print(f"Created directory: {folder_name}")
    
    for page_number in range(1, total_pages + 1):
        # Adjust the page URL for the first page or subsequent pages
        if page_number == 1:
            page_url = base_url
        else:
            page_url = f"{base_url}/{str(page_number).zfill(6)}"
        
        #print(f"Processing page: {page_url}")
        
        # Retrieve content from the page
        data = retrieve_content(page_url)
        
        if data:
            # Construct the file name using the folder path and base URL
            file_name = os.path.join(folder_name, base_url.split('/')[-2] + '.xlsx')
            
            # Create the file or append to it
            create_or_append_xls_file(data, file_name)
        else:
            print(f"No content retrieved for {page_url}")

    print(f"All files saved in the directory: {folder_name}")

In [30]:
def find_folders(folder_ls, url):
    try:
        # Fetch the content from the URL
        response = requests.get(url, verify=False)  # Set verify=False to ignore SSL warnings
        response.raise_for_status()  # Raise an exception for HTTP errors
        
        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all folder links
        folders = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if f'/{folder_ls}/' in href and href not in folders:
                folders.append(href)
        
        return folders

    except requests.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return []

In [31]:
def process_all_folders_and_subfolders(url, folder_name, total_pages=7):
    # Find and process all folders
    folders = find_folders(folder_name, url)

    for folder in folders:
        print(f"Processing folder: {folder}")
        
        # Find subfolders within the current folder
        subfolders = find_folders(folder_name, folder)
        
        # If no subfolders are found, handle just the main folder
        if not subfolders:
            handle_pages(folder, total_pages, folder_name)
        else:
            # Process each subfolder individually
            for subfolder in subfolders:
                print(f"Processing subfolder: {subfolder}")
                handle_pages(subfolder, total_pages, folder_name)

        # After processing all subfolders of the current folder, move to the next folder
        print(f"Finished processing folder: {folder}")
        break

    print("All folders and subfolders processed successfully.")


All files --> 1 File per Topic

In [32]:
def check_for_duplicates(file_path):
    """Check for duplicate entries in the Excel file based on 'Question Text'."""
    try:
        # Read the data from the Excel file
        df = pd.read_excel(file_path)

        # Check for duplicate questions
        if 'Question Text' not in df.columns:
            print("The expected column 'Question Text' is missing from the file.")
            return

        # Identify duplicates
        duplicates = df[df.duplicated(subset=['Question Text'], keep=False)]

        if duplicates.empty:
            print("No duplicate questions found.")
        else:
            print(f"Found {len(duplicates)} duplicate entries.")
            print(duplicates[['Question Number', 'Question Text']])
            # Optionally, save duplicates to a separate file
            duplicates.to_excel('duplicates_found.xlsx', index=False)
            print("Duplicate entries have been saved to 'duplicates_found.xlsx'.")

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

In [33]:
def remove_duplicates_from_file(file_path):
    """Read an .xlsx file, remove duplicate rows, and save it back."""
    try:
        df = pd.read_excel(file_path)
        
        # Remove duplicate rows based on 'Question Number' and 'Question Text'
        df_cleaned = df.drop_duplicates(subset=['Question Number', 'Question Text'])
        
        # Save the cleaned DataFrame back to the file
        df_cleaned.to_excel(file_path, index=False)
        print(f"Duplicates removed and file saved: {file_path}")
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

In [34]:
def process_all_xlsx_files(folder_path):
    """Process all .xlsx files in the specified folder."""
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.xlsx'):
            file_path = os.path.join(folder_path, file_name)
            remove_duplicates_from_file(file_path)

In [35]:
def search_xlsx_files(directory):
    """Search for all .xlsx files in the given directory, excluding temporary files."""
    xlsx_files = [os.path.join(directory, f) for f in os.listdir(directory)
                  if f.endswith('.xlsx') and not f.startswith('~$')]
    return xlsx_files

In [36]:
def read_data_from_file(file_path):
    """Read data from an .xlsx file and return it as a DataFrame."""
    try:
        df = pd.read_excel(file_path)
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return pd.DataFrame()  # Return an empty DataFrame
    except PermissionError:
        print(f"Permission denied: {file_path}")
        return pd.DataFrame()  # Return an empty DataFrame

In [37]:
def combine_data(xlsx_files, output_file):
    all_data = []
    unique_questions = set()  # To keep track of unique questions

    for file_path in xlsx_files:
        folder_name = os.path.splitext(os.path.basename(file_path))[0]
        df = read_data_from_file(file_path)
        
        for index, row in df.iterrows():
            question = row.get('Question Text', '')
            if question and question not in unique_questions:
                unique_questions.add(question)
                all_data.append({
                    'Question Number': row.get('Question Number', ''),
                    'Question Text': question,
                    'Options': row.get('Options', ''),
                    'Answer': row.get('Answer', ''),
                    'Explanation': row.get('Explanation', ''),
                    'Folder Name': folder_name
                })
    
    # Create a DataFrame from the consolidated data
    consolidated_df = pd.DataFrame(all_data)

    # Write the DataFrame to an .xlsx file using openpyxl
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        consolidated_df.to_excel(writer, index=False, sheet_name='Consolidated Data')

In [38]:
def count_questions_per_topic(file_path):
    """Count the number of questions per topic and the total number of questions in the Excel file."""
    try:
        # Read the data from the Excel file
        df = pd.read_excel(file_path)

        # Check if the required columns exist
        if 'Folder Name' not in df.columns:
            print("The expected column 'Folder Name' is missing from the file.")
            return

        # Group by 'Folder Name' and count the number of questions
        topic_counts = df.groupby('Folder Name').size().reset_index(name='Question Count')

        # Print the number of questions per topic
        print("Number of questions per topic:")
        print(topic_counts)

        # Calculate total number of questions
        total_questions = df.shape[0]

        # Print the total number of questions
        print(f"\nTotal number of questions: {total_questions}")

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


Implementation : Scraping + File Generation

General Aptitude

In [39]:
'''
folder_list = ["aptitude", "data-interpretation", "verbal-ability", "logical-reasoning", "verbal-reasoning", "non-verbal-reasoning"]

for folder in folder_list:
    url = f'https://www.indiabix.com/{folder}/questions-and-answers/'
    folder_name = folder  # Name of the directory to save all files
    folder_path = "./{folder_name}"

    # Call the function to process folders and subfolders
    process_all_folders_and_subfolders(url, folder_name)
'''

'\nfolder_list = ["aptitude", "data-interpretation", "verbal-ability", "logical-reasoning", "verbal-reasoning", "non-verbal-reasoning"]\n\nfor folder in folder_list:\n    url = f\'https://www.indiabix.com/{folder}/questions-and-answers/\'\n    folder_name = folder  # Name of the directory to save all files\n    folder_path = "./{folder_name}"\n\n    # Call the function to process folders and subfolders\n    process_all_folders_and_subfolders(url, folder_name)\n'

In [40]:
'''
folder_list = ["aptitude", "data-interpretation", "verbal-ability", "logical-reasoning", "verbal-reasoning", "non-verbal-reasoning"]

for folder in folder_list:
    
    # Directory containing .xlsx files
    directory = f'./{folder}/'  # Correctly format the directory path

    # Search for .xlsx files
    xlsx_files = search_xlsx_files(directory)

    # Output .xlsx file
    output_file = f'{folder}.xlsx'  # Correctly format the output file name

    # Consolidate data and write to the output file
    combine_data(xlsx_files, output_file)

    # Path to the consolidated Excel file
    file_path = output_file

    # Count questions per topic in the file
    count_questions_per_topic(file_path)
'''

'\nfolder_list = ["aptitude", "data-interpretation", "verbal-ability", "logical-reasoning", "verbal-reasoning", "non-verbal-reasoning"]\n\nfor folder in folder_list:\n    \n    # Directory containing .xlsx files\n    directory = f\'./{folder}/\'  # Correctly format the directory path\n\n    # Search for .xlsx files\n    xlsx_files = search_xlsx_files(directory)\n\n    # Output .xlsx file\n    output_file = f\'{folder}.xlsx\'  # Correctly format the output file name\n\n    # Consolidate data and write to the output file\n    combine_data(xlsx_files, output_file)\n\n    # Path to the consolidated Excel file\n    file_path = output_file\n\n    # Count questions per topic in the file\n    count_questions_per_topic(file_path)\n'

Technical

In [41]:
'''
folder_list = ["c-programming", "cpp-programming", "c-sharp-programming", "java-programming"]

for folder in folder_list:
    url = f'https://www.indiabix.com/{folder}/questions-and-answers/'
    folder_name = folder  # Name of the directory to save all files
    folder_path = "./{folder_name}"

    # Call the function to process folders and subfolders
    process_all_folders_and_subfolders(url, folder_name)

    directory = f'./{folder}/'  # Correctly format the directory path

    # Search for .xlsx files
    xlsx_files = search_xlsx_files(directory)

    # Output .xlsx file
    output_file = f'{folder}.xlsx'  # Correctly format the output file name

    # Consolidate data and write to the output file
    combine_data(xlsx_files, output_file)

    # Path to the consolidated Excel file
    file_path = output_file

    # Count questions per topic in the file
    count_questions_per_topic(file_path)
'''

'\nfolder_list = ["c-programming", "cpp-programming", "c-sharp-programming", "java-programming"]\n\nfor folder in folder_list:\n    url = f\'https://www.indiabix.com/{folder}/questions-and-answers/\'\n    folder_name = folder  # Name of the directory to save all files\n    folder_path = "./{folder_name}"\n\n    # Call the function to process folders and subfolders\n    process_all_folders_and_subfolders(url, folder_name)\n\n    directory = f\'./{folder}/\'  # Correctly format the directory path\n\n    # Search for .xlsx files\n    xlsx_files = search_xlsx_files(directory)\n\n    # Output .xlsx file\n    output_file = f\'{folder}.xlsx\'  # Correctly format the output file name\n\n    # Consolidate data and write to the output file\n    combine_data(xlsx_files, output_file)\n\n    # Path to the consolidated Excel file\n    file_path = output_file\n\n    # Count questions per topic in the file\n    count_questions_per_topic(file_path)\n'

Counting Questions per Topic in Storage

In [42]:
def find_last_non_empty_row(file_path):
    """Find the last non-empty row in an .xlsx file."""
    try:
        df = pd.read_excel(file_path)
        if not df.empty:
            # Find the last non-empty row index
            last_row_index = df.last_valid_index()
            return last_row_index
        else:
            return None
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

In [43]:

def process_all_xlsx_files_in_folder(folder_path):
    """Process all .xlsx files in the given folder and find the last non-empty row."""
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.xlsx'):
            file_path = os.path.join(folder_path, file_name)
            last_row_index = find_last_non_empty_row(file_path)
            if last_row_index is not None:
                print(f"{file_name} - No of Questions : {last_row_index-1}")
            else:
                print(f"File: {file_name} is empty or could not be processed.")

In [44]:
'''
# Path to the current folder containing .xlsx files
current_folder = '.'  # Replace 'your_folder_name' with your folder path

# Process all .xlsx files in the current folder
process_all_xlsx_files_in_folder(current_folder)
'''

"\n# Path to the current folder containing .xlsx files\ncurrent_folder = '.'  # Replace 'your_folder_name' with your folder path\n\n# Process all .xlsx files in the current folder\nprocess_all_xlsx_files_in_folder(current_folder)\n"

Sections

In [45]:
# Define the general and technical categories
general_categories = [
    "aptitude", 
    "data-interpretation", 
    "verbal-ability", 
    "logical-reasoning", 
    "verbal-reasoning"
]

technical_categories = [
    "c-programming", 
    "cpp-programming", 
    "c-sharp-programming", 
    "java-programming"
]

Random Questions Generation

In [46]:
# Function to get random questions from a specific category
def get_random_questions(category, num_questions, folder_name):
    file_path = f"{folder_name}/{category}.xlsx"
    
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return pd.DataFrame()

    df = pd.read_excel(file_path)
    
    # Shuffle and select a specific number of questions
    return df.sample(n=num_questions)

In [47]:
# Function to format and display explanation
def format_explanation(explanation):
    # Remove extra spaces and newlines
    return ' '.join(explanation.split())

In [48]:
# Function to ask questions and track score
def ask_questions(questions, category):
    score = 0
    
    for i, (index, row) in enumerate(questions.iterrows(), start=1):
        print(f"\n--- {category.capitalize()} Q{i} ---")
        print(f"\n Question: {row['Question Text']}")
        
        options = row['Options'].split('\n')  # Assuming options are separated by newlines
        for idx, option in enumerate(options, start=1):
            print(f"{option}")
        
        answer = input("Your answer (A, B, C, D): ").strip().upper()
        correct_answer = row['Answer'].strip().upper()
        
        if answer == correct_answer:
            print("Correct!")
            score += 1
        else:
            print(f"\n Incorrect. The correct answer was {correct_answer}.")
        
        if pd.notna(row['Explanation']):
            formatted_explanation = format_explanation(row['Explanation'])
            print(f"\nExplanation: {formatted_explanation}")

    return score

Take A Quize...!

In [49]:
# Function to conduct the quiz
def conduct_quiz(category_list, num_questions_per_category, folder_name):
    total_score = 0
    total_questions = 0

    for category in category_list:
        print(f"\n--- Category: {category.capitalize()} ---")
        questions = get_random_questions(category, num_questions_per_category, folder_name)
        if questions.empty:
            continue

        score = ask_questions(questions, category)
        total_score += score
        total_questions += len(questions)

    print(f"\nQuiz finished! Your total score is {total_score}/{total_questions}.")

In [50]:
# Main execution function
category_set = input("Lets start the Apptitude Quiz :(technical/general) ").strip().lower()
num_questions = int(input("How many questions per category? "))
    
folder_name = "."  # Update this to the correct folder path

if category_set == 'general':
    conduct_quiz(general_categories, num_questions, folder_name)
elif category_set=='technical':
    conduct_quiz(technical_categories,num_questions, folder_name)
else:
    print("Invalid category set. Please choose 'general' or 'technical'.")


--- Category: Aptitude ---

--- Aptitude Q1 ---

 Question: Two stations A and B are 110 km apart on a straight line. One train starts from A at 7 a.m. and travels towards B at 20 kmph. Another train starts from B at 8 a.m. and travels towards A at a speed of 25 kmph. At what time will they meet?
9 a.m.; 10 a.m.; 10.30 a.m.; 11 a.m.

 Incorrect. The correct answer was B.

Explanation: Answer: Option Explanation: Suppose they meet x hours after 7 a.m. Distance covered by A in x hours = 20x km. Distance covered by B in (x - 1) hours = 25(x - 1) km. 20x + 25(x - 1) = 110 45x = 135 x = 3. So, they meet at 10 a.m.

--- Category: Data-interpretation ---

--- Data-interpretation Q1 ---

 Question: Average annual exports during the given period for Company Y is approximately what percent of the average annual exports for Company Z?
87.12%; 89.64%; 91.21%; 93.33%
