In [7]:
import re
import requests

# URL of the Indian Constitution text
url = "https://www.india.gov.in/my-government/constitution-india"

# Fetching the content
response = requests.get(url)
constitution_text = response.text

# Preprocessing: Cleaning the raw text
def clean_text(text):
    # Removing unnecessary line breaks, extra spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Remove non-text elements like page numbers, footnotes
    text = re.sub(r'\[\d+\]', '', text)
    return text

cleaned_constitution = clean_text(constitution_text)

# Tokenization: Split into articles
articles = cleaned_constitution.split("Article")

# Display the first few articles
for i, article in enumerate(articles[:5], 1):
    print(f"Article {i}: {article[:50]}...\n")


Article 1:  <!DOCTYPE html> <!--[if lt IE 7]><html class="lt-...

Article 2:  79 of the Constitution of India, the council of t...

Article 3:  74(1) of the Constitution provides that there sha...



In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Supreme Court of India judgments URL
url = "https://main.sci.gov.in/judgments"
# Function to scrape judgments
def scrape_supreme_court(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract judgment blocks
    judgment_rows = soup.find_all("tr", {"class": ["row_odd", "row_even"]})
    judgments = []
    for row in judgment_rows:
        columns = row.find_all("td")
        date = columns[0].get_text().strip()
        case_name = columns[1].get_text().strip()
        judgment_url = "https://main.sci.gov.in" + columns[2].find("a")["href"]
        # Fetch judgment content
        judgment_response = requests.get(judgment_url)
        judgment_soup = BeautifulSoup(judgment_response.text, "html.parser")
        # Get judgment text
        judgment_text_block = judgment_soup.find("div", class_="panel-body")
        judgment_text = judgment_text_block.get_text(separator=" ").strip() if judgment_text_block else "Not Available"
        # Clean text
        judgment_text = clean_text(judgment_text)  
        judgments.append({
            "Date": date,
            "Case Name": case_name,
            "URL": judgment_url,
            "Content": judgment_text
        })
    return judgments
# Scraping Supreme Court judgments
supreme_court_data = scrape_supreme_court(url)
# Convert to DataFrame
supreme_court_df = pd.DataFrame(supreme_court_data)
# Save to CSV for future analysis
supreme_court_df.to_csv("supreme_court_judgments.csv", index=False)
# Display first few rows
print(supreme_court_df.head())


Empty DataFrame
Columns: []
Index: []


In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download stopwords and tokenizer models
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean the text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove legal references like [1], [2], etc.
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove non-alphabetic characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Function to preprocess the text (clean, tokenize, remove stop words, lemmatize)
def preprocess_text(text):
    # Clean the text
    text = clean_text(text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and apply lemmatization
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stop_words]
    
    return ' '.join(tokens)

# Load the scraped data (example using Indian Kanoon cases)
df = pd.read_csv("indian_kanoon_cases.csv")

# Preprocess the 'Content' column of the legal data
df['Preprocessed_Content'] = df['Content'].apply(preprocess_text)

# Save the preprocessed data to a new CSV
df.to_csv("preprocessed_legal_cases.csv", index=False)

# Display first few rows of preprocessed data
print(df[['Title', 'Preprocessed_Content']].head())
