In [None]:
import pandas as pd

# Load the merged cut-offs export and run a quick sanity check of its contents.
df = pd.read_csv("merged_cutoffs.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
df.info()

print("\nSample rows:")
print(df.head())

# Per-column count of missing values.
print("\nMissing values per column:")
print(df.isnull().sum())

# Column names, to verify that the expected fields are present.
print("\nColumns:")
print(df.columns.tolist())


In [None]:
# Re-head the frame in place. Note that str.capitalize() upper-cases the first
# character AND lower-cases every remaining character of each column name.
capitalized_names = df.columns.str.capitalize()
df.columns = capitalized_names

print(list(df.columns))


In [None]:
import pandas as pd

# Load the fees table and show a sample plus its structure.
fees = pd.read_csv("fees.csv")
print("Fees Data Sample:")
print(fees.head())

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(fees.info()) would emit an extra "None" line.
print("\nFees Dataset Info:")
fees.info()

# Load the placements table and show a sample plus its structure.
placements = pd.read_csv("placements.csv")
print("\nPlacements Data Sample:")
print(placements.head())

print("\nPlacements Dataset Info:")
placements.info()


In [None]:
import pandas as pd

# Read the three source tables in one pass (cut-offs, fees, placements).
cutoffs_df, fees_df, placements_df = map(
    pd.read_csv, ("merged_cutoffs.csv", "fees.csv", "placements.csv")
)

# Print each table's exact column names so the join keys can be compared.
for label, table in (
    ("Cutoffs columns:", cutoffs_df),
    ("Fees columns:", fees_df),
    ("Placements columns:", placements_df),
):
    print(label, table.columns.tolist())


In [None]:
import pandas as pd

# Read the original cut-offs export and re-head it in a single expression.
# str.capitalize upper-cases the first character of each column name and
# lower-cases all the others.
cutoffs_df = pd.read_csv("merged_cutoffs.csv").rename(columns=str.capitalize)

# Persist the re-headed table (without the index) under a new file name.
cutoffs_df.to_csv("cutoff.csv", index=False)

print("New dataset 'cutoff.csv' created with columns:")
print(list(cutoffs_df.columns))


In [None]:
import pandas as pd

# Reload the three inputs; cut-offs now come from the re-headed "cutoff.csv".
cutoffs_df = pd.read_csv("cutoff.csv")
fees_df = pd.read_csv("fees.csv")
placements_df = pd.read_csv("placements.csv")

# For each table print a banner, its first ten rows, then its column names.
previews = (
    ("=== Cutoffs Sample ===", cutoffs_df),
    ("\n=== Fees Sample ===", fees_df),
    ("\n=== Placements Sample ===", placements_df),
)
for banner, table in previews:
    print(banner)
    print(table.head(10))
    print("\nColumns:", table.columns.tolist())


In [None]:
import pandas as pd

# Years kept in the master table, and the text key columns that are normalised
# (whitespace-stripped, upper-cased) before joining.
TARGET_YEARS = [2024, 2025]
KEY_TEXT_COLUMNS = ['College', 'Branch', 'Exam']


def _load_for_merge(csv_path):
    """Read one CSV and return an independent frame ready for joining.

    Rows whose 'Year' is not in TARGET_YEARS are dropped, and every column of
    KEY_TEXT_COLUMNS that the file actually contains is stripped and
    upper-cased so that keys compare equal across files.

    Raises KeyError if the file has no 'Year' column.
    """
    raw = pd.read_csv(csv_path)
    # NOTE(review): assumes 'Year' is parsed as integers; string years would
    # match nothing here -- confirm against the CSVs.
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below never write into a slice of `raw`.
    prepared = raw[raw['Year'].isin(TARGET_YEARS)].copy()
    for key in KEY_TEXT_COLUMNS:
        if key in prepared.columns:
            prepared[key] = prepared[key].str.strip().str.upper()
    return prepared


# Load, filter and normalise each input. Doing this inside the helper avoids a
# module-level loop variable named `df`, which would silently rebind the
# notebook's `df` to the placements table.
cutoffs_df = _load_for_merge("cutoff.csv")
fees_df = _load_for_merge("fees.csv")
placements_df = _load_for_merge("placements.csv")

# Attach fees to every cut-off row; unmatched cut-offs keep NaN fee columns.
merged_cutoffs = pd.merge(
    cutoffs_df,
    fees_df,
    on=["College", "Branch", "Exam", "Year"],
    how="left",
)

# Attach placements; this join does not use 'Exam' as a key.
master_df = pd.merge(
    merged_cutoffs,
    placements_df,
    on=["College", "Branch", "Year"],
    how="left",
)

# Inspect the result.
print("Shape after filtering and merging:", master_df.shape)
print(master_df.head(10))

# A left join only grows when the right-hand table repeats a join key, so a
# positive difference points at duplicate keys in the fees/placements data.
extra_rows = len(master_df) - len(cutoffs_df)
if extra_rows:
    print(f"Note: the left joins added {extra_rows} row(s); "
          "fees.csv or placements.csv repeats a join key.")


In [None]:
# List every column of the merged frame built in the previous cell.
print(list(master_df.columns))


In [None]:
import pandas as pd

# Requires `master_df` from the merge cell to be present in the kernel.
# Keep only the first row for each (College, Branch, Year, Category, Exam).
dedupe_keys = ['College', 'Branch', 'Year', 'Category', 'Exam']
master_df = master_df.drop_duplicates(subset=dedupe_keys, keep='first')

# Report how many values are missing in each column before filling them.
print("Null / NaN counts per column:")
print(master_df.isna().sum())

# Missing numeric values become 0.
# NOTE(review): a 0 in NIRF_Rank or in a fee column cannot be told apart from
# a genuine zero afterwards -- confirm this is acceptable downstream.
numeric_cols = [
    'Tuition_Fee', 'Hostel_Fee', 'Misc_Fee', 'OneTime_Fee',
    'Total_First_Year', 'Total_Annual',
    'Avg_Package_LPA', 'Max_Package_LPA', 'NIRF_Rank',
]
numeric_filled = master_df[numeric_cols].fillna(0)
master_df[numeric_cols] = numeric_filled

# Missing categorical values become the literal 'Unknown'.
categorical_cols = ['Scholarship_Eligible', 'Top_Companies']
categorical_filled = master_df[categorical_cols].fillna('Unknown')
master_df[categorical_cols] = categorical_filled

# Write the cleaned table (without the index) and report its final shape.
master_df.to_csv("final_rag.csv", index=False)
print("Final dataset saved as 'final_rag.csv'. Shape:", master_df.shape)


In [None]:
import pandas as pd

# Read the cleaned dataset back from disk (this rebinds the notebook's `df`).
df = pd.read_csv("final_rag.csv")

# Summary facts, printed in order as "<label> <value>".
overview = (
    ("Shape:", df.shape),
    ("\nColumns:", df.columns.tolist()),
    ("\nData types:\n", df.dtypes),
    ("\nFirst 10 rows:\n", df.head(10)),
    ("\nNull / NaN counts per column:\n", df.isnull().sum()),
)
for label, value in overview:
    print(label, value)


In [None]:
# Offer the cleaned dataset as a browser download when running on Google Colab.
# The google.colab package only exists inside the Colab runtime; elsewhere the
# import fails, so skip the download instead of crashing the cell.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; 'final_rag.csv' is in the working directory.")
else:
    files.download("final_rag.csv")
