In [2]:
# ================================
# IPL 2025 DATASET – FULL PREVIEW & DEEP EDA
# Author: Shaik Anas
# ================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the stock matplotlib style first, then override the default canvas
# size; the override must come second so the style sheet does not undo it.
plt.style.use("default")
plt.rcParams.update({"figure.figsize": (10, 6)})

print("=" * 80)
print("LOADING DATASETS")
print("=" * 80)

import os

# google.colab only exists inside a Colab runtime. Import it defensively so
# the script still runs elsewhere when the CSV files are already on disk.
try:
    from google.colab import files
except ImportError:
    files = None

# Single source of truth for where the CSVs are checked for and read from.
DATA_DIR = "/content"
required_files = ["IPL2025Batters.csv", "IPL2025Bowlers.csv"]

# Make sure every required CSV is present, prompting for an upload if not.
for f in required_files:
    target_path = os.path.join(DATA_DIR, f)
    if os.path.exists(target_path):
        continue

    if files is None:
        raise FileNotFoundError(
            f"'{target_path}' not found and google.colab is unavailable, "
            "so the file cannot be uploaded interactively."
        )

    print(f"File '{f}' not found. Please upload it.")
    uploaded = files.upload()
    if f not in uploaded:
        raise FileNotFoundError(f"'{f}' was not uploaded. Please ensure you upload the correct file.")

    # files.upload() saves into the current working directory, which is not
    # guaranteed to be DATA_DIR; persist the bytes where read_csv will look.
    if not os.path.exists(target_path):
        with open(target_path, "wb") as fh:
            fh.write(uploaded[f])
    print(f"'{f}' uploaded successfully.")

batting_df = pd.read_csv(os.path.join(DATA_DIR, "IPL2025Batters.csv"))
bowling_df = pd.read_csv(os.path.join(DATA_DIR, "IPL2025Bowlers.csv"))

print("\nBatting Dataset Loaded")
print("Bowling Dataset Loaded")

# -------------------------------
# BASIC STRUCTURE
# -------------------------------
# Print the (rows, columns) tuple of each dataset under a section banner.
print("\n" + "=" * 80, "DATASET SHAPE", "=" * 80, sep="\n")
for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    print(f"{label} Shape:", frame.shape)

# -------------------------------
# COLUMN DETAILS
# -------------------------------
# List every column name of each dataset, one per line.
print("\n" + "=" * 80, "COLUMN NAMES", "=" * 80, sep="\n")

for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    print(f"\n{label} Columns:")
    for column_name in frame.columns:
        print("-", column_name)

# -------------------------------
# HEAD / TAIL / SAMPLE
# -------------------------------
# Show the first rows, last rows and a reproducible random sample of each
# dataset.
print("\n" + "="*80)
print("DATA PREVIEW")
print("="*80)

# Number of rows shown in each preview table.
PREVIEW_ROWS = 10

for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    print(f"\n{label} Head:")
    display(frame.head(PREVIEW_ROWS))

    print(f"\n{label} Tail:")
    display(frame.tail(PREVIEW_ROWS))

    print(f"\n{label} Random Sample:")
    # head()/tail() tolerate short frames, but sample() without replacement
    # raises ValueError when asked for more rows than exist, so cap the size.
    sample_size = min(PREVIEW_ROWS, len(frame))
    display(frame.sample(sample_size, random_state=42))

# -------------------------------
# DATA TYPES & INFO
# -------------------------------
# Print the DataFrame.info() report for each dataset.
print("\n" + "=" * 80, "DATA TYPES & INFO", "=" * 80, sep="\n")

for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    print(f"\n{label} Info:")
    frame.info()

# -------------------------------
# MISSING VALUES
# -------------------------------
# Three passes over both datasets, in this order: null counts per column,
# null percentages per column, then one null-mask heatmap per dataset.
print("\n" + "=" * 80, "MISSING VALUES ANALYSIS", "=" * 80, sep="\n")

datasets = (("Batting", batting_df), ("Bowling", bowling_df))

# Pass 1: absolute number of nulls in every column.
for label, frame in datasets:
    print(f"\n{label} Missing Values:")
    print(frame.isna().sum())

# Pass 2: share of nulls in every column, as a percentage with 2 decimals.
for label, frame in datasets:
    print(f"\n{label} Missing Percentage:")
    print(frame.isna().mean().mul(100).round(2))

# Pass 3: heatmap of the boolean null mask, one figure per dataset.
for label, frame in datasets:
    plt.figure(figsize=(12, 4))
    sns.heatmap(frame.isna(), cbar=False)
    plt.title(f"Missing Values Heatmap – {label} Data")
    plt.show()

# -------------------------------
# STATISTICAL SUMMARY
# -------------------------------
# Render describe() over all columns (numeric and non-numeric) for the
# batting dataset first, then the bowling dataset.
print("\n" + "=" * 80, "STATISTICAL SUMMARY", "=" * 80, sep="\n")

for frame in (batting_df, bowling_df):
    display(frame.describe(include="all"))

# -------------------------------
# UNIQUE VALUE ANALYSIS
# -------------------------------
# Report the number of distinct values held by every column of each dataset.
print("\n" + "=" * 80, "UNIQUE VALUE COUNTS", "=" * 80, sep="\n")

for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    print(f"\n{label} Dataset:")
    for column_name in frame.columns:
        distinct_count = frame[column_name].nunique()
        print(f"{column_name}: {distinct_count} unique values")

# -------------------------------
# DUPLICATE CHECK
# -------------------------------
# Count the rows flagged by DataFrame.duplicated() in each dataset.
print("\n" + "=" * 80, "DUPLICATE RECORD CHECK", "=" * 80, sep="\n")

for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    duplicate_rows = frame.duplicated().sum()
    print(f"{label} Duplicates:", duplicate_rows)

# -------------------------------
# TEAM & PLAYER DISTRIBUTION
# -------------------------------
# Show how many rows each value of the "Team" column has in each dataset.
print("\n" + "=" * 80, "TEAM DISTRIBUTION", "=" * 80, sep="\n")

for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    print(f"\nPlayers per team ({label}):")
    team_counts = frame["Team"].value_counts()
    print(team_counts)

# -------------------------------
# ALL-ROUNDER IDENTIFICATION
# -------------------------------
# A player is treated as an all-rounder here when their name appears in both
# the batting and the bowling dataset.
print("\n" + "="*80)
print("ALL-ROUNDERS IDENTIFICATION")
print("="*80)

# Drop missing names first: a NaN present in both columns would otherwise
# land in the intersection and be counted as a player.
batting_names = set(batting_df["Player Name"].dropna())
bowling_names = set(bowling_df["Player Name"].dropna())
common_players = batting_names & bowling_names

print("Number of All-Rounders:", len(common_players))
# Sets have no stable order, so sort before slicing to get the same sample
# on every run; key=str keeps the sort safe if names are not all strings.
print("Sample All-Rounders:", sorted(common_players, key=str)[:20])

# -------------------------------
# DISTRIBUTION VISUALIZATION
# -------------------------------
# Draw one histogram with a KDE overlay per metric: batting "Runs" first,
# then bowling "WKT".
print("\n" + "=" * 80, "DISTRIBUTION PLOTS", "=" * 80, sep="\n")

histogram_specs = (
    (batting_df, "Runs", 30, "Distribution of Runs Scored"),
    (bowling_df, "WKT", 20, "Distribution of Wickets Taken"),
)
for frame, column_name, bin_count, plot_title in histogram_specs:
    # The column is looked up inside the loop so each plot is drawn before
    # the next column is accessed.
    sns.histplot(frame[column_name], bins=bin_count, kde=True)
    plt.title(plot_title)
    plt.show()

# -------------------------------
# CORRELATION ANALYSIS
# -------------------------------
# Plot an annotated heatmap of the pairwise correlations between the numeric
# columns of the batting dataset.
print("\n" + "=" * 80, "CORRELATION ANALYSIS", "=" * 80, sep="\n")

plt.figure(figsize=(8, 6))
numeric_batting = batting_df.select_dtypes(include=np.number)
batting_corr = numeric_batting.corr()
sns.heatmap(batting_corr, annot=True, cmap="coolwarm")
plt.title("Batting Data Correlation Heatmap")
plt.show()

# -------------------------------
# FINAL SUMMARY
# -------------------------------
# Report what was actually measured instead of printing fixed conclusions,
# so the summary cannot contradict the data it describes.
print("\n" + "="*80)
print("DATASET PREVIEW SUMMARY")
print("="*80)

summary_lines = []
for label, frame in (("Batting", batting_df), ("Bowling", bowling_df)):
    n_rows, n_cols = frame.shape
    n_numeric = frame.select_dtypes(include=np.number).shape[1]
    # Guard the empty frame: the mean over zero cells is undefined.
    missing_pct = frame.isnull().to_numpy().mean() * 100 if frame.size else 0.0
    n_duplicates = int(frame.duplicated().sum())
    summary_lines.append(
        f"• {label}: {n_rows} rows x {n_cols} columns "
        f"({n_numeric} numeric), {missing_pct:.2f}% missing cells, "
        f"{n_duplicates} duplicate rows"
    )

# Names present in both datasets, with missing names excluded.
shared_players = (
    set(batting_df["Player Name"].dropna())
    & set(bowling_df["Player Name"].dropna())
)
summary_lines.append(f"• Players present in both datasets: {len(shared_players)}")

# Blank line before and after the bullets, matching the original layout.
print("\n" + "\n".join(summary_lines) + "\n")

LOADING DATASETS
File 'IPL2025Batters.csv' not found. Please upload it.


KeyboardInterrupt: 