# In [None]:
# ============================================================
# Q1.
# (a) Create a 1D NumPy array with values 1 to 20
# (b) Extract all prime numbers
# (c) Compute mean and variance of primes
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Integers 1..20 (np.arange excludes the stop value).
arr = np.arange(1, 21)


def is_prime(n):
    """Return True if ``n`` is a prime number, else False.

    Trial division with pure integer arithmetic: the loop bound is
    ``i * i <= n`` instead of a floating-point square root, so the answer
    does not depend on float rounding for large inputs.

    Args:
        n: Integer to test (Python int or NumPy integer scalar).

    Returns:
        bool: True when ``n`` is greater than 1 and has no divisor other
        than 1 and itself.
    """
    n = int(n)  # NumPy scalar -> Python int, so i * i cannot overflow
    if n < 2:
        return False
    if n < 4:
        return True  # 2 and 3
    if n % 2 == 0:
        return False
    i = 3
    while i * i <= n:
        if n % i == 0:
            return False
        i += 2  # even divisors were ruled out above
    return True


primes = np.array([x for x in arr if is_prime(x)])
print("Q1 Primes:", primes)
print("Mean:", np.mean(primes))
# np.var uses ddof=0 by default, i.e. the population variance.
print("Variance:", np.var(primes))


# ============================================================
# Q2.
# (a) Create a 4×4 array (1–16)
# (b) Extract 2×2 bottom-left sub-matrix
# (c) Compute determinant
# ============================================================

# 4x4 grid holding 1..16 in row-major order.
A = np.arange(16).reshape(4, 4) + 1
# Bottom-left corner: last two rows, first two columns.
sub = A[-2:, :2]
print("\nQ2 Sub-matrix:\n", sub)
print("Determinant:", np.linalg.det(sub))


# ============================================================
# Q3.
# Student DataFrame – total, average, topper
# ============================================================

students = pd.DataFrame(
    {
        "Name": list("ABCDE"),
        "Math": [80, 75, 90, 60, 85],
        "Science": [70, 88, 95, 65, 80],
        "English": [75, 70, 85, 60, 90],
    }
)

# Per-student total over the three subject columns, then the mean of the three.
students["Total"] = students[["Math", "Science", "English"]].sum(axis=1)
students["Average"] = students["Total"].div(3)
# idxmax gives the row label of the highest average; .loc fetches that row.
topper = students.loc[students["Average"].idxmax()]

print("\nQ3 Student Table:\n", students)
print("Topper:", topper["Name"], "Avg:", topper["Average"])


# ============================================================
# Q4.
# Coin toss simulation (1000 tosses)
# ============================================================

# randint's upper bound is exclusive, so every toss is 0 (tail) or 1 (head).
tosses = np.random.randint(0, 2, size=1000)
heads = (tosses == 1).sum()
tails = (tosses == 0).sum()

print("\nQ4 Heads:", heads, "Tails:", tails)
print("Probability of Heads:", heads / 1000)


# ============================================================
# Q5.
# Employee bonus & salary above average
# ============================================================

emp = pd.DataFrame(
    [
        [1, "A", 50000],
        [2, "B", 60000],
        [3, "C", 55000],
        [4, "D", 70000],
    ],
    columns=["ID", "Name", "Salary"],
)

# Bonus is 10% of salary.
emp["Bonus"] = emp["Salary"].mul(0.10)
# Keep only the rows whose salary is strictly above the mean salary.
print("\nQ5 Employees above avg salary:\n",
      emp.loc[emp["Salary"].gt(emp["Salary"].mean())])


# ============================================================
# Q6.
# 3×3 matrix transpose, inverse & verification
# ============================================================

# np.arange(1, 10).reshape(3, 3) is singular (row 1 + row 3 == 2 * row 2), so it
# has no inverse. Use a 3x3 matrix with a non-zero determinant (det = -1).
M = np.array([[2, 1, 1],
              [1, 3, 2],
              [1, 0, 0]])

# Transpose: rows and columns swapped.
M_T = M.T

# Fail loudly if M is ever replaced by a rank-deficient matrix, instead of
# printing a meaningless "inverse".
if np.linalg.matrix_rank(M) < M.shape[0]:
    raise ValueError("M is singular; its inverse does not exist")

M_inv = np.linalg.inv(M)
# Verification: M @ M_inv should equal the identity up to float rounding.
identity = np.dot(M, M_inv)

print("\nQ6 Transpose:\n", M_T)
print("Inverse:\n", M_inv)
print("Identity Check:\n", np.round(identity, 10))
print("M @ M_inv equals identity:", np.allclose(identity, np.eye(3)))


# ============================================================
# Q7.
# Daily temperature analysis
# ============================================================

# 30 random integer temperatures in 20..40 (upper bound 41 is exclusive).
temps = np.random.randint(20, 41, size=30)
print("\nQ7 Hottest:", np.max(temps))
print("Coldest:", np.min(temps))
print("Mean:", temps.mean())
print("Median:", np.median(temps))
# ndarray.std uses ddof=0 by default (population standard deviation).
print("Std Dev:", temps.std())


# ============================================================
# Q8.
# Pandas Series – Pass/Fail
# ============================================================

marks = pd.Series([78, 45, 30, 90, 55, 20, 60, 35])
# Marks below 40 are replaced by the label "Fail"; passing marks keep their value.
marks = marks.mask(marks < 40, "Fail")

print("\nQ8 Series:\n", marks)
# Every entry that is not the "Fail" label counts as a pass.
print("Passed Count:", marks.ne("Fail").sum())


# ============================================================
# Q9.
# Dice simulation (500 rolls)
# ============================================================

# 500 rolls of a fair die; randint's upper bound is exclusive, so faces are 1..6.
dice = np.random.randint(1, 7, 500)
# Reindex over all six faces so a face that never came up is reported as 0
# instead of silently missing from the table.
counts = pd.Series(dice).value_counts().reindex(range(1, 7), fill_value=0)
# Divide by the actual number of rolls rather than repeating the literal 500.
relative = counts / dice.size

print("\nQ9 Dice Counts:\n", counts)
print("Relative Frequency:\n", relative)


# ============================================================
# Q10.
# Product sales & max revenue
# ============================================================

products = pd.DataFrame(
    {
        "Name": [f"P{i}" for i in range(1, 7)],
        "Quantity": [10, 5, 8, 12, 7, 6],
        "Price": [100, 200, 150, 80, 300, 250],
    }
)

# Revenue per product = unit price x units sold.
products["Total"] = products["Price"] * products["Quantity"]
# Row of the product with the largest revenue.
print("\nQ10 Max Revenue Product:\n",
      products.loc[products["Total"].idxmax()])


# ============================================================
# Q11.
# Missing values handling
# ============================================================

# 3x3 frame with exactly one missing value in each row and each column.
df_nan = pd.DataFrame(
    [
        [1, 4, np.nan],
        [np.nan, 5, 8],
        [3, np.nan, 9],
    ],
    columns=["A", "B", "C"],
)

# Replace each NaN with the mean of its own column.
print("\nQ11 Filled:\n", df_nan.fillna(value=df_nan.mean()))
# thresh=2 keeps rows with at least two non-missing values; every row here
# has exactly two, so no row is removed.
print("Dropped:\n", df_nan.dropna(thresh=2))


# ============================================================
# Q12.
# Reshape & even number average
# ============================================================

# 1..30 laid out as 5 rows x 6 columns.
arr = np.arange(1, 31)
mat = arr.reshape(5, 6)
# A zero lowest bit means the value is even; boolean indexing flattens the result.
evens = mat[(mat & 1) == 0]

print("\nQ12 Even Avg:", evens.mean())


# ============================================================
# Q13.
# Student filtering
# ============================================================

stud = pd.DataFrame(
    {
        "Name": list("ABCDEF"),
        "Age": [18, 21, 19, 22, 17, 20],
        "Marks": [65, 75, 55, 80, 90, 60],
    }
)

# Students scoring strictly above the class mean.
print("\nQ13 Above Avg:\n",
      stud.loc[stud["Marks"].gt(stud["Marks"].mean())])
# Students younger than 20 who also scored more than 60.
print("Young & High:\n",
      stud.loc[stud["Age"].lt(20) & stud["Marks"].gt(60)])


# ============================================================
# Q14.
# Employee efficiency
# ============================================================

emp2 = pd.DataFrame(
    {
        "Name": list("ABCDE"),
        "Tasks_Completed": [40, 35, 50, 45, 30],
        "Hours_Worked": [8, 7, 10, 9, 6],
    }
)

# Efficiency = tasks completed per hour worked.
emp2["Efficiency"] = emp2["Tasks_Completed"].div(emp2["Hours_Worked"])
# With this data every employee comes out at 5.0 tasks/hour; idxmax reports
# the first row on a tie.
print("\nQ14 Best Employee:\n",
      emp2.loc[emp2["Efficiency"].idxmax()])


# ============================================================
# CASE STUDY 1 – Student Performance Analytics
# ============================================================

# 100 students x 3 subjects, random integer marks in 40..99 (100 is exclusive).
data = pd.DataFrame(
    np.random.randint(40, 100, size=(100, 3)),
    columns=["Math", "Science", "English"],
)

# Row-wise mean over the three subject columns.
data["Average"] = data[["Math", "Science", "English"]].mean(axis=1)
print("\nCase Study 1 – Top 5:\n", data.nlargest(5, "Average"))
# Mean and standard deviation per subject (the Average column is left out).
print("Subject Stats:\n", data[["Math", "Science", "English"]].agg(["mean", "std"]))

# One box per subject; the derived Average column is not plotted.
data[["Math", "Science", "English"]].boxplot()
plt.show()


# ============================================================
# CASE STUDY 2 – Sales Data Exploration
# ============================================================

# One row per sale: product, units sold, unit price, region.
sales = pd.DataFrame(
    [
        ["A", 10, 100, "East"],
        ["B", 5, 200, "West"],
        ["C", 8, 150, "East"],
        ["A", 7, 100, "North"],
        ["B", 6, 200, "West"],
    ],
    columns=["Product", "Quantity", "Price", "Region"],
)

# Revenue of each sale, then overall and per-region totals.
sales["Total"] = sales["Price"] * sales["Quantity"]
print("\nCase Study 2 Revenue:", sales["Total"].sum())
print("Region-wise:\n", sales.groupby("Region")["Total"].sum())

# Revenue per product is categorical data: a bar chart draws one labelled bar
# per product, whereas kind="hist" would bin the revenue values themselves
# and lose the product labels.
sales.groupby("Product")["Total"].sum().plot(kind="bar")
plt.title("Revenue by Product")
plt.ylabel("Total Revenue")
plt.show()


# ============================================================
# CASE STUDY 3 – Auto MPG (Conceptual)
# ============================================================

# Correlation: mpg vs weight → strong negative correlation
# Scatter plot: weight ↑ → mpg ↓


# In [None]:
# ============================================================
# Q1. Predicting Car MPG using Linear Regression
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.utils import resample

# ------------------------------------------------------------
# (a) Load dataset and remove missing values
# ------------------------------------------------------------

# NOTE(review): common distributions of the Auto MPG data mark missing
# horsepower with "?"; na_values turns those into real NaNs so dropna() can
# remove them. Harmless if this CSV has no "?" entries - confirm against the file.
auto = pd.read_csv("auto_mpg.csv", na_values="?")
auto.dropna(inplace=True)

# ------------------------------------------------------------
# (b) Identify predictors and target
# ------------------------------------------------------------

# LinearRegression needs numeric input, so any text column (for example a
# car-name column, if the file has one) is excluded from the predictors.
X = auto.drop("mpg", axis=1).select_dtypes(include="number")
y = auto["mpg"]

# ------------------------------------------------------------
# (c) Train-test split (80/20)
# ------------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------------------
# (d) Train Linear Regression model & predict
# ------------------------------------------------------------

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# ------------------------------------------------------------
# (e) Model evaluation
# ------------------------------------------------------------

# Both metrics are computed on the held-out 20% only.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nQ1 MSE:", mse)
print("Q1 R2 Score:", r2)

# ------------------------------------------------------------
# (f) Discussion:
# If R² = 0.85 → 85% of variance in mpg explained by model
# ------------------------------------------------------------


# ============================================================
# Q2. Bootstrap Sampling for Uncertainty Estimation
# ============================================================

# ------------------------------------------------------------
# (a) Load dataset & extract feature columns
# ------------------------------------------------------------

# (a) Load the dataset; every column except the last is treated as a feature.
btissue = pd.read_csv("btissue.csv")
X_bt = btissue.iloc[:, :-1]

# (b) Draw 100 rows with replacement (a bootstrap sample); seeded for repeatability.
bootstrap_sample = resample(
    X_bt,
    replace=True,
    n_samples=100,
    random_state=42,
)

# (c) Show the first 10 rows and report whether any row was drawn more than once.
print("\nQ2 Bootstrap Sample (first 10 rows):\n", bootstrap_sample.iloc[:10])
print("Repeated rows exist:", bootstrap_sample.duplicated().any())


# ============================================================
# Q3. 5-Fold Cross-Validation Index Analysis
# ============================================================

# Five shuffled folds with a fixed seed so the index lists are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("\nQ3 K-Fold Splits:")
for fold, (train_idx, test_idx) in enumerate(kf.split(X_bt)):
    # Fold sizes, followed by the first ten indices on each side of the split.
    print(f"Fold {fold + 1}")
    print(f"Train size: {len(train_idx)}")
    print(f"Test size: {len(test_idx)}")
    print(f"Train indices: {train_idx[:10]} ...")
    print(f"Test indices: {test_idx[:10]} ...\n")

# Each fold uses a unique test set; together they cover full dataset


# ============================================================
# Q4. Model Evaluation: Holdout vs Cross-Validation
# ============================================================

# Features: every column except the last. Label: the last column.
X = btissue.iloc[:, :-1]
y = btissue.iloc[:, -1]

model = DecisionTreeClassifier(random_state=42)

# ------------------------------------------------------------
# (i) Holdout Validation (80/20): fit on 80%, score on the other 20%
# ------------------------------------------------------------

X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, random_state=42, test_size=0.2
)
holdout_acc = model.fit(X_tr, y_tr).score(X_te, y_te)

# ------------------------------------------------------------
# (ii) 5-Fold Cross-Validation: mean of the five fold scores
# ------------------------------------------------------------

cv_scores = cross_val_score(model, X, y, cv=5)
cv_acc = np.mean(cv_scores)

print("\nQ4 Holdout Accuracy:", holdout_acc)
print("Q4 CV Accuracy:", cv_acc)


# ============================================================
# Q5. Feature Creation from Structured Data
# ============================================================

# One row per person: age, income, spending.
data = pd.DataFrame(
    [
        [22, 30000, 2000],
        [35, 60000, 4000],
        [45, 80000, 3000],
        [28, 45000, 2500],
        [50, 90000, 5000],
    ],
    columns=["Age", "Income", "Spending"],
)

# ------------------------------------------------------------
# (a) Feature Engineering
# ------------------------------------------------------------

# pd.cut bins are right-closed by default: (18, 30], (30, 45], (45, 60].
data["Age_Group"] = pd.cut(
    data["Age"],
    bins=[18, 30, 45, 60],
    labels=["Young", "Middle", "Senior"],
)

data["Income_Spending_Ratio"] = data["Income"].div(data["Spending"])
# z-score: subtract the mean, divide by the sample standard deviation.
data["Normalized_Spending"] = (
    data["Spending"].sub(data["Spending"].mean()).div(data["Spending"].std())
)

print("\nQ5 Feature Engineered Data:\n", data)

# ------------------------------------------------------------
# (b) Plot correlation
# ------------------------------------------------------------

# Pairwise correlations of the numeric columns, drawn as grouped bars.
data.corr(numeric_only=True).plot.bar()
plt.title("Feature Correlation")
plt.show()


# ============================================================
# Q6. Decision Tree on Iris Dataset (Feature Subset Selection)
# ============================================================

# ------------------------------------------------------------
# (a) Load Iris dataset
# ------------------------------------------------------------

iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris["target"] = iris.target

# ------------------------------------------------------------
# (b) Display first rows
# ------------------------------------------------------------

print("\nQ6 Iris Data:\n", df_iris.head())

# ------------------------------------------------------------
# (c) Train with all features
# ------------------------------------------------------------

X = df_iris.iloc[:, :-1]
y = df_iris["target"]

# Score on rows the tree never saw. Accuracy measured on the training rows
# themselves says little about how useful a feature set is, so the two
# models are compared on one shared, stratified 80/20 split.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

dt_all = DecisionTreeClassifier(random_state=42)
dt_all.fit(X_fit, y_fit)
acc_all = dt_all.score(X_eval, y_eval)

# ------------------------------------------------------------
# (d) Select subset using iloc
# ------------------------------------------------------------

# First two columns only.
X_subset = df_iris.iloc[:, :2]

# ------------------------------------------------------------
# (e) Train with selected features
# ------------------------------------------------------------

# Reuse the same train/evaluation rows (matched by index) so that the only
# difference between the two models is the feature set.
dt_sub = DecisionTreeClassifier(random_state=42)
dt_sub.fit(X_subset.loc[X_fit.index], y_fit)
acc_sub = dt_sub.score(X_subset.loc[X_eval.index], y_eval)

print("\nAccuracy (All features):", acc_all)
print("Accuracy (Subset features):", acc_sub)

# ------------------------------------------------------------
# (f) Discussion:
# All features generally give better accuracy
# ------------------------------------------------------------


# ============================================================
# Q7. PCA on Iris Dataset
# ============================================================

# ------------------------------------------------------------
# (a & b) PCA transformation
# ------------------------------------------------------------

# (a & b) Standardize the four iris features, then project onto two components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(iris.data)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# (c) Component scores next to the class label.
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
pca_df["target"] = iris.target

# (d) One scatter series per class so each gets its own legend entry.
for label in np.unique(iris.target):
    class_rows = pca_df.loc[pca_df["target"] == label]
    plt.scatter(
        class_rows["PC1"],
        class_rows["PC2"],
        label=iris.target_names[label],
    )

plt.title("PCA on Iris Dataset")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()

print("\nExplained Variance Ratio:", pca.explained_variance_ratio_)


# ============================================================
# Q8. Encoding Categorical Variables
# ============================================================

# ------------------------------------------------------------
# (a) Create dataset
# ------------------------------------------------------------

# (a) Five employees described by three categorical columns.
emp = pd.DataFrame(
    [
        ["HR", "Manager", "Single"],
        ["IT", "Analyst", "Married"],
        ["Finance", "Clerk", "Divorced"],
        ["IT", "Manager", "Single"],
        ["HR", "Analyst", "Married"],
    ],
    columns=["Department", "Job_Role", "Marital_Status"],
)

# (b) Display original data
print("\nQ8 Original Data:\n", emp)

# (c) Encoding
# Marital_Status is replaced in place by integer codes (label encoding).
le = LabelEncoder().fit(emp["Marital_Status"])
emp["Marital_Status"] = le.transform(emp["Marital_Status"])

# Department and Job_Role are expanded into indicator columns (one-hot encoding).
emp_encoded = pd.get_dummies(emp, columns=["Department", "Job_Role"])

print("\nEncoded Dataset:\n", emp_encoded)


# In [None]:
# ============================================================
# Q1. Generate Bernoulli and Binomial distributions using NumPy
#     and calculate their mean and variance
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------- Bernoulli Distribution --------
p = 0.6                     # probability of success
n = 10                      # number of trials (binomial only)

# -------- Bernoulli Distribution --------
# A Bernoulli draw is a binomial draw with a single trial.
bernoulli = np.random.binomial(1, p, size=1000)

print("Q1 Bernoulli Distribution")
print("Mean:", bernoulli.mean())
print("Variance:", bernoulli.var())

# -------- Binomial Distribution --------
binomial = np.random.binomial(n, p, size=1000)

print("\nQ1 Binomial Distribution")
print("Mean:", binomial.mean())
print("Variance:", binomial.var())


# ============================================================
# Q2. Generate random samples from a Normal distribution
#     and plot its Probability Density Function (PDF)
# ============================================================

from scipy.stats import norm

mu, sigma = 0, 1

# 1000 draws from N(mu, sigma).
samples = np.random.normal(loc=mu, scale=sigma, size=1000)

# Histogram scaled to unit area so it is comparable with the density curve.
plt.hist(samples, bins=30, density=True, alpha=0.6)

# Density curve evaluated on a grid over [-4, 4].
x = np.linspace(-4, 4, 100)
plt.plot(x, norm.pdf(x, loc=mu, scale=sigma))

plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Normal Distribution PDF")
plt.show()


# ============================================================
# Q3. Compute covariance and correlation coefficient
#     between two random variables
# ============================================================

# Y is a noisy linear function of X, so the two are positively related.
X = np.random.standard_normal(100)
Y = 2 * X + np.random.standard_normal(100)

# 2x2 covariance matrix of (X, Y); the off-diagonal entry is cov(X, Y).
cov_matrix = np.cov(np.vstack((X, Y)))
cov_xy = cov_matrix[0][1]

# Off-diagonal entry of the 2x2 correlation matrix.
corr_xy = np.corrcoef(X, Y)[0][1]

print("\nQ3 Covariance:", cov_xy)
print("Q3 Correlation Coefficient:", corr_xy)


# ============================================================
# Q4. Verify Central Limit Theorem (CLT)
#     by plotting distribution of sample means
# ============================================================

sample_size = 30
# Skewed (exponential) population of 10,000 values.
population = np.random.exponential(scale=2, size=10000)

# Draw 1000 samples of `sample_size` values each and record every sample's mean.
sample_means = []
for _ in range(1000):
    sample = np.random.choice(population, size=sample_size)
    sample_means.append(sample.mean())

# Histogram of the sample means, scaled to unit area.
plt.hist(sample_means, bins=30, density=True)
plt.xlabel("Sample Mean")
plt.ylabel("Density")
plt.title("Verification of CLT (Distribution of Sample Means)")
plt.show()


# ============================================================
# Q5. Implement kNN Classifier
#     Dataset: Iris.csv / apndcts.csv / btissue.csv
# ============================================================

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------
# Load dataset (change filename as needed)
# ------------------------------------------------------------

df = pd.read_csv("Iris.csv")      # or apndcts.csv / btissue.csv

# ------------------------------------------------------------
# Separate predictors and target
# (Assuming last column is target)
# ------------------------------------------------------------

# NOTE(review): if this CSV carries a row-identifier column (e.g. "Id"),
# drop it here - it would otherwise be used as a feature. Confirm against
# the actual file.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# ------------------------------------------------------------
# Train-test split
# ------------------------------------------------------------

# Split BEFORE scaling so the test rows stay unseen during preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------------------
# Feature scaling (important for kNN)
# ------------------------------------------------------------

# Fit the scaler on the training rows only, then apply the same mean/std to
# the test rows. Fitting on the full dataset would leak test-set statistics
# into training and bias the accuracy estimate.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full feature matrix scaled with the training statistics; kept so code that
# reads X_scaled keeps working.
X_scaled = scaler.transform(X)

# ------------------------------------------------------------
# Train kNN classifier
# ------------------------------------------------------------

k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# ------------------------------------------------------------
# Prediction
# ------------------------------------------------------------

y_pred = knn.predict(X_test)

# ------------------------------------------------------------
# Accuracy
# ------------------------------------------------------------

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("\nQ5 kNN Accuracy:", accuracy)
