# **DS1** **heart Dieases**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.over_sampling import SMOTE


**1. Load and Inspect Heart Disease Dataset**

In [2]:
# Load dataset (make sure the file exists in your working directory)
df = pd.read_csv("heart.csv")

# Show structure of dataset
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0    

**2. Normalize and Apply Clustering (Unsupervised Learning)**

In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop("target", axis=1))  # exclude label for unsupervised learning

# KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans_labels = kmeans.fit_predict(X_scaled)
sil_kmeans = silhouette_score(X_scaled, kmeans_labels)

# DBSCAN clustering
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)

# Handle case where DBSCAN assigns -1 (noise) to many points
try:
    sil_dbscan = silhouette_score(X_scaled, dbscan_labels)
except:
    sil_dbscan = "N/A (DBSCAN failed or all noise)"

print("Silhouette Score (KMeans):", sil_kmeans)
print("Silhouette Score (DBSCAN):", sil_dbscan)


Silhouette Score (KMeans): 0.16636832597727694
Silhouette Score (DBSCAN): -0.1597353544931689


**3. Feature Selection (Supervised Learning)**

In [4]:
X = df.drop("target", axis=1)
y = df["target"]

selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)


Selected Features: Index(['cp', 'thalach', 'exang', 'oldpeak', 'ca'], dtype='object')


**4. Handle Class Imbalance with SMOTE**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

# Check minimum class count for safe SMOTE
class_counts = np.unique(y_train, return_counts=True)[1]
min_class_count = min(class_counts)
k_neighbors = min(5, min_class_count - 1)

smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print("Original Class Distribution:", dict(zip(*np.unique(y_train, return_counts=True))))
print("Balanced Class Distribution:", dict(zip(*np.unique(y_train_bal, return_counts=True))))


Original Class Distribution: {np.int64(0): np.int64(397), np.int64(1): np.int64(423)}
Balanced Class Distribution: {np.int64(0): np.int64(423), np.int64(1): np.int64(423)}


**5. Train and Evaluate Multiple Classifiers**

In [7]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)
    print(f"\n {name}")
    print("Accuracy:", model.score(X_test, y_test))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



 Logistic Regression
Accuracy: 0.7804878048780488
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.70      0.76       102
           1       0.74      0.86      0.80       103

    accuracy                           0.78       205
   macro avg       0.79      0.78      0.78       205
weighted avg       0.79      0.78      0.78       205

Confusion Matrix:
 [[71 31]
 [14 89]]

 Decision Tree
Accuracy: 0.9804878048780488
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       102
           1       0.99      0.97      0.98       103

    accuracy                           0.98       205
   macro avg       0.98      0.98      0.98       205
weighted avg       0.98      0.98      0.98       205

Confusion Matrix:
 [[101   1]
 [  3 100]]

 Random Forest
Accuracy: 0.9804878048780488
Classification Report:
               precision    recall  f1-score   support

# **DS2   Students Performance**

In [12]:
# Cell 1: Load and Explore Dataset
import pandas as pd
ds2 = pd.read_csv('Students Performance.csv')
ds2.head()

# Cell 2: Check for Nulls & Basic Info
ds2.info()
ds2.isnull().sum()

# Cell 3: Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in ds2.columns:
    if ds2[col].dtype == 'object':
        ds2[col] = le.fit_transform(ds2[col])
ds2.head()

# Cell 4: Feature Extraction Techniques (PCA and LDA)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Define features and target
X = ds2.drop('Math_Score', axis=1)
y = ds2['Math_Score']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# LDA - create classes based on math score quantiles
y_class = pd.qcut(y, q=3, labels=[0, 1, 2])
lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(X_scaled, y_class)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Math_Score       399 non-null    int64
 1   Reading_Score    399 non-null    int64
 2   Writing_Score    399 non-null    int64
 3   Placement_Score  399 non-null    int64
 4   Club_Join_Date   399 non-null    int64
dtypes: int64(5)
memory usage: 15.7 KB


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X_pca, y_class, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification Using PCA Features")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Classification Using PCA Features
              precision    recall  f1-score   support

           0       0.25      0.35      0.29        26
           1       0.17      0.09      0.12        32
           2       0.27      0.32      0.29        22

    accuracy                           0.24        80
   macro avg       0.23      0.25      0.23        80
weighted avg       0.22      0.24      0.22        80

[[ 9  9  8]
 [18  3 11]
 [ 9  6  7]]


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_lda, y_class, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification Using LDA Features")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Classification Using LDA Features
              precision    recall  f1-score   support

           0       0.25      0.27      0.26        26
           1       0.30      0.25      0.27        32
           2       0.36      0.41      0.38        22

    accuracy                           0.30        80
   macro avg       0.30      0.31      0.30        80
weighted avg       0.30      0.30      0.30        80

[[ 7 12  7]
 [15  8  9]
 [ 6  7  9]]


**DS3**  CreditCard Fraud Detection

In [18]:
# Cell 1: Load and Clean Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load dataset
df = pd.read_csv('creditcard.csv')

# Drop rows where target is missing (just in case)
df = df.dropna(subset=['Class'])

# Drop rows with any missing values in features
df = df.dropna()

# Display basic info
print("Dataset Shape:", df.shape)
print("Missing values:\n", df.isnull().sum())
print("Class distribution:\n", df['Class'].value_counts())

# Cell 2: Define features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Cell 3: Train-Test Split & Before Balancing Evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\n Before Balancing")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Cell 4: Apply SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\n After SMOTE Balancing")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Cell 5: Apply RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\n After RandomUnderSampler")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Dataset Shape: (284807, 31)
Missing values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Class distribution:
 Class
0    284315
1       492
Name: count, dtype: int64

 Before Balancing
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.77      0.86        98

    accuracy                           1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56862     2]
 [   23    75]]

 After SMOTE Balancing
              precision    recall  f1-score   support

           0      