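Do not run the whole notebook in one go; run the cells for each topic separately. Note that several topics reuse the same variable names (for example `selected_features`, `model`, and `cv_scores`), so a later cell overwrites an earlier result.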

In [24]:
import pandas as pd
import numpy as np

In [25]:
# Read the Iris data set from the working directory into a DataFrame,
# then display it as the cell output.
df = pd.read_csv(
    'Iris.csv',
)
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [26]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
# Target: the species label column.
y = df['Species']
# Features: every column except the row identifier and the target.
X = df.drop(columns=['Id', 'Species'])
# Show the distinct class labels present in the target.
y.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [28]:
# Pairwise Pearson correlation between all feature columns.
corr_matrix = X.corr(method='pearson')
print(corr_matrix)

               SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
SepalLengthCm       1.000000     -0.109369       0.871754      0.817954
SepalWidthCm       -0.109369      1.000000      -0.420516     -0.356544
PetalLengthCm       0.871754     -0.420516       1.000000      0.962757
PetalWidthCm        0.817954     -0.356544       0.962757      1.000000


In [29]:
# Keep only the entries strictly above the main diagonal (k=1) so each
# feature pair appears once; every other cell becomes NaN.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
)
upper

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
SepalLengthCm,,-0.109369,0.871754,0.817954
SepalWidthCm,,,-0.420516,-0.356544
PetalLengthCm,,,,0.962757
PetalWidthCm,,,,


In [30]:
# Collect the features to drop: a column is flagged when its absolute
# correlation with any earlier column (the upper-triangle entries) exceeds 0.9,
# so one feature out of each highly correlated pair is removed.
feature_drop = [
    column
    for column in upper.columns
    if (upper[column].abs() > 0.9).any()
]

feature_drop

['PetalWidthCm']

In [31]:
# Remove the flagged columns and report the features that remain.
selected_features = X.drop(feature_drop, axis='columns')
print("Selected features:", list(selected_features.columns))

Selected features: ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm']


In [32]:
# Chi-square feature selection: the target is categorical and is passed to
# the selector unchanged (no label encoding is applied in this cell).
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target and keep the 3 highest-scoring ones.
chi2_selector = SelectKBest(score_func=chi2, k=3).fit(X, y)

chi2_scores = chi2_selector.scores_
for position, col in enumerate(X.columns):
    print(f"{col}: {chi2_scores[position]}")

# Boolean mask over X's columns marking the retained features.
chi2_mask = chi2_selector.get_support()
selected_features = X.columns[chi2_mask]
print("Selected features:", selected_features.tolist())



SepalLengthCm: 10.817820878494008
SepalWidthCm: 3.594499017681736
PetalLengthCm: 116.16984746363968
PetalWidthCm: 67.24482758620695
Selected features: ['SepalLengthCm', 'PetalLengthCm', 'PetalWidthCm']


In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

# Encode the class labels as integers. `y` is reassigned here, so the cells
# that run after this one see the encoded target.
# NOTE(review): f_classif appears to accept string class labels as well, so the
# encoding may not be strictly required for the ANOVA test - confirm.
le = LabelEncoder()
y = le.fit_transform(y)

# ANOVA F-test: score every feature and keep the 3 highest-scoring ones.
anova_selector = SelectKBest(score_func=f_classif, k=3).fit(X, y)

anova_scores = anova_selector.scores_
for position, col in enumerate(X.columns):
    print(f"{col}: {anova_scores[position]}")

# Boolean mask over X's columns marking the retained features.
anova_mask = anova_selector.get_support()
selected_features = X.columns[anova_mask]
print("Selected features:", selected_features.tolist())



SepalLengthCm: 119.26450218449871
SepalWidthCm: 47.364461402993975
PetalLengthCm: 1179.0343277002278
PetalWidthCm: 959.3244057257541
Selected features: ['SepalLengthCm', 'PetalLengthCm', 'PetalWidthCm']


In [35]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# mutual_info_classif is a randomized estimator (it perturbs continuous
# features with small noise), so without a fixed seed the scores change from
# run to run. Bind random_state so the ranking below is reproducible.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Score every feature against the target and keep the 3 highest-scoring ones.
mutual_info_selector = SelectKBest(score_func=seeded_mutual_info, k=3)
mutual_info_selector.fit(X, y)

mutual_info_scores = mutual_info_selector.scores_
for col, score in zip(X.columns, mutual_info_scores):
    print(f"{col}: {score}")
# get_support() is a boolean mask over X's columns marking the retained features.
selected_features = X.columns[mutual_info_selector.get_support()]
print("Selected features:", selected_features.tolist())

SepalLengthCm: 0.491388181417189
SepalWidthCm: 0.23652603918884996
PetalLengthCm: 0.9857953229922232
PetalWidthCm: 0.9790401113763305
Selected features: ['SepalLengthCm', 'PetalLengthCm', 'PetalWidthCm']


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base estimator whose fitted coefficients drive the elimination.
model = LogisticRegression(max_iter=1000)

# Recursive feature elimination: repeatedly refit and discard features
# until only 3 remain.
rfe = RFE(estimator=model, n_features_to_select=3).fit(X, y)

kept_by_rfe = X.columns[rfe.support_]
print("Selected features:", list(kept_by_rfe))

Selected features: ['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: the tree breaks ties between equally good splits at random,
# so without random_state the importances (and therefore the features that
# clear the threshold below) can differ between runs.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X, y)

# Importance of each feature in the fitted tree, largest first.
importance = pd.Series(tree.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importance)
# Keep only the features whose importance exceeds 0.05.
selected_features = importance[importance > 0.05].index.tolist()
print("Selected features:", selected_features)

PetalWidthCm     0.922611
PetalLengthCm    0.050723
SepalWidthCm     0.013333
SepalLengthCm    0.013333
dtype: float64
Selected features: ['PetalWidthCm', 'PetalLengthCm']


In [None]:
# PCA for dimensionality reduction: project the features onto 2 components.
from sklearn.decomposition import PCA

pca = PCA(
    n_components=2,
)
# Fit on X and return the transformed (projected) data in one step.
X_pca = pca.fit_transform(X)

In [None]:
# Wrap the two principal components in a DataFrame and attach the target
# as an extra column, then display the result.
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(Target=y)
df_pca


Unnamed: 0,PC1,PC2,Target
0,-2.684207,0.326607,0
1,-2.715391,-0.169557,0
2,-2.889820,-0.137346,0
3,-2.746437,-0.311124,0
4,-2.728593,0.333925,0
...,...,...,...
145,1.944017,0.187415,2
146,1.525664,-0.375021,2
147,1.764046,0.078519,2
148,1.901629,0.115877,2


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Choose ONE classifier to evaluate; uncomment a different line to swap it in.
# model = DecisionTreeClassifier(random_state=42)
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model = KNeighborsClassifier(n_neighbors=3)
model = SVC(kernel='linear', random_state=42)

# Fit on the training split, then predict the held-out split.
y_pred = model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class breakdown of the test-set predictions.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
print("\n")
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# 5-fold cross-validated accuracy computed on the full X / y.
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
print("Cross-validation scores:", cv_scores)





Accuracy: 1.0000
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Cross-validation scores: [0.96666667 1.         0.96666667 0.96666667 1.        ]


In [None]:
# Choose the best K for KNN: for every k in 1..19 compute the mean 5-fold
# cross-validated accuracy, then take the k with the highest mean.
k_range = range(1, 20)
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=5, scoring='accuracy'
    ).mean()
    for k in k_range
]

# argmax returns the position of the first maximum; map it back to its k value.
best_position = np.argmax(cv_scores)
best_k = k_range[best_position]
best_score = max(cv_scores)

print("k:", best_k)
print("cross-validation accuracy:", best_score)

k: 6
cross-validation accuracy: 0.9800000000000001


In [None]:
# Overfitting check: compare the fitted model's accuracy on the training
# split against its accuracy on the held-out split.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# A train-minus-test gap above 0.10 is reported as overfitting.
accuracy_gap = train_accuracy - test_accuracy
print("overfitting" if accuracy_gap > 0.10 else "not overfitting")

not overfitting
