In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from numpy import pi

In [4]:
# Read the raw CSV and discard the 'ID' column in one chained expression,
# then preview the first rows.
raw_csv_path = "/kaggle/input/darwin-asif/data.csv"
df = pd.read_csv(raw_csv_path).drop(columns=['ID'])
df.head()

Unnamed: 0,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,mean_jerk_in_air1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
0,5160,1.3e-05,120.804174,86.853334,957,6601,0.3618,0.217459,103.828754,0.051836,...,0.141434,0.024471,5.596487,3.184589,71,40120,1749.278166,296102.7676,144605,P
1,51980,1.6e-05,115.318238,83.448681,1694,6998,0.272513,0.14488,99.383459,0.039827,...,0.049663,0.018368,1.665973,0.950249,129,126700,1504.768272,278744.285,298640,P
2,2600,1e-05,229.933997,172.761858,2333,5802,0.38702,0.181342,201.347928,0.06422,...,0.178194,0.017174,4.000781,2.392521,74,45480,1431.443492,144411.7055,79025,P
3,2130,1e-05,369.403342,183.193104,1756,8159,0.556879,0.164502,276.298223,0.090408,...,0.113905,0.01986,4.206746,1.613522,123,67945,1465.843329,230184.7154,181220,P
4,2310,7e-06,257.997131,111.275889,987,4732,0.266077,0.145104,184.63651,0.037528,...,0.121782,0.020872,3.319036,1.680629,92,37285,1841.702561,158290.0255,72575,P


In [5]:
# Separate the label column ('class') from the feature columns.
y = df['class']
X = df.drop(columns=['class'])

In [6]:
# For every feature column, cross-tabulate it against the class label and
# keep the p-value of the chi-square test of independence. The p-values are
# collected in column order.
# NOTE(review): many columns look continuous in df.head(); the contingency
# table then has one column per distinct value, so confirm that a chi-square
# test is appropriate for those features.
chi_ls = []

for feature, values in X.items():
    # Rows are class labels, columns are the distinct values of the feature.
    c = pd.crosstab(y, values)

    # Element 1 of the chi2_contingency result is the p-value.
    p_value = chi2_contingency(c)[1]
    chi_ls.append(p_value)

In [7]:
# Label each p-value with its feature name, order them from smallest to
# largest, and keep the names of the first four.
p_values = pd.Series(chi_ls, index=X.columns)
selected = p_values.sort_values(ascending=True).iloc[:4].index

selected

Index(['num_of_pendown23', 'num_of_pendown19', 'num_of_pendown18',
       'num_of_pendown4'],
      dtype='object')

In [8]:
# Keep only the selected feature columns (all rows) and preview the result.
X = X.loc[:, selected]
X.head()

Unnamed: 0,num_of_pendown23,num_of_pendown19,num_of_pendown18,num_of_pendown4
0,12,43,3,11
1,14,82,6,5
2,12,42,4,1
3,11,78,6,9
4,12,53,4,9


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the selected features into the range [0, 12] and wrap the
# resulting array in a DataFrame that carries the original column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook; confirm that this is intended.
scaler = MinMaxScaler(feature_range=(0, 12))
X_scaled = scaler.fit(X).transform(X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

In [10]:
# Sanity-check the scaling: show a few rows and the global extrema
# (minimum of the per-column minima, maximum of the per-column maxima).
overall_min = X_scaled_df.min().min()
overall_max = X_scaled_df.max().max()

print("Sample of scaled data:")
print(X_scaled_df.head())
print(f"Min value after scaling: {overall_min}")
print(f"Max value after scaling: {overall_max}")

Sample of scaled data:
   num_of_pendown23  num_of_pendown19  num_of_pendown18  num_of_pendown4
0               2.4              4.08          0.418605         1.578947
1               4.0              8.76          0.837209         0.631579
2               2.4              3.96          0.558140         0.000000
3               1.6              8.28          0.837209         1.263158
4               2.4              5.28          0.558140         1.263158
Min value after scaling: 0.0
Max value after scaling: 12.000000000000004


In [11]:
# Alias (not a copy) of the scaled feature frame; this is the input that the
# Kernel PCA step below is fitted on.
X_selected=X_scaled_df

In [12]:
from sklearn.decomposition import KernelPCA

# Project the scaled features onto two components with an RBF-kernel PCA.
# NOTE(review): the projection is fitted on all rows, before the train/test
# split performed later in the notebook; confirm that this is intended.
kpca = KernelPCA(kernel='rbf', n_components=2)
X_kpca = kpca.fit_transform(X_selected)

kpca_shape = X_kpca.shape
print(f"Shape after Kernel PCA: {kpca_shape}")

Shape after Kernel PCA: (174, 2)


In [13]:
# Combine the two Kernel PCA components with the class label in one frame.
reduced_df = pd.DataFrame(X_kpca, columns=['KPCA1', 'KPCA2'])
# Assign the labels positionally: the rows of X_kpca are in the same order as
# y, but reduced_df has a fresh 0..n-1 index, so index-aligned assignment
# would silently produce NaNs if y ever carried a different index.
reduced_df['target'] = y.to_numpy()

# Create kaggle directory if it doesn't exist.
# exist_ok=True makes this a single call with no check-then-create gap,
# so the cell can be re-run safely.
kaggle_dir = '/kaggle/working/'
os.makedirs(kaggle_dir, exist_ok=True)

In [14]:
# Write the reduced dataset (without the index column) into the working
# directory and report where it was saved.
csv_name = 'KPCA_darwin_reduced.csv'
csv_path = os.path.join(kaggle_dir, csv_name)
reduced_df.to_csv(csv_path, index=False)
print(f"Reduced dataset saved to: {csv_path}")

Reduced dataset saved to: /kaggle/working/KPCA_darwin_reduced.csv


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X_kpca, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (139, 2)
Testing set shape: (35, 2)


In [17]:
from sklearn.svm import SVC

# Baseline model: a linear-kernel SVM fitted on the Kernel PCA components
# of the training rows.
svm = SVC(random_state=42, kernel='linear')
svm.fit(X_train, y_train)

In [18]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Hard class predictions plus continuous scores for the ROC AUC.
# Despite its name, y_prob holds decision_function scores, not probabilities;
# the SVM was fitted without probability estimates, so predict_proba is not
# used here.
y_pred = svm.predict(X_test)
y_prob = svm.decision_function(X_test)

# Calculate accuracy, AUC and the per-class report on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
# The AUC was previously computed but never shown; report it with the
# other metrics.
print(f"AUC: {auc_value:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.6286
Classification Report:
              precision    recall  f1-score   support

           H       0.56      0.67      0.61        15
           P       0.71      0.60      0.65        20

    accuracy                           0.63        35
   macro avg       0.63      0.63      0.63        35
weighted avg       0.64      0.63      0.63        35



# Classical Ensemble

In [19]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Level-0 estimators for the stacking ensemble, as (name, estimator) pairs:
# an SVC without probability estimates and a decision tree.
svm_base = SVC(probability=False, random_state=41)
tree_base = DecisionTreeClassifier(random_state=42)

base_models = [('svm', svm_base), ('tree', tree_base)]

In [21]:
# Final estimator that combines the outputs of the base models.
meta_model = LogisticRegression()

# Build the stacking ensemble and fit it on the training split.
stacking = StackingClassifier(
    final_estimator=meta_model,
    estimators=base_models,
)
stacking.fit(X_train, y_train)

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
# Imported here so the cell does not rely on an earlier cell's imports.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Predict on the test set.
y_pred = stacking.predict(X_test)
# Column 1 of predict_proba is the probability of the second class label.
y_pred_proba = stacking.predict_proba(X_test)[:, 1]

# Report the same summary metrics as the SVM evaluation so the two models can
# be compared; the probabilities above were previously computed but unused.
# Distinct names keep the SVM's `accuracy` / `auc_value` intact.
stacking_accuracy = accuracy_score(y_test, y_pred)
stacking_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {stacking_accuracy:.4f}")
print(f"AUC: {stacking_auc:.4f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           H       0.58      0.73      0.65        15
           P       0.75      0.60      0.67        20

    accuracy                           0.66        35
   macro avg       0.66      0.67      0.66        35
weighted avg       0.68      0.66      0.66        35

