In [5]:
import numpy as np

In [7]:
import kagglehub

# Download latest version
path= kagglehub.dataset_download("tarktunataalt/breast-tissue-impedance-measurements")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/breast-tissue-impedance-measurements


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
import pandas as pd

In [14]:
import os

base_path = "/kaggle/input/breast-tissue-impedance-measurements"

for root, dirs, files in os.walk(base_path):
    for file in files:
        print(os.path.join(root, file))


/kaggle/input/breast-tissue-impedance-measurements/data.csv


In [16]:

df=pd.read_csv('/kaggle/input/breast-tissue-impedance-measurements/data.csv')
df

Unnamed: 0,Class,I0,PA500,HFS,DA,Area,A.DA,Max.IP,DR,P
0,car,524.794072,0.187448,0.032114,228.800228,6843.598481,29.910803,60.204880,220.737212,556.828334
1,car,330.000000,0.226893,0.265290,121.154201,3163.239472,26.109202,69.717361,99.084964,400.225776
2,car,551.879287,0.232478,0.063530,264.804935,11888.391827,44.894903,77.793297,253.785300,656.769449
3,car,380.000000,0.240855,0.286234,137.640111,5402.171180,39.248524,88.758446,105.198568,493.701814
4,car,362.831266,0.200713,0.244346,124.912559,3290.462446,26.342127,69.389389,103.866552,424.796503
...,...,...,...,...,...,...,...,...,...,...
101,adi,2000.000000,0.106989,0.105418,520.222649,40087.920984,77.059161,204.090347,478.517223,2088.648870
102,adi,2600.000000,0.200538,0.208043,1063.441427,174480.476218,164.071543,418.687286,977.552367,2664.583623
103,adi,1600.000000,0.071908,-0.066323,436.943603,12655.342135,28.963331,103.732704,432.129749,1475.371534
104,adi,2300.000000,0.045029,0.136834,185.446044,5086.292497,27.427344,178.691742,49.593290,2480.592151


In [28]:
X = df[['I0', 'PA500', 'HFS', 'Area', 'DA', 'DR']]
y = df['Class']

In [31]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state=42)

In [33]:
pipeline=Pipeline([
    ('imputer',SimpleImputer()),
    ('scaler',StandardScaler()),
    ('model',RandomForestClassifier())
])

In [34]:
pipeline.fit(X_train, y_train)

In [35]:
y_pred=pipeline.predict(X_test)

In [36]:
print('predicted',y_pred)
print('actual',y_test)

predicted ['adi' 'car' 'car' 'con' 'adi' 'fad' 'mas' 'mas' 'adi' 'car' 'mas' 'mas'
 'car' 'con' 'car' 'fad' 'gla' 'con' 'gla' 'gla' 'car' 'con']
actual 100    adi
10     car
4      car
81     con
97     adi
65     gla
30     fad
33     fad
93     adi
11     car
47     mas
42     mas
0      car
78     con
18     car
64     gla
67     gla
79     con
55     gla
44     mas
12     car
80     con
Name: Class, dtype: object


In [37]:
print(accuracy_score(y_test,y_pred))

0.7727272727272727


In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
grid_pipeline=Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('model',RandomForestClassifier())
])

In [49]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['sqrt'],
    'model__bootstrap': [True]
}

In [50]:
grid=GridSearchCV(
    grid_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

In [52]:
grid.fit(X_train,y_train)
print('best cv score', grid.best_score_)
print('best parameter', grid.best_params_)

best cv score 0.6316176470588235
best parameter {'model__bootstrap': True, 'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}


In [54]:
test_sample = np.array([[
    380.0,   # I0
    -0.45,   # PA500
    0.82,    # HFS
    120.5,   # AREA
    35.0,    # DA
    18.2     # DR
]])
prediction=pipeline.predict(test_sample)
print(prediction)

['gla']




In [56]:
import joblib

In [58]:
joblib.dump(pipeline,'bresat_cancer_type.joblib')

['bresat_cancer_type.joblib']