In [None]:
! pip install ucimlrepo

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory that receives the exported dataset. The trailing slash is required:
# the export cell appends the file name to this string by plain concatenation.
output_loc = "/repos/smote_msfb/public_datasets/yeast/"

In [3]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Yeast" dataset (repository id 110).
yeast = fetch_ucirepo(id=110)

# Unpack the bundle into the feature table and the label table
# (both are pandas DataFrames).
X, y = yeast.data.features, yeast.data.targets

In [4]:
# Print the dataset-level metadata first, then the per-variable information.
for section_name in ("metadata", "variables"):
    print(getattr(yeast, section_name))

{'uci_id': 110, 'name': 'Yeast', 'repository_url': 'https://archive.ics.uci.edu/dataset/110/yeast', 'data_url': 'https://archive.ics.uci.edu/static/public/110/data.csv', 'abstract': 'Predicting the Cellular Localization Sites of Proteins', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1484, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['localization_site'], 'index_col': ['Sequence_Name'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1991, 'last_updated': 'Sat Oct 28 2023', 'dataset_doi': '10.24432/C5KG68', 'creators': ['Kenta Nakai'], 'intro_paper': None, 'additional_info': {'summary': 'Predicted Attribute: Localization site of protein. ( non-numeric ).\r\n\r\nThe references below describe a predecessor to this dataset and its development. They also give results (not cross-validated) for classification by a rule-based expert system with that version of th

In [5]:
# Show the dtype of every feature column.
X.dtypes

mcg    float64
gvh    float64
alm    float64
mit    float64
erl    float64
pox    float64
vac    float64
nuc    float64
dtype: object

In [6]:
# Preview the first three rows of the raw feature table.
X.head(3)

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22


In [7]:
# Binarize every feature at its own 65th percentile: 1 when the value is
# strictly above the column's cutoff, 0 otherwise.
#
# The result is built as a brand-new frame rather than by adding `*_bin`
# columns to `X` and dropping the originals. Writing into `X` triggered
# pandas' SettingWithCopyWarning and made this cell non-idempotent: on a
# second run `X.columns` already contained the `*_bin` columns, so they were
# binarized again and the real features were lost. `X` is now left untouched.
BIN_QUANTILE = 0.65

# Columns to binarize (every feature column).
cols = X.columns

# One cutoff per column, returned as a Series indexed by column name.
cutoffs = X[cols].quantile(BIN_QUANTILE)

# Compare each column against its own cutoff (aligned on column labels),
# convert the booleans to 0/1 and tag the new column names with `_bin`.
X_bin = X[cols].gt(cutoffs, axis="columns").astype(int).add_suffix("_bin")

X_bin.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,mcg_bin,gvh_bin,alm_bin,mit_bin,erl_bin,pox_bin,vac_bin,nuc_bin
0,1,1,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0
2,1,1,0,0,0,0,1,0


In [8]:
# Replace the feature names with positional labels f_0, f_1, ... in column order.
X_bin.columns = [f"f_{position}" for position, _ in enumerate(X_bin.columns)]

In [9]:
# Preview the binarized features under their new f_* names.
X_bin.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7
0,1,1,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0
2,1,1,0,0,0,0,1,0
3,1,0,1,0,0,0,1,0
4,0,0,0,1,0,0,0,0


In [10]:
# Count how many rows fall into each localization_site class.
y.value_counts()

localization_site
CYT                  463
NUC                  429
MIT                  244
ME3                  163
ME2                   51
ME1                   44
EXC                   35
VAC                   30
POX                   20
ERL                    5
dtype: int64

In [11]:
# Binary label: 1 for rows whose localization_site is 'ME2', 0 for all others.
is_me2 = y['localization_site'].eq('ME2')
target = is_me2.astype(int).rename('target')

# Place the features and the label side by side, label last. Both indexes are
# reset first so that rows are matched purely by position.
final_df = pd.concat(
    [part.reset_index(drop=True) for part in (X_bin, target)],
    axis=1,
)

In [12]:
# Preview the assembled table: f_* feature columns followed by `target`.
final_df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,target
0,1,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0
2,1,1,0,0,0,0,1,0,0
3,1,0,1,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0


In [13]:
# Write the table (without the index) as a zip archive whose single member is
# named final_data.csv.
archive_path = output_loc + "final_data.csv.zip"
zip_options = {'method': 'zip', 'archive_name': 'final_data.csv'}
final_df.to_csv(archive_path, index=False, compression=zip_options)