In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/kaggle/input/pgprnon-pgpr/pgpr_non_pgpr_clean.csv')

In [3]:
df.shape

(86446, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86446 entries, 0 to 86445
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         86446 non-null  int64  
 1   Orientation        86446 non-null  object 
 2   Protein accession  86446 non-null  object 
 3   Protein name       86446 non-null  object 
 4   Protein length     86446 non-null  float64
 5   DNA_sequence       86446 non-null  object 
 6   Target             86446 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 4.6+ MB


In [5]:
df.isnull().sum()

Unnamed: 0           0
Orientation          0
Protein accession    0
Protein name         0
Protein length       0
DNA_sequence         0
Target               0
dtype: int64

In [6]:
df = df.drop(columns=['Unnamed: 0', 'Protein accession', 'DNA_sequence'], axis= 1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86446 entries, 0 to 86445
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Orientation     86446 non-null  object 
 1   Protein name    86446 non-null  object 
 2   Protein length  86446 non-null  float64
 3   Target          86446 non-null  float64
dtypes: float64(2), object(2)
memory usage: 2.6+ MB


In [8]:
df['Protein name'].nunique()

14048

In [9]:
df.Target.unique()

array([1., 0.])

In [10]:
# Veri setindeki yinelenen gözlemleri kontrol eder ve bunları kaldırır

def duplicate_values(df):
    print("Duplicate check...")
    num_duplicates = df.duplicated(subset=None, keep='first').sum()
    if num_duplicates > 0:
        print("There are", num_duplicates, "duplicated observations in the dataset.")
        df.drop_duplicates(keep='first', inplace=True)
        print(num_duplicates, "duplicates were dropped!")
        print("No more duplicate rows!")
    else:
        print("There are no duplicated observations in the dataset.")

In [11]:
#duplicate_values(df)

In [12]:
X = df.drop('Target',axis=1)
y = df['Target']

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)

In [14]:
ohe = X_train.select_dtypes("object").columns
num = X_train.select_dtypes("float64").columns

In [15]:
num

Index(['Protein length'], dtype='object')

In [16]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

ohe_enc = OneHotEncoder(handle_unknown='ignore') #OneHotEncoder
std_sca = StandardScaler()
column_trans = make_column_transformer((ohe_enc, ohe),(std_sca, num), remainder='passthrough')

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

operations = [("OneHotEncoder", column_trans),
              ("RF_model", RandomForestClassifier(random_state=11))]

pipe_model = Pipeline(steps=operations)

pipe_model.fit(X_train, y_train)

In [18]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

In [19]:
def calculate_results(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred)
    model_precision, model_recall, model_f1,_ = precision_recall_fscore_support(y_true, y_pred,average="weighted")
    model_results = {"accuracy":model_accuracy,
                     "precision":model_precision,
                     "recall" :model_recall,
                     "f1":model_f1}
    return model_results

In [20]:
calculate_results(y_true=y_test,
                  y_pred=(pipe_model.predict(X_test)))

{'accuracy': 0.6910931174089069,
 'precision': 0.6917918484995064,
 'recall': 0.6910931174089069,
 'f1': 0.6908134549492512}

In [21]:
column_trans.get_feature_names_out()

array(['onehotencoder__Orientation_minus',
       'onehotencoder__Orientation_plus',
       'onehotencoder__Protein name_(2,3-dihydroxybenzoyl)adenylate synthase',
       ...,
       'onehotencoder__Protein name_zincin-like metallopeptidase domain-containing protein',
       'onehotencoder__Protein name_zonular occludens toxin domain-containing protein',
       'standardscaler__Protein length'], dtype=object)

In [22]:
pipe_model["RF_model"].feature_importances_ 

array([8.61534571e-04, 6.77444891e-04, 3.25929779e-05, ...,
       5.49067620e-05, 1.99193182e-05, 3.20117965e-01])

In [23]:
features = pipe_model["OneHotEncoder"].get_feature_names_out()
features

array(['onehotencoder__Orientation_minus',
       'onehotencoder__Orientation_plus',
       'onehotencoder__Protein name_(2,3-dihydroxybenzoyl)adenylate synthase',
       ...,
       'onehotencoder__Protein name_zincin-like metallopeptidase domain-containing protein',
       'onehotencoder__Protein name_zonular occludens toxin domain-containing protein',
       'standardscaler__Protein length'], dtype=object)

In [24]:
new_features = [i.replace("onehotencoder__","").replace("remainder__", "") for i in features]
new_features

['Orientation_minus',
 'Orientation_plus',
 'Protein name_(2,3-dihydroxybenzoyl)adenylate synthase',
 'Protein name_(2,3-dihydroxybenzoyl)adenylate synthase EntE',
 'Protein name_(2E,6E)-farnesyl diphosphate synthase',
 'Protein name_(2E,6E)-farnesyl-diphosphate-specific ditrans,polycis-undecaprenyl-diphosphate synthase',
 'Protein name_(2Fe-2S)-binding protein',
 'Protein name_(3,5-dihydroxyphenyl)acetyl-CoA 1,2-dioxygenase DpgC',
 'Protein name_(3R)-hydroxymyristoyl-ACP dehydratase',
 'Protein name_(4S)-4-hydroxy-5-phosphonooxypentane-2,3-dione isomerase',
 'Protein name_(E)-4-hydroxy-3-methylbut-2-enyl-diphosphate synthase (flavodoxin)',
 'Protein name_(Fe-S)-binding protein',
 'Protein name_(Na+)-NQR maturation NqrM',
 'Protein name_(S)-acetoin forming diacetyl reductase',
 'Protein name_(S)-benzoin forming benzil reductase',
 'Protein name_(S)-ureidoglycine aminohydrolase',
 'Protein name_(d)CMP kinase',
 'Protein name_(dimethylallyl)adenosine tRNA methylthiotransferase',
 'Protei

In [25]:
df_fi = pd.DataFrame(data = pipe_model["RF_model"].feature_importances_, index=new_features, #index=X.columns
                      columns = ["Feature Importance"])
df_fi = df_fi.sort_values("Feature Importance", ascending=False)

df_fi.head(10)

Unnamed: 0,Feature Importance
standardscaler__Protein length,0.320118
Protein name_hypothetical protein,0.029448
Protein name_LysR family transcriptional regulator,0.003857
Protein name_acyl-CoA dehydrogenase family protein,0.001291
Protein name_helix-turn-helix domain-containing protein,0.001177
Protein name_MFS transporter,0.001154
Protein name_ABC transporter ATP-binding protein,0.001054
Protein name_TetR/AcrR family transcriptional regulator,0.001051
Protein name_cytochrome P450,0.000988
Protein name_ATP-binding protein,0.000941
