In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/kaggle/input/pgprnon-pgpr/pgpr_non_pgpr_clean.csv')

In [3]:
df.shape

(86446, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86446 entries, 0 to 86445
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         86446 non-null  int64  
 1   Orientation        86446 non-null  object 
 2   Protein accession  86446 non-null  object 
 3   Protein name       86446 non-null  object 
 4   Protein length     86446 non-null  float64
 5   DNA_sequence       86446 non-null  object 
 6   Target             86446 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 4.6+ MB


In [5]:
df.isnull().sum()

Unnamed: 0           0
Orientation          0
Protein accession    0
Protein name         0
Protein length       0
DNA_sequence         0
Target               0
dtype: int64

In [6]:
df = df.drop(columns=['Unnamed: 0', 'Protein accession', 'DNA_sequence'], axis= 1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86446 entries, 0 to 86445
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Orientation     86446 non-null  object 
 1   Protein name    86446 non-null  object 
 2   Protein length  86446 non-null  float64
 3   Target          86446 non-null  float64
dtypes: float64(2), object(2)
memory usage: 2.6+ MB


In [8]:
df['Protein name'].nunique()

14048

In [9]:
df.Target.unique()

array([1., 0.])

In [10]:
# Veri setindeki yinelenen gözlemleri kontrol eder ve bunları kaldırır

def duplicate_values(df):
    print("Duplicate check...")
    num_duplicates = df.duplicated(subset=None, keep='first').sum()
    if num_duplicates > 0:
        print("There are", num_duplicates, "duplicated observations in the dataset.")
        df.drop_duplicates(keep='first', inplace=True)
        print(num_duplicates, "duplicates were dropped!")
        print("No more duplicate rows!")
    else:
        print("There are no duplicated observations in the dataset.")

In [11]:
duplicate_values(df)

Duplicate check...
There are 28226 duplicated observations in the dataset.
28226 duplicates were dropped!
No more duplicate rows!


In [12]:
X = df.drop('Target',axis=1)
y = df['Target']

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)

In [14]:
ohe = X_train.select_dtypes("object").columns

In [15]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

ohe_enc = OneHotEncoder(handle_unknown='ignore') #OneHotEncoder
column_trans = make_column_transformer((ohe_enc, ohe), remainder='passthrough')

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

operations = [("OneHotEncoder", column_trans),
              ("RF_model", RandomForestClassifier(random_state=11))]

pipe_model = Pipeline(steps=operations)

pipe_model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

In [18]:
def calculate_results(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred)
    model_precision, model_recall, model_f1,_ = precision_recall_fscore_support(y_true, y_pred,average="weighted")
    model_results = {"accuracy":model_accuracy,
                     "precision":model_precision,
                     "recall" :model_recall,
                     "f1":model_f1}
    return model_results

In [20]:
calculate_results(y_true=y_test,
                  y_pred=(pipe_model.predict(X_test)))

{'accuracy': 0.563723806252147,
 'precision': 0.5496897369578281,
 'recall': 0.563723806252147,
 'f1': 0.5510380129179525}

In [21]:
from xgboost import XGBClassifier

In [22]:
operations = [("OneHotEncoder", column_trans),
              ("XGB_model", XGBClassifier(random_state=11))]

pipe_model = Pipeline(steps=operations)

pipe_model.fit(X_train, y_train)

In [23]:
calculate_results(y_true=y_test,
                  y_pred=(pipe_model.predict(X_test)))

{'accuracy': 0.5951563036757128,
 'precision': 0.6221172858893268,
 'recall': 0.5951563036757128,
 'f1': 0.47393217369062074}

# ANN

In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
from keras.layers import Dense, BatchNormalization, Dropout, LSTM
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from keras import callbacks

np.random.seed(0)

2024-06-02 19:15:04.714555: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-02 19:15:04.714704: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-02 19:15:04.879440: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [26]:
#Early stopping
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001, # minimium amount of change to count as an improvement
    patience=20, # how many epochs to wait before stopping
    restore_best_weights=True,
)

# Initialising the NN
model = Sequential()

# layers

model.add(Dense(units = 32, kernel_initializer = 'uniform', activation = 'relu', input_dim = 26))
model.add(Dense(units = 32, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 16, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.25))
model.add(Dense(units = 8, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
opt = Adam(learning_rate=0.00009)
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])

# Train the ANN
history = model.fit(X_train, y_train, batch_size = 32, epochs = 150, callbacks=[early_stopping], validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: could not convert string to float: 'minus'