In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import traceback as tb

In [2]:
df = pd.read_csv('train.dat', delimiter='\t', header=None)

In [3]:
df.head()

Unnamed: 0,0,1
0,-1,DVELDLVEISPNALP
1,-1,KADEELFNKLFFGT
2,-1,FLVALHLGTAFALLWYFRKRWCALVRGFFASFGGRRNDDAHMM
3,-1,RDQMRARIADITGVAISRIA
4,-1,RKRLQLLLL


In [4]:
df_results = df[0]

In [56]:
df_results.columns = ['Labels']

In [5]:
df_results

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
1561   -1
1562    1
1563    1
1564   -1
1565   -1
Name: 0, Length: 1566, dtype: int64

In [6]:
df_seq = df[[1]]

In [7]:
df_seq.columns = ['Sequence']

In [8]:
df_seq.head()

Unnamed: 0,Sequence
0,DVELDLVEISPNALP
1,KADEELFNKLFFGT
2,FLVALHLGTAFALLWYFRKRWCALVRGFFASFGGRRNDDAHMM
3,RDQMRARIADITGVAISRIA
4,RKRLQLLLL


In [9]:
def dna_k_mers_generation(dna_sequence, k_mers_length):
    k_mers_list = []
    try:
        for i in range(len(dna_sequence) - k_mers_length + 1):
            k_mer_substring = dna_sequence[i : i + k_mers_length]
            k_mers_list.append(k_mer_substring)
    except Exception as e:
        tb.print_exc()
    return k_mers_list

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6))
X = vectorizer.fit_transform(df_seq['Sequence']).toarray()

In [12]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [52]:
# k_mers_list = []# Range of k-mer lengths (e.g., 2 to 5)
# for k in range(2, 6):
#     df_seq[f'kmers_{k}'] = df_seq['Sequence'].apply(lambda x: dna_k_mers_generation(x, k))

In [13]:
df_seq.head()

Unnamed: 0,Sequence
0,DVELDLVEISPNALP
1,KADEELFNKLFFGT
2,FLVALHLGTAFALLWYFRKRWCALVRGFFASFGGRRNDDAHMM
3,RDQMRARIADITGVAISRIA
4,RKRLQLLLL


In [14]:
from imblearn.over_sampling import SMOTE

y = df_results

smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [86]:
# Check the balance of the classes
# print(pd.Series(y_resampled).value_counts()).to_clipboard()

In [15]:
df_results.head()

0   -1
1   -1
2   -1
3   -1
4   -1
Name: 0, dtype: int64

In [16]:
df_results.value_counts()

0
-1    1424
 1     142
Name: count, dtype: int64

In [17]:
df_resampled = pd.DataFrame(X_resampled)
# df_resampled.to_clipboard(index=False)

In [18]:
df_resampled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92492,92493,92494,92495,92496,92497,92498,92499,92500,92501
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
df_resampled.to_clipboard()

In [19]:
df_resampled.to_csv('features_resampled.csv')
features_resampled = pd.DataFrame(y_resampled)
features_resampled.to_csv('labels_resampled.csv')

In [20]:
df_resampled.shape

(2848, 92502)

In [21]:
y_resampled.shape

(2848,)

In [26]:
features_resampled.columns = ['Labels']

In [27]:
## Validation test creation
combined_df_resampled = pd.concat([df_resampled, features_resampled], axis= 1)
combined_df_resampled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92493,92494,92495,92496,92497,92498,92499,92500,92501,Labels
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1


In [28]:
from sklearn.model_selection import train_test_split

df_resampled_train, df_resampled_val, label_resampled_train, label_resampled_val = train_test_split(df_resampled, features_resampled, test_size= 0.2, random_state=42)

In [29]:
print(df_resampled_train.shape)
print(df_resampled_val.shape)
print(label_resampled_train.shape)
print(label_resampled_val.shape)

(2278, 92502)
(570, 92502)
(2278, 1)
(570, 1)


In [33]:
label_resampled_train

Unnamed: 0,Labels
942,-1
598,-1
2674,1
2210,1
789,1
...,...
1638,1
1095,-1
1130,-1
1294,-1


In [64]:
import numpy as np
import scipy.special

class NeuralNetwork:
    
    def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
        # Set number of nodes in each input, hidden, output layer
        self.inodes = inputnodes
        self.hnodes = hiddennodes
        self.onodes = outputnodes

        # Initialize weights with a normal distribution
        self.wih = np.random.normal(0.0, pow(self.inodes, -0.5), (self.hnodes, self.inodes))
        self.who = np.random.normal(0.0, pow(self.hnodes, -0.5), (self.onodes, self.hnodes))

        # Learning rate
        self.lr = learningrate
        
        # Activation functions
        self.activation_function_hidden = lambda x: np.maximum(0, x)  # ReLU for hidden layers
        self.activation_function_output = lambda x: scipy.special.expit(x)  # Sigmoid for output layer

    def train(self, inputs_list, targets_list):
        # Convert inputs list to 2d array
        inputs = np.array(inputs_list, ndmin=2).T
        targets = np.array(targets_list, ndmin=2).T

        # Forward pass
        hidden_inputs = np.dot(self.wih, inputs)
        hidden_outputs = self.activation_function_hidden(hidden_inputs)
        
        final_inputs = np.dot(self.who, hidden_outputs)
        final_outputs = self.activation_function_output(final_inputs)

        # Error calculation
        output_errors = targets - final_outputs
        hidden_errors = np.dot(self.who.T, output_errors)

        # Backpropagation
        self.who += self.lr * np.dot((output_errors * final_outputs * (1.0 - final_outputs)), np.transpose(hidden_outputs))
        
        # Update hidden layer weights considering ReLU derivative
        self.wih += self.lr * np.dot((hidden_errors * (hidden_outputs > 0).astype(int)), np.transpose(inputs))

    def query(self, inputs_list):
        # Convert inputs list to 2d array
        inputs = np.array(inputs_list, ndmin=2).T

        # Forward pass
        hidden_inputs = np.dot(self.wih, inputs)
        hidden_outputs = self.activation_function_hidden(hidden_inputs)
        
        final_inputs = np.dot(self.who, hidden_outputs)
        final_outputs = self.activation_function_output(final_inputs)
        
        return final_outputs

# Example: Creating a Neural Network
input_nodes = 92502
hidden_nodes = 256   # Adjust as needed
output_nodes = 1
learning_rate = 0.08

nn = NeuralNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)


In [65]:
epochs = 3

for i in range(epochs):
    for i in range(len(df_resampled_train)):
        inputs = df_resampled_train.iloc[i].values
        targets = label_resampled_train.iloc[i].values
        nn.train(inputs, targets)



In [66]:
label_resampled_train.iloc[0,0]

-1

In [67]:
def calculate_mcc(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == -1) & (y_pred == -1))
    fp = np.sum((y_true == -1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == -1))

    numerator = (tp * tn) - (fp * fn)
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + np.finfo(float).eps)

In [68]:
def evaluate_model(nn, X_val, y_val):
    predictions = []

    for i in range(len(X_val)):
        inputs = X_val.iloc[i].values  # Convert DataFrame row to NumPy array
        output = nn.query(inputs)
        predicted_label = 1 if output[0] >= 0.5 else -1
        predictions.append(predicted_label)

    predictions = np.array(predictions)

    if isinstance(y_val, (pd.DataFrame, pd.Series)):
        y_val = y_val.values.flatten()
    mcc_score = calculate_mcc(y_val, predictions)
    return mcc_score

# Calculate MCC for the validation set
mcc_validation = evaluate_model(nn, df_resampled_val, label_resampled_val)
print(f"MCC on Validation Set: {mcc_validation}")

MCC on Validation Set: 0.0
