In [17]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 

In [24]:
def read_and_process_log_file(file_path):
    """Parse a whitespace-separated log file into feature and target arrays.

    Every non-blank line must contain at least five whitespace-separated
    fields. Field 2 is the operation flag (``'r'`` is counted as a read, any
    other value as a write) and fields 3 and 4 are parsed as floats. Fields 0
    and 1 are not used.
    NOTE(review): fields 3 and 4 look like a start offset and a length, since
    the adjacency feature compares field 3 with the previous row's
    field 3 + field 4 -- confirm against the log format.

    Columns of the returned ``input_data``:
        0: field 3 of the line.
        1: field 4 of the line.
        2: running ``read_counter / write_counter`` after this line; both
           counters start at 1, so the ratio is always defined.
        3: 1.0 if this row's column 0 equals the previous row's
           column 0 + column 1, otherwise 0.0 (always 0.0 for the first row).

    Values of the returned ``target_data`` (distance, in rows, back to the most
    recent earlier row with the same column-0 value):
        0: the value has not been seen before.
        1: distance < 1,000.
        2: distance < 10,000.
        3: distance < 100,000.
        4: distance >= 100,000.

    Args:
        file_path: Path of the log file to read.

    Returns:
        Tuple ``(input_data, target_data)`` of NumPy float arrays with shapes
        ``(n, 4)`` and ``(n,)``, where ``n`` is the number of non-blank lines,
        or ``(None, None)`` if the file does not exist.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
        ValueError: If field 3 or field 4 is not a valid float.
    """
    try:
        with open(file_path, "r") as file:
            lines = file.readlines()
    except FileNotFoundError:
        print("Error: Could not open file")
        return None, None

    # Skip whitespace-only lines (e.g. a trailing blank line) instead of
    # failing on them with an IndexError.
    lines = [line for line in lines if line.strip()]

    dataset_size = len(lines)
    print(f"Number of lines: {dataset_size}")

    # np.zeros already provides the defaults: adjacency flag 0, target 0.
    input_data = np.zeros((dataset_size, 4))  # INPUT_SIZE is 4 in this case
    target_data = np.zeros(dataset_size)

    read_counter = 1
    write_counter = 1

    # Maps each column-0 value to the index of the row where it last occurred.
    # One dictionary lookup per row replaces a backwards scan over all earlier
    # rows, which was quadratic in the number of lines.
    last_seen = {}
    previous_end = None  # column 0 + column 1 of the previous row

    for i, line in enumerate(tqdm(lines, desc="Processing Lines")):
        parts = line.split()
        read_write = parts[2]
        start, length = float(parts[3]), float(parts[4])
        input_data[i, 0] = start
        input_data[i, 1] = length

        # Adjacency: this row starts exactly where the previous one ended.
        if previous_end is not None and start == previous_end:
            input_data[i, 3] = 1
        previous_end = start + length

        if read_write == 'r':
            read_counter += 1
        else:
            write_counter += 1
        input_data[i, 2] = read_counter / write_counter

        # Bucket the distance to the most recent earlier row with this value.
        previous_index = last_seen.get(start)
        if previous_index is not None:
            distance = i - previous_index
            if distance < 1000:
                target_data[i] = 1
            elif distance < 10000:
                target_data[i] = 2
            elif distance < 100000:
                target_data[i] = 3
            else:
                target_data[i] = 4
        last_seen[start] = i

    return input_data, target_data

In [None]:
# Log file to load; the path is relative to the notebook's working directory.
file_path = "Data/NewData/FIO_test.log"
input_data, target_data = read_and_process_log_file(file_path)

# read_and_process_log_file reports a missing file by returning (None, None).
# Stop here with a clear error: pd.DataFrame(None, columns=...) would silently
# build an empty frame and the failure would only surface in a later cell.
if input_data is None or target_data is None:
    raise FileNotFoundError(f"Could not read log file: {file_path}")

Number of lines: 1571707


Processing Lines:   0%|          | 0/1571707 [00:00<?, ?it/s]

In [None]:
# Convert input_data and target_data to Pandas DataFrame and Series
def convert_to_pandas(input_data, target_data):
    """Wrap the feature matrix and target vector in labelled pandas objects.

    Args:
        input_data: 2-D array-like with four columns, in the order
            feature1, feature2, read_write_ratio, continuity.
        target_data: 1-D array-like of target values.

    Returns:
        Tuple ``(input_df, target_df)``: a DataFrame holding ``input_data``
        under the four column names above, and a Series named ``'target'``
        holding ``target_data``.
    """
    column_names = ['feature1', 'feature2', 'read_write_ratio', 'continuity']
    features = pd.DataFrame(data=input_data, columns=column_names)
    labels = pd.Series(data=target_data, name='target')
    return features, labels

# Wrap the raw NumPy arrays in labelled pandas containers for the cells below.
input_df, target_df = convert_to_pandas(input_data=input_data, target_data=target_data)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense

# Step 0: Seed TensorFlow so weight initialisation and batch shuffling are
# repeatable across "Restart Kernel -> Run All".
SEED = 42
tf.random.set_seed(SEED)

# Step 1: Convert to Pandas DataFrame and Series
input_df, target_df = convert_to_pandas(input_data, target_data)

# Step 2: Prepare the data (train-test split)
# NOTE(review): train_test_split shuffles rows by default, so the original
# row order of the log is not preserved in either split -- confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    input_df, target_df, test_size=0.2, random_state=SEED
)

# Convert to NumPy arrays for TensorFlow compatibility. Features stay float64
# until after scaling so large raw values do not lose precision.
X_train = np.asarray(X_train, dtype=np.float64)
X_test = np.asarray(X_test, dtype=np.float64)
y_train = np.asarray(y_train, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)

# Standardise every feature with statistics from the training split only, so
# nothing about the test split leaks into training. Constant columns keep a
# divisor of 1 to avoid division by zero.
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0)
feature_std[feature_std == 0] = 1.0
X_train = ((X_train - feature_mean) / feature_std).astype(np.float32)
X_test = ((X_test - feature_mean) / feature_std).astype(np.float32)

# Reshape data to fit the GRU input format (samples, timesteps, features)
# In this case, we can consider each sample as a timestep=1
# NOTE(review): with a single timestep the GRU sees no sequence context.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Step 3: Build the GRU model
# NOTE(review): the target holds the class codes 0-4 produced by
# read_and_process_log_file but is fitted here as a regression (linear output,
# MSE loss). Confirm that is intended rather than a 5-way classifier.
model = Sequential()
model.add(GRU(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False))
model.add(Dense(1, activation='linear'))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Step 4: Train the GRU model
# Validate on a slice of the training data rather than on the test set, so the
# metrics reported in Step 5 come from data the training loop never monitored.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Step 5: Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Step 6: Make predictions (optional)
predictions = model.predict(X_test)