In [2]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import os

# Ensure the NLTK English stopword list is available and load it into `stop_words`.
#
# Order of attempts:
#   1. Use the corpus if it is already installed locally (no network access).
#   2. Otherwise download it with normal HTTPS certificate verification.
#   3. Only if that download fails, retry once with certificate verification
#      disabled (the original bypass), then restore the default HTTPS context
#      so the rest of the session is not left with verification turned off.
import ssl

try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The corpus is not installed; nltk.download returns False when it fails.
    if not nltk.download('stopwords', quiet=True):
        _verified_https_context = ssl._create_default_https_context
        # Older Python builds may lack the unverified-context factory; skip the bypass there.
        _unverified_https_context = getattr(ssl, '_create_unverified_context', None)
        if _unverified_https_context is not None:
            ssl._create_default_https_context = _unverified_https_context
            try:
                nltk.download('stopwords', quiet=True)
            finally:
                # Always put verification back, even if the retry raised.
                ssl._create_default_https_context = _verified_https_context
    # Raises LookupError if every download attempt failed.
    stop_words = set(stopwords.words('english'))

In [6]:
# Both trace CSVs are expected under ../data/raw, resolved against the
# current working directory of the running kernel.
raw_dir = os.path.join(os.getcwd(), '..', 'data', 'raw')
normal_file, failure_file = (
    os.path.join(raw_dir, csv_name)
    for csv_name in ('normal_trace.csv', 'failure_trace.csv')
)

In [7]:
# -------------------------------------------------------
# Option 1: work with a subset of the data (nrows parameter)
# -------------------------------------------------------
# Announce the quick-test path before the subset is read.
subset_notice = "Loading a small subset (1000 rows each) for testing..."
print(subset_notice)


Loading a small subset (1000 rows each) for testing...


In [8]:
# Read only the first 1000 rows of each trace file (normal first, then failure).
normal_df_subset, failure_df_subset = (
    pd.read_csv(trace_path, nrows=1000)
    for trace_path in (normal_file, failure_file)
)

In [9]:
# Tag each subset with the kind of trace it came from, via a constant 'log_type' column.
for subset_frame, trace_label in (
    (normal_df_subset, 'normal'),
    (failure_df_subset, 'failure'),
):
    subset_frame['log_type'] = trace_label


In [10]:
# Stack the normal rows on top of the failure rows and renumber the index from 0.
subset_frames = [normal_df_subset, failure_df_subset]
combined_subset = pd.concat(subset_frames, ignore_index=True)

# Report the combined size, then preview the first rows.
print("Subset combined shape:", combined_subset.shape)
display(combined_subset.head())

Subset combined shape: (2000, 2157)


Unnamed: 0,TaskID,getFileInfo+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.hdfsfilestatus,RPC:getFileInfo+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.hdfsfilestatus,getBlockLocations+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.locatedblocks,RPC:getBlockLocations+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.locatedblocks,bestNode+success: chosen bestnode = in nodes =,chooseDataNode+success: chosennode = /,OP: new blockSender+success: return(op_status_success),newBlockReader+success: return a valid blockreader,OP: try new BlockReader+success: chosennode =,...,blockSeekTo+ioexception: java.io.ioexception: could not obtain block: blk_file=/user/hadoop/dir_client/localfile_blocks_clientorg.apache.hadoop.hdfs.protocol.locatedblock@afdbof /user/hadoop/dir_client,blockSeekTo+ioexception: java.io.ioexception: could not obtain block: blk_-file=/user/hadoop/dir_client/localfile_blocks_clientorg.apache.hadoop.hdfs.protocol.locatedblock@fa of /user/hadoop/dir_client,blockSeekTo+ioexception: java.io.ioexception: could not obtain block: blk_file=/user/hadoop/dir_client/localfile_blocks_clientorg.apache.hadoop.hdfs.protocol.locatedblock@dcof /user/hadoop/dir_client,Exception+interruptedexception:java.lang.interruptedexception,blockSeekTo+success: chosennode = for blk_of /user/hadoop/localfile_blocks_client,blockSeekTo+success: chosennode = for blk_-of /user/hadoop/localfile_blocks_client,addBlock+fail: org.apache.hadoop.hdfs.server.namenode.notreplicatedyetexception: not replicated yet:/user/hadoop/localfile_blocks_client,RPC:addBlock+exception: remoteexception(org.apache.hadoop.ipc.remoteexception: org.apache.hadoop.hdfs.server.namenode.notreplicatedyetexception: not replicated yet:/user/hadoop/localfile_blocks_client,writeBlock+exception: org.apache.hadoop.util.shell$exitcodeexception: du: cannot access `/home/hadoop/data/dfs.data.dir/current/blk_meta\': no such file or directory,log_type
0,C47C9A2D664ACF66,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
1,2525398BEF2D756B,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
2,8F758E6FAF80F711,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
3,A44931922B99D07C,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
4,5B7F82CA0EEB7A1A,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [11]:
# =======================================================
# Option 2: Process Data in Chunks (chunksize parameter)
# =======================================================
# Number of rows per chunk, passed as `chunksize` to pd.read_csv when the
# full trace files are read. It is defined before the message so the size
# printed below always matches the value actually used.
chunk_size = 1000

print(f"\nProcessing the full dataset in chunks (chunk size = {chunk_size})...")



Processing the full dataset in chunks (chunk size = 1000)...


In [12]:
# Start with two separate, empty accumulators: one for chunks read from the
# normal trace file and one for chunks read from the failure trace file.
normal_chunks, failure_chunks = [], []

In [13]:
def read_labeled_chunks(csv_path, label, rows_per_chunk):
    """Read a CSV file chunk by chunk and tag every chunk with a trace label.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    label : str
        Value written to the 'log_type' column of every chunk.
    rows_per_chunk : int
        Passed to ``pd.read_csv`` as ``chunksize``.

    Returns
    -------
    list of pandas.DataFrame
        The chunks in file order, each with a 'log_type' column set to ``label``.
    """
    labeled_chunks = []
    for chunk in pd.read_csv(csv_path, chunksize=rows_per_chunk):
        chunk['log_type'] = label
        labeled_chunks.append(chunk)
    return labeled_chunks


# Build both lists from scratch in this cell. Appending to lists created in
# another cell would duplicate every chunk each time this cell is re-run.
normal_chunks = read_labeled_chunks(normal_file, 'normal', chunk_size)
failure_chunks = read_labeled_chunks(failure_file, 'failure', chunk_size)


In [14]:
# Stitch the chunks of each trace type back into one DataFrame per type
# (normal first, then failure), renumbering the index from 0.
normal_df_full, failure_df_full = (
    pd.concat(chunk_list, ignore_index=True)
    for chunk_list in (normal_chunks, failure_chunks)
)

# Stack the two full DataFrames: normal rows first, failure rows after.
full_frames = [normal_df_full, failure_df_full]
combined_full = pd.concat(full_frames, ignore_index=True)

# Report the combined size, then preview the first rows.
print("Full combined shape using chunks:", combined_full.shape)
display(combined_full.head())

Full combined shape using chunks: (256584, 2157)


Unnamed: 0,TaskID,getFileInfo+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.hdfsfilestatus,RPC:getFileInfo+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.hdfsfilestatus,getBlockLocations+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.locatedblocks,RPC:getBlockLocations+success: return(ow[class=class org.apache.hadoop.hdfs.protocol.locatedblocks,bestNode+success: chosen bestnode = in nodes =,chooseDataNode+success: chosennode = /,OP: new blockSender+success: return(op_status_success),newBlockReader+success: return a valid blockreader,OP: try new BlockReader+success: chosennode =,...,blockSeekTo+ioexception: java.io.ioexception: could not obtain block: blk_file=/user/hadoop/dir_client/localfile_blocks_clientorg.apache.hadoop.hdfs.protocol.locatedblock@afdbof /user/hadoop/dir_client,blockSeekTo+ioexception: java.io.ioexception: could not obtain block: blk_-file=/user/hadoop/dir_client/localfile_blocks_clientorg.apache.hadoop.hdfs.protocol.locatedblock@fa of /user/hadoop/dir_client,blockSeekTo+ioexception: java.io.ioexception: could not obtain block: blk_file=/user/hadoop/dir_client/localfile_blocks_clientorg.apache.hadoop.hdfs.protocol.locatedblock@dcof /user/hadoop/dir_client,Exception+interruptedexception:java.lang.interruptedexception,blockSeekTo+success: chosennode = for blk_of /user/hadoop/localfile_blocks_client,blockSeekTo+success: chosennode = for blk_-of /user/hadoop/localfile_blocks_client,addBlock+fail: org.apache.hadoop.hdfs.server.namenode.notreplicatedyetexception: not replicated yet:/user/hadoop/localfile_blocks_client,RPC:addBlock+exception: remoteexception(org.apache.hadoop.ipc.remoteexception: org.apache.hadoop.hdfs.server.namenode.notreplicatedyetexception: not replicated yet:/user/hadoop/localfile_blocks_client,writeBlock+exception: org.apache.hadoop.util.shell$exitcodeexception: du: cannot access `/home/hadoop/data/dfs.data.dir/current/blk_meta\': no such file or directory,log_type
0,C47C9A2D664ACF66,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
1,2525398BEF2D756B,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
2,8F758E6FAF80F711,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
3,A44931922B99D07C,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
4,5B7F82CA0EEB7A1A,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
