# Step 2: Data Preprocessing

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Print the name of every regular file (sub-directories are skipped) found
# directly inside the current working directory.
current_dir = os.getcwd()

for entry_name in os.listdir(current_dir):
    if not os.path.isfile(os.path.join(current_dir, entry_name)):
        continue
    print(entry_name)

creating_dataframe.ipynb
data_analysis.ipynb
data_preprocessing.ipynb
dt_and_rf_from_scratch.ipynb
helper_functions.py
sklearn_dt_and_rf.ipynb


In [3]:
# Load the Q1 2019 CSV from the sibling 'dataset' directory into a DataFrame
df = pd.read_csv('../dataset/Q1_2019.csv')

In [4]:
df.shape

(9577046, 129)

In [5]:
df.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,,,91.0,...,,,,,,,,,,
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,,,98.0,...,,,,,,,,,,
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,,,99.0,...,,,,,,,,,,
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,,,93.0,...,,,,,,,,,,
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,134.0,103.0,100.0,...,,,,,,,,,,


In [6]:
df.tail()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
9577041,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,134.0,100.0,100.0,...,,,,,,,,,,
9577042,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,,,94.0,...,,,,,,,,,,
9577043,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,,,97.0,...,,,,,,,,,,
9577044,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,134.0,101.0,148.0,...,,,,,,,,,,
9577045,2019-03-31,PL2331LAHDS4TJ,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,133.0,104.0,100.0,...,,,,,,,,,,


### Checking whether any hard drive is recorded more than once in a single day
### Checking the number of unique hard drives on 1st Jan 2019

In [7]:
# Keep only the rows recorded on 1st Jan 2019
is_first_of_january = df['date'] == '2019-01-01'
selected_data = df.loc[is_first_of_january]

In [8]:
# Count how many rows each serial number has within the selected day
drive_counts = selected_data['serial_number'].value_counts()

In [9]:
drive_counts

serial_number
Z305B2QN           1
WD-WXJ1A75F6SXU    1
ZCH077L7           1
ZCH07HQX           1
ZCH077L1           1
                  ..
PL1331LAHD3XAH     1
S301GMGS           1
S301GMGT           1
PL2331LAHDRZ3J     1
Z30271GD           1
Name: count, Length: 106918, dtype: int64

In [10]:
# Pull the per-drive counts out of the Series as a plain NumPy array
counts_array = drive_counts.to_numpy()
counts_array

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

### Check if any hard drive count is not equal to 1

In [11]:
# Every drive should appear exactly once for the selected day
if (counts_array == 1).all():
    print("All counts are equal to 1."
          "\nThis means that each hard drive has only been tested once per day")
else:
    print("There are counts not equal to 1.")

All counts are equal to 1.
This means that each hard drive has only been tested once per day


### Analysing hard drive failures

In [12]:
df['failure'].value_counts()

failure
0    9576618
1        428
Name: count, dtype: int64

In [13]:
# Rows whose 'failure' flag is 1
failed_hard_drives = df[df['failure'] == 1]

# Distinct serial numbers in the whole dataframe vs. distinct serial numbers
# that have at least one row with the failure flag set.
# (The bare expressions that used to follow each assignment were removed: only
# the last expression of a cell is displayed, so they had no effect.)
total_unique_hard_drives = df['serial_number'].nunique()
number_of_failed_hard_drives = failed_hard_drives['serial_number'].nunique()

percentage_of_failures = (number_of_failed_hard_drives / total_unique_hard_drives) * 100

print(f'Total unique hard drives: {total_unique_hard_drives}')
print(f'Number of failed hard drives: {number_of_failed_hard_drives}')
print(f'Percentage of unique hard drives that have failed: {percentage_of_failures:.2f}%')

Total unique hard drives: 115231
Number of failed hard drives: 428
Percentage of unique hard drives that have failed: 0.37%


### Handling missing values

In [14]:
# DataFrame.size is rows * columns as a plain Python int, so it cannot overflow
# the way np.prod(df.shape) can on builds where NumPy's default integer is
# 32-bit (e.g. NumPy 1.x on Windows).
total_cells = df.size
# The ',' in the format spec inserts commas as thousands separators;
# total_cells itself stays numeric instead of being rebound to a string.
print(f'The total number of cells are {total_cells:,}')

The total number of cells are 1,235,438,934


In [15]:
# Number of missing (null) values per column; show the first ten columns
missing_values_count = df.isna().sum()
missing_values_count.head(10)

date                        0
serial_number               0
model                       0
capacity_bytes              0
failure                     0
smart_1_normalized       1534
smart_1_raw              1534
smart_2_normalized    7457196
smart_2_raw           7457196
smart_3_normalized       3363
dtype: int64

In [16]:
def check_missing(df):
    """
    Print how much of a DataFrame is missing.

    Two lines are printed: the absolute number of null cells (with commas as
    thousands separators) and that number as a percentage of all cells,
    rounded to two decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.

    Returns
    -------
    None
        The result is only printed.
    """
    # Count the nulls once and reuse the number for both lines; scanning the
    # whole frame is the expensive part of this function.
    total_missing = int(df.isna().sum().sum())
    print(f'The total number of missing cells are {total_missing:,}')

    # DataFrame.size is rows * columns as a plain Python int, so it cannot
    # overflow on builds where NumPy's default integer is 32-bit.
    total_cells = df.size
    # An empty frame has no cells: report 0% instead of dividing by zero.
    percent_missing = total_missing / total_cells * 100 if total_cells else 0.0
    rounded_percent_missing = round(percent_missing, 2)
    print(f'{rounded_percent_missing}% of the data is missing')

In [17]:
check_missing(df)

The total number of missing cells are 765,053,890
61.93% of the data is missing


### Remove columns that have all null values

In [18]:
# Remember the column labels so the dropped ones can be reported afterwards
columns_before = df.columns
print(f'Initial number of columns of dataframe: {len(columns_before)}\n')

# Discard every column that contains nothing but null values
df = df.dropna(axis='columns', how='all')

columns_after = df.columns
print(f'Number of columns of dataframe after removing columns consisting of entirely null values: {len(columns_after)}\n')

# Labels that were present before the drop but are gone now
removed_columns = set(columns_before).difference(columns_after)
print(f'Columns removed: {removed_columns}')

Initial number of columns of dataframe: 129

Number of columns of dataframe after removing columns consisting of entirely null values: 115

Columns removed: {'smart_13_raw', 'smart_15_raw', 'smart_181_raw', 'smart_201_normalized', 'smart_182_raw', 'smart_179_raw', 'smart_182_normalized', 'smart_179_normalized', 'smart_15_normalized', 'smart_13_normalized', 'smart_255_raw', 'smart_181_normalized', 'smart_201_raw', 'smart_255_normalized'}


In [19]:
check_missing(df)

The total number of missing cells are 630,975,246
57.29% of the data is missing


In [20]:
df.shape

(9577046, 115)

### Only keep columns where the proportion of missing values is less than or equal to 5% of the column data

In [21]:
df.isna().sum()

date                          0
serial_number                 0
model                         0
capacity_bytes                0
failure                       0
                         ...   
smart_251_raw           9576961
smart_252_normalized    9576961
smart_252_raw           9576961
smart_254_normalized    9561418
smart_254_raw           9561418
Length: 115, dtype: int64

In [22]:
df.isna().sum().values

array([      0,       0,       0,       0,       0,    1534,    1534,
       7457196, 7457196,    3363,    3363,    3363,    3363,    3363,
          3363,    3363,    3363, 7457196, 7457196,    1534,    1534,
          3363,    3363, 9467408, 9467408,    1534,    1534, 9575217,
       9575217, 9575217, 9575217, 9208793, 9208793, 9467668, 9467668,
       9467668, 9467668, 9575217, 9575217, 9575217, 9575217, 9575217,
       9575217, 9575217, 9575217, 9575217, 9575217, 7446707, 7446707,
       5136912, 5136912, 2182118, 2182118, 2182118, 2182118, 5136912,
       5136912, 2182118, 2182118, 4866014, 4866014,    1590,    1590,
         54152,   54152,    1534,    1534, 4140124, 4140124, 7398291,
       7398291,    3363,    3363,    3363,    3363,    3363,    3363,
       6404059, 6404059, 9575217, 9575217, 9368731, 9368731, 9368731,
       9368731, 9303958, 9303958, 9368731, 9368731, 9526313, 9526313,
       9368731, 9368731, 9575217, 9575217, 9575217, 9575217, 9575217,
       9575217, 9575

In [23]:
len(df)

9577046

In [24]:
# isna().mean() is the fraction of missing values per column; keep only the
# columns where that fraction is at most 5%
missing_fraction = df.isna().mean()
df = df.loc[:, missing_fraction <= 0.05]

In [25]:
df.shape

(9577046, 33)

In [26]:
check_missing(df)

The total number of missing cells are 177,564
0.06% of the data is missing


In [27]:
df.shape

(9577046, 33)

In [28]:
df.isna().sum()

date                        0
serial_number               0
model                       0
capacity_bytes              0
failure                     0
smart_1_normalized       1534
smart_1_raw              1534
smart_3_normalized       3363
smart_3_raw              3363
smart_4_normalized       3363
smart_4_raw              3363
smart_5_normalized       3363
smart_5_raw              3363
smart_7_normalized       3363
smart_7_raw              3363
smart_9_normalized       1534
smart_9_raw              1534
smart_10_normalized      3363
smart_10_raw             3363
smart_12_normalized      1534
smart_12_raw             1534
smart_192_normalized     1590
smart_192_raw            1590
smart_193_normalized    54152
smart_193_raw           54152
smart_194_normalized     1534
smart_194_raw            1534
smart_197_normalized     3363
smart_197_raw            3363
smart_198_normalized     3363
smart_198_raw            3363
smart_199_normalized     3363
smart_199_raw            3363
dtype: int

### Remove rows that still have any missing values from the remaining columns

In [29]:
# Drop every row that has a null in at least one of the remaining columns
df = df.dropna(axis='index', how='any')

In [30]:
df.shape

(9522894, 33)

In [31]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,83.0,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,99.0,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,100.0,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,100.0,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,100.0,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9577041,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,100.0,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
9577042,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,100.0,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
9577043,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,100.0,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0
9577044,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,100.0,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0


### After dropping rows there are 9,522,894 rows in the dataframe, but the last row is still at index 9,577,045. The index therefore still holds the original row labels and needs to be reset.

In [32]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# labels instead of keeping them as an extra column
df = df.reset_index(drop=True)

In [33]:
df.shape

(9522894, 33)

In [34]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,83.0,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,99.0,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,100.0,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,100.0,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,100.0,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522889,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,100.0,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
9522890,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,100.0,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
9522891,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,100.0,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0
9522892,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,100.0,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0


In [35]:
check_missing(df)

The total number of missing cells are 0
0.0% of the data is missing


In [36]:
df.isna().sum()

date                    0
serial_number           0
model                   0
capacity_bytes          0
failure                 0
smart_1_normalized      0
smart_1_raw             0
smart_3_normalized      0
smart_3_raw             0
smart_4_normalized      0
smart_4_raw             0
smart_5_normalized      0
smart_5_raw             0
smart_7_normalized      0
smart_7_raw             0
smart_9_normalized      0
smart_9_raw             0
smart_10_normalized     0
smart_10_raw            0
smart_12_normalized     0
smart_12_raw            0
smart_192_normalized    0
smart_192_raw           0
smart_193_normalized    0
smart_193_raw           0
smart_194_normalized    0
smart_194_raw           0
smart_197_normalized    0
smart_197_raw           0
smart_198_normalized    0
smart_198_raw           0
smart_199_normalized    0
smart_199_raw           0
dtype: int64

In [28]:
# Checkpoint: write the cleaned dataframe to CSV (index=False leaves the row index out of the file)
df.to_csv('../dataset/preprocessed_q1_2019.csv', index=False)

In [79]:
# Reload the preprocessed CSV; presumably so the steps below can be run
# without repeating the cleaning above
df = pd.read_csv('../dataset/preprocessed_q1_2019.csv')

### Filtering the dataframe to only include drives that have eventually failed

In [37]:
# Serial numbers taken from the rows where the failure flag is set, selected
# with a single .loc[rows, column] lookup
failed_hdds = df.loc[df['failure'] == 1, 'serial_number']

In [38]:
len(failed_hdds)

418

In [39]:
# Keep the full history (all rows) of every drive whose serial number appears
# in failed_hdds, then renumber the index from zero
df = df[df['serial_number'].isin(failed_hdds)].reset_index(drop=True)

In [40]:
df.shape

(18588, 33)

In [41]:
# Checkpoint: write the failed-drives-only dataframe to CSV (index=False leaves the row index out of the file)
df.to_csv('../dataset/failed_hdds.csv', index=False)

In [23]:
# Reload the failed-drives CSV; presumably so the steps below can be run
# without repeating the filtering above
df = pd.read_csv('../dataset/failed_hdds.csv')

### Confirming a given hard drive will eventually fail

In [42]:
df.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
0,2019-01-01,Z304JN7J,ST4000DM000,4000787030016,0,119.0,208663312.0,92.0,0.0,100.0,...,97.0,7273.0,23.0,23.0,100.0,0.0,100.0,0.0,200.0,0.0
1,2019-01-01,ZCH05KWT,ST12000NM0007,12000138625024,0,79.0,155920208.0,98.0,0.0,100.0,...,100.0,633.0,40.0,40.0,100.0,88.0,100.0,88.0,200.0,0.0
2,2019-01-01,ZA16DSXV,ST8000NM0055,8001563222016,0,77.0,54823024.0,98.0,0.0,100.0,...,93.0,14936.0,29.0,29.0,100.0,0.0,100.0,0.0,200.0,0.0
3,2019-01-01,ZCH080T0,ST12000NM0007,12000138625024,0,84.0,235330528.0,94.0,0.0,100.0,...,100.0,1501.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
4,2019-01-01,ZA1890DY,ST8000NM0055,8001563222016,0,67.0,240427784.0,95.0,0.0,100.0,...,93.0,14353.0,38.0,38.0,100.0,0.0,100.0,0.0,200.0,0.0


In [43]:
df[df['serial_number'] == 'Z304JN7J']

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
0,2019-01-01,Z304JN7J,ST4000DM000,4000787030016,0,119.0,208663312.0,92.0,0.0,100.0,...,97.0,7273.0,23.0,23.0,100.0,0.0,100.0,0.0,200.0,0.0
403,2019-01-02,Z304JN7J,ST4000DM000,4000787030016,0,118.0,186070184.0,92.0,0.0,100.0,...,97.0,7273.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0
802,2019-01-03,Z304JN7J,ST4000DM000,4000787030016,0,118.0,178727040.0,92.0,0.0,100.0,...,97.0,7273.0,23.0,23.0,100.0,0.0,100.0,0.0,200.0,0.0
1196,2019-01-04,Z304JN7J,ST4000DM000,4000787030016,0,112.0,48739088.0,92.0,0.0,100.0,...,97.0,7273.0,24.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
1585,2019-01-05,Z304JN7J,ST4000DM000,4000787030016,0,114.0,59957400.0,92.0,0.0,100.0,...,97.0,7273.0,24.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
1973,2019-01-06,Z304JN7J,ST4000DM000,4000787030016,0,116.0,114838640.0,92.0,0.0,100.0,...,97.0,7273.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
2358,2019-01-07,Z304JN7J,ST4000DM000,4000787030016,0,120.0,2121928.0,92.0,0.0,100.0,...,97.0,7273.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
2741,2019-01-08,Z304JN7J,ST4000DM000,4000787030016,0,118.0,168567288.0,92.0,0.0,100.0,...,97.0,7273.0,23.0,23.0,100.0,0.0,100.0,0.0,200.0,0.0
3117,2019-01-09,Z304JN7J,ST4000DM000,4000787030016,0,116.0,106967816.0,92.0,0.0,100.0,...,97.0,7273.0,24.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
3486,2019-01-10,Z304JN7J,ST4000DM000,4000787030016,0,119.0,231100456.0,92.0,0.0,100.0,...,97.0,7273.0,24.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0


### Analysing failed hard drives

In [44]:
# Rows whose 'failure' flag is 1
failed_hard_drives = df[df['failure'] == 1]

# Distinct serial numbers in the dataframe vs. distinct serial numbers with a
# failure row. df was filtered earlier to drives that have a failure row, so
# the two numbers are expected to match.
# (The bare expressions that used to follow each assignment were removed: only
# the last expression of a cell is displayed, so they had no effect.)
total_unique_hard_drives = df['serial_number'].nunique()
number_of_failed_hard_drives = failed_hard_drives['serial_number'].nunique()

percentage_of_failures = (number_of_failed_hard_drives / total_unique_hard_drives) * 100

print(f'Total unique hard drives: {total_unique_hard_drives}')
print(f'Number of failed hard drives: {number_of_failed_hard_drives}')
print(f'Percentage of unique hard drives that have failed: {percentage_of_failures:.2f}%')

Total unique hard drives: 418
Number of failed hard drives: 418
Percentage of unique hard drives that have failed: 100.00%


### Finding the Remaining Useful Life (RUL)

In [45]:
# For every row, store the latest date recorded for that row's serial number.
# NOTE(review): 'date' appears to still hold YYYY-MM-DD strings at this point,
# for which the lexicographic max is also the chronological max - confirm.
latest_date_per_drive = df.groupby('serial_number')['date'].transform('max')
df['end_date'] = latest_date_per_drive

In [46]:
df['end_date'].dtype

dtype('O')

In [47]:
# Parse both date columns into pandas datetimes so they can be subtracted
df = df.assign(
    date=pd.to_datetime(df['date']),
    end_date=pd.to_datetime(df['end_date']),
)

In [48]:
df['date'].dtype

dtype('<M8[ns]')

In [49]:
df['end_date'].dtype

dtype('<M8[ns]')

In [50]:
# Time left between each record and the last date recorded for that drive
df['date_diff'] = df['end_date'].sub(df['date'])
df['date_diff'].describe()

count                         18588
mean     29 days 11:31:57.366042608
std      21 days 08:23:28.118660467
min                 0 days 00:00:00
25%                11 days 00:00:00
50%                26 days 00:00:00
75%                45 days 00:00:00
max                88 days 00:00:00
Name: date_diff, dtype: object

In [51]:
df['date_diff'].dtype

dtype('<m8[ns]')

In [52]:
# Convert 'date_diff' to integer representing the number of days.
# .dt.days takes the whole-day component of each timedelta, i.e. the number of
# days between the record and the last date recorded for that drive.
df['date_diff_days'] = df['date_diff'].dt.days

In [53]:
df['date_diff_days'].dtype

dtype('int64')

In [54]:
# Remove the identifier and intermediate date columns; columns= targets
# column labels directly
columns_to_drop = ['date', 'serial_number', 'model', 'end_date', 'date_diff']
df = df.drop(columns=columns_to_drop)

In [55]:
df.head()

Unnamed: 0,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,smart_4_raw,smart_5_normalized,smart_5_raw,...,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw,date_diff_days
0,4000787030016,0,119.0,208663312.0,92.0,0.0,100.0,9.0,100.0,0.0,...,7273.0,23.0,23.0,100.0,0.0,100.0,0.0,200.0,0.0,26
1,12000138625024,0,79.0,155920208.0,98.0,0.0,100.0,2.0,100.0,24.0,...,633.0,40.0,40.0,100.0,88.0,100.0,88.0,200.0,0.0,26
2,8001563222016,0,77.0,54823024.0,98.0,0.0,100.0,2.0,100.0,0.0,...,14936.0,29.0,29.0,100.0,0.0,100.0,0.0,200.0,0.0,52
3,12000138625024,0,84.0,235330528.0,94.0,0.0,100.0,5.0,100.0,0.0,...,1501.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0,83
4,8001563222016,0,67.0,240427784.0,95.0,0.0,100.0,4.0,100.0,3896.0,...,14353.0,38.0,38.0,100.0,0.0,100.0,0.0,200.0,0.0,27


In [38]:
# Write the final dataframe, including the 'date_diff_days' column, to CSV (index=False leaves the row index out of the file)
df.to_csv('../dataset/failed_hdds_with_rul.csv', index=False)