### Importing initial modules

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
np.__version__

'1.26.1'

In [3]:
pd.__version__

'2.1.4'

In [4]:
os.getcwd()

'C:\\Users\\baari\\Documents\\uni\\y3\\project\\final_project\\hard-drive-predictive-maintenance'

### Changing to the correct directory if not already

In [5]:
path = '/Users/baari/Documents/uni/y3/project/final_project/hard-drive-predictive-maintenance'
os.chdir(path)
os.getcwd()

'C:\\Users\\baari\\Documents\\uni\\y3\\project\\final_project\\hard-drive-predictive-maintenance'

In [6]:
df = pd.read_csv('data/Q1_2019.csv')

In [7]:
df.shape

(9577046, 129)

In [8]:
df.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,,,91.0,...,,,,,,,,,,
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,,,98.0,...,,,,,,,,,,
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,,,99.0,...,,,,,,,,,,
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,,,93.0,...,,,,,,,,,,
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,134.0,103.0,100.0,...,,,,,,,,,,


In [9]:
df.tail()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
9577041,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,134.0,100.0,100.0,...,,,,,,,,,,
9577042,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,,,94.0,...,,,,,,,,,,
9577043,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,,,97.0,...,,,,,,,,,,
9577044,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,134.0,101.0,148.0,...,,,,,,,,,,
9577045,2019-03-31,PL2331LAHDS4TJ,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,133.0,104.0,100.0,...,,,,,,,,,,


In [10]:
df['failure'].value_counts()

failure
0    9576618
1        428
Name: count, dtype: int64

### Handling missing values

In [11]:
total_cells = np.prod(df.shape)
# The : in the f-string formats the number with commas as thousands separators
total_cells = f'{total_cells:,}'
print(f'The total number of cells are {total_cells}')

The total number of cells are 1,235,438,934


In [12]:
# Checks the number of missing (null) values in each column
missing_values_count = df.isnull().sum()
missing_values_count[0:10]

date                        0
serial_number               0
model                       0
capacity_bytes              0
failure                     0
smart_1_normalized       1534
smart_1_raw              1534
smart_2_normalized    7457196
smart_2_raw           7457196
smart_3_normalized       3363
dtype: int64

In [13]:
total_missing = missing_values_count.sum()
total_missing = f'{total_missing:,}'
print(f'The total number of missing cells are {total_missing}')

total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()
percent_missing = total_missing / total_cells * 100
print(f'{percent_missing}% of the data is missing')

The total number of missing cells are 765,053,890
61.925674264042584% of the data is missing


In [14]:
columns_before = df.columns
print(f'Initial number of columns of dataframe: {df.shape[1]}')

# Remove columns that have all null values
# df = df.dropna(axis=1, how='all')
df = df.loc[:, ~df.isnull().all()]

columns_after = df.columns
print(f'Number of columns of dataframe after removing columns consisting of entirely null values: {df.shape[1]}')

removed_columns = set(columns_before) - set(columns_after)
print(f'Columns removed: {removed_columns}')

Initial number of columns of dataframe: 129
Number of columns of dataframe after removing columns consisting of entirely null values: 115
Columns removed: {'smart_255_normalized', 'smart_182_raw', 'smart_15_raw', 'smart_201_normalized', 'smart_182_normalized', 'smart_179_normalized', 'smart_13_raw', 'smart_255_raw', 'smart_181_normalized', 'smart_15_normalized', 'smart_181_raw', 'smart_201_raw', 'smart_179_raw', 'smart_13_normalized'}


In [15]:
total_missing = f'{df.isnull().sum().sum():,}'
print(f'The total number of missing cells are {total_missing}')

total_cells = np.prod(df.shape)
total_missing = df.isnull().sum().sum()
percent_missing = total_missing / total_cells * 100
print(f'{percent_missing}% of the data is missing')

The total number of missing cells are 630,975,246
57.290538957056455% of the data is missing


In [16]:
df.shape

(9577046, 115)

In [17]:
len(df)

9577046

In [18]:
# Gets the columns where the proportion of missing values is less than or equal to 5%
df = df[df.columns[(df.isna().sum().values/len(df) <= 0.05)]]

In [19]:
total_missing = f'{df.isnull().sum().sum():,}'
print(f'The total number of missing cells are {total_missing}')

total_cells = np.prod(df.shape)
total_missing = df.isnull().sum().sum()
percent_missing = total_missing / total_cells * 100
print(f'{percent_missing}% of the data is missing')

The total number of missing cells are 177,564
0.056183579704297895% of the data is missing


In [20]:
# Remove rows that still have any missing values in those selected columns
df = df.dropna()
df.shape

(9522894, 33)

In [21]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,83.0,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,99.0,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,100.0,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,100.0,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,100.0,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9577041,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,100.0,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
9577042,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,100.0,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
9577043,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,100.0,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0
9577044,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,100.0,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0


### After dropping some rows there are 9,522,894 rows in the dataframe but the last row is at index 9,577,045. So that means the index may still contain the original values.

In [22]:
# Reset the index and drop the old index
df = df.reset_index(drop=True)

In [23]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,83.0,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,99.0,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,100.0,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,100.0,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,100.0,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522889,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,100.0,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0
9522890,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,100.0,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0
9522891,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,100.0,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0
9522892,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,100.0,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0


In [24]:
total_missing = f'{df.isnull().sum().sum():,}'
print(f'The total number of missing cells are {total_missing}')

total_cells = np.prod(df.shape)
total_missing = df.isnull().sum().sum()
percent_missing = total_missing / total_cells * 100
print(f'{percent_missing}% of the data is missing')

The total number of missing cells are 0
0.0% of the data is missing


In [25]:
df['date'].dtype

dtype('O')

In [26]:
df['date'] = pd.to_datetime(df['date'])

In [27]:
df['date'].dtype

dtype('<M8[ns]')

In [28]:
df.to_csv('data/cleaned_and_processed_q1_data.csv', index=False)

# Finding the Remaining Useful Life (RUL)

In [54]:
# Calculates the maximum value (latest date) for each group (serial number)
df["end_date"] = df.groupby("serial_number")["date"].transform("max")

In [55]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw,end_date
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522889,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
9522890,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
9522891,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
9522892,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31


In [56]:
df["end_date"] = pd.to_datetime(df["end_date"])
df["date"] = pd.to_datetime(df["date"])

In [57]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw,end_date
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522889,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
9522890,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
9522891,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31
9522892,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31


In [58]:
df["date_diff"] = df["end_date"] - df["date"]
df["date_diff"].describe()

count                       9522894
mean     43 days 08:10:36.970063933
std      26 days 00:13:31.700161596
min                 0 days 00:00:00
25%                21 days 00:00:00
50%                43 days 00:00:00
75%                66 days 00:00:00
max                89 days 00:00:00
Name: date_diff, dtype: object

In [59]:
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw,end_date,date_diff
0,2019-01-01,Z305B2QN,ST4000DM000,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,...,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,89 days
1,2019-01-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,...,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,89 days
2,2019-01-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,...,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,89 days
3,2019-01-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,...,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,89 days
4,2019-01-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,100.0,436.0,100.0,...,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,89 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522889,2019-03-31,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,100.0,0.0,100.0,...,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,0 days
9522890,2019-03-31,ZA10MCEQ,ST8000DM002,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,...,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,0 days
9522891,2019-03-31,ZCH0CRTK,ST12000NM0007,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,...,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,0 days
9522892,2019-03-31,PL1331LAHD1T5H,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,148.0,490.0,100.0,...,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0,2019-03-31,0 days


In [63]:
df['date'].dtype

dtype('<M8[ns]')

In [64]:
df['end_date'].dtype

dtype('<M8[ns]')

In [62]:
df['date_diff'].dtype

dtype('<m8[ns]')

In [61]:
df['date_diff'].dt.days

0          89
1          89
2          89
3          89
4          89
           ..
9522889     0
9522890     0
9522891     0
9522892     0
9522893     0
Name: date_diff, Length: 9522894, dtype: int64

## Dropping unwanted columns

In [31]:
# axis=1 to drop columns
df = df.drop(['date', 'serial_number', 'model', 'end_date'], axis=1)
df

Unnamed: 0,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,smart_4_raw,smart_5_normalized,smart_5_raw,...,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw,date_diff
0,4000787030016,0,111.0,35673128.0,91.0,0.0,100.0,12.0,100.0,0.0,...,34169.0,20.0,20.0,100.0,0.0,100.0,0.0,200.0,0.0,89 days
1,12000138625024,0,83.0,187116872.0,98.0,0.0,100.0,2.0,100.0,0.0,...,2145.0,28.0,28.0,100.0,0.0,100.0,0.0,200.0,0.0,89 days
2,12000138625024,0,73.0,19599104.0,99.0,0.0,100.0,1.0,100.0,0.0,...,363.0,34.0,34.0,100.0,0.0,100.0,0.0,200.0,0.0,89 days
3,12000138625024,0,81.0,136943696.0,93.0,0.0,100.0,6.0,100.0,0.0,...,687.0,22.0,22.0,100.0,0.0,100.0,0.0,200.0,0.0,89 days
4,4000787030016,0,100.0,0.0,100.0,436.0,100.0,8.0,100.0,0.0,...,183.0,193.0,31.0,100.0,0.0,100.0,0.0,200.0,0.0,89 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522889,4000787030016,0,100.0,0.0,100.0,0.0,100.0,3.0,100.0,0.0,...,3.0,250.0,24.0,100.0,0.0,100.0,0.0,200.0,0.0,0 days
9522890,8001563222016,0,72.0,15233376.0,94.0,0.0,100.0,3.0,100.0,0.0,...,152.0,25.0,25.0,100.0,0.0,100.0,0.0,200.0,0.0,0 days
9522891,12000138625024,0,81.0,122099464.0,97.0,0.0,100.0,3.0,100.0,0.0,...,677.0,26.0,26.0,100.0,0.0,100.0,0.0,200.0,0.0,0 days
9522892,4000787030016,0,100.0,0.0,148.0,490.0,100.0,18.0,100.0,0.0,...,303.0,181.0,33.0,100.0,0.0,100.0,0.0,200.0,0.0,0 days


In [32]:
df.columns

Index(['capacity_bytes', 'failure', 'smart_1_normalized', 'smart_1_raw',
       'smart_3_normalized', 'smart_3_raw', 'smart_4_normalized',
       'smart_4_raw', 'smart_5_normalized', 'smart_5_raw',
       'smart_7_normalized', 'smart_7_raw', 'smart_9_normalized',
       'smart_9_raw', 'smart_10_normalized', 'smart_10_raw',
       'smart_12_normalized', 'smart_12_raw', 'smart_192_normalized',
       'smart_192_raw', 'smart_193_normalized', 'smart_193_raw',
       'smart_194_normalized', 'smart_194_raw', 'smart_197_normalized',
       'smart_197_raw', 'smart_198_normalized', 'smart_198_raw',
       'smart_199_normalized', 'smart_199_raw', 'date_diff'],
      dtype='object')

# Emulate BackBlaze's method to calculate the Annual Failure Rate (AFR) from their .sql files

# Computing failure rates

### Creating a dataframe that has the number of drive days for each model. Drive days refers to the number of days a hard drive has been running (the number of rows in the main dataframe for that model).

### Written in SQL like this:

CREATE TABLE drive_days AS 
    SELECT model, count(*) AS drive_days 
    FROM drive_stats 
    GROUP BY model;

In [53]:
# Groups the dataframe by the 'model' column and calculates the size (number of rows) for each group
# The .size() gets the count of occurrences for each model
drive_days = df.groupby('model').size().reset_index(name='drive_days')

# Sort the dataframe by 'drive_days' in descending order
drive_days = drive_days.sort_values(by='drive_days', ascending=False).reset_index(drop=True)

drive_days

Unnamed: 0,model,drive_days
0,ST12000NM0007,2954794
1,ST4000DM000,1989331
2,ST8000NM0055,1294339
3,HGST HMS5C4040BLE640,1172724
4,ST8000DM002,888712
5,HGST HMS5C4040ALE640,313365
6,HGST HUH721212ALN604,259370
7,ST6000DX000,135832
8,TOSHIBA MG07ACA14TA,109378
9,ST10000NM0086,108555


In [54]:
# Total drive days across all models (which is also the total rows)
total_drive_days = len(df)
print(f'Total drive days across all models: {total_drive_days}')

Total drive days across all models: 9522894


### Creating a table that has the number of failures for each model.

### Written in SQL like this:

CREATE TABLE failures AS
    SELECT model, count(*) AS failures
    FROM drive_stats
    WHERE failure = 1
    GROUP BY model;

In [55]:
failures = df[df['failure'] == 1].groupby('model').size().reset_index(name='failures')
failures = failures.sort_values(by='failures', ascending=False).reset_index(drop=True)
failures

Unnamed: 0,model,failures
0,ST12000NM0007,174
1,ST4000DM000,105
2,ST8000NM0055,57
3,ST8000DM002,27
4,TOSHIBA MQ01ABF050,13
5,HGST HMS5C4040BLE640,11
6,ST500LM030,8
7,HGST HUH721212ALN604,4
8,TOSHIBA MQ01ABF050M,3
9,HGST HUH728080ALE600,3


In [56]:
# Calculating the total failures
total_failures = (df['failure'] == 1).sum()
print(f'Total failures: {total_failures}')

# Calculating the total unique days
total_unique_days = df['date'].nunique()
print(f'Total unique days: {total_unique_days}')

print(f'Total drive days: {total_drive_days}')

Total failures: 418
Total unique days: 90
Total drive days: 9522894


### So we have 428 drive failures in 9,577,046 drive days of operation.
### The daily failure rate is the drive failures / drive days.
### The annual failure rate would be the daily failure rate * 365 (assuming the rest of the year would have similar results to the first 3 months)

In [57]:
daily_failure_rate = (total_failures / total_drive_days) * 100
annual_failure_rate = daily_failure_rate * 365
print(f'Daily failure rate is {daily_failure_rate}%')
print(f'Annual failure rate is {annual_failure_rate}%')

Daily failure rate is 0.004389421955132547%
Annual failure rate is 1.6021390136233797%


### Creating a table that has the number of drives for each model as of January 31st 2019

### Written in SQL like this:

CREATE TABLE model_count AS
    SELECT model, count(*) AS count
    FROM drive_stats
    WHERE date = '2019-01-31'
    GROUP BY model;

In [58]:
# Convert 'date' column to datetime type (if not already)
df['date'] = pd.to_datetime(df['date'])

# Filter rows for specific date and calculate the number of drives for each model
model_count = df[df['date'] == '2019-01-31'].groupby('model').size().reset_index(name='count')

# Sort the dataframe by 'count' in descending order
model_count = model_count.sort_values(by='count', ascending=False).reset_index(drop=True)
model_count

Unnamed: 0,model,count
0,ST12000NM0007,32245
1,ST4000DM000,22854
2,ST8000NM0055,14383
3,HGST HMS5C4040BLE640,12880
4,ST8000DM002,9875
5,HGST HMS5C4040ALE640,3653
6,HGST HUH721212ALN604,2475
7,ST6000DX000,1524
8,TOSHIBA MG07ACA14TA,1220
9,ST10000NM0086,1207


### On a specific day, (e.g. 1st Jan, feb, mar) how many hard drives are there for each model

In [59]:
# model_count_jan = df[df['date'] == '2019-01-01'].groupby('model').size().reset_index(name='1st Jan')
# model_count_feb = df[df['date'] == '2019-02-01'].groupby('model').size().reset_index(name='1st Feb')
# model_count_mar = df[df['date'] == '2019-03-01'].groupby('model').size().reset_index(name='1st Mar')
# 
# # Merge the DataFrames on the 'model' column
# model_count = pd.merge(model_count_jan, model_count_feb, on='model', how='outer')
# model_count = pd.merge(model_count, model_count_mar, on='model', how='outer')
# 
# # Fill NaN values with 0 (models that didn't have data for a specific month)
# model_count = model_count.fillna(0)
# 
# # Sort the dataframe by '1st Jan' in descending order
# model_count = model_count.sort_values(by='1st Jan', ascending=False).reset_index(drop=True)
# 
# model_count

### Join the tables together and compute the annual failure rate
### drive_years = drive_days / 365
### Annual failure rate = (number of failures / number of drive years) * 100

### Written in SQL like this:

CREATE TABLE failure_rates AS
    SELECT drive_days.model AS model,
           drive_days.drive_days AS drive_days,
           failures.failures AS failures, 
           100.0 * (1.0 * failures) / (drive_days / 365.0) AS annual_failure_rate
    FROM drive_days, failures, model_count
    WHERE drive_days.model = failures.model
      AND model_count.model = failures.model
    ORDER BY model;

In [60]:
drive_days

Unnamed: 0,model,drive_days
0,ST12000NM0007,2954794
1,ST4000DM000,1989331
2,ST8000NM0055,1294339
3,HGST HMS5C4040BLE640,1172724
4,ST8000DM002,888712
5,HGST HMS5C4040ALE640,313365
6,HGST HUH721212ALN604,259370
7,ST6000DX000,135832
8,TOSHIBA MG07ACA14TA,109378
9,ST10000NM0086,108555


In [61]:
failures

Unnamed: 0,model,failures
0,ST12000NM0007,174
1,ST4000DM000,105
2,ST8000NM0055,57
3,ST8000DM002,27
4,TOSHIBA MQ01ABF050,13
5,HGST HMS5C4040BLE640,11
6,ST500LM030,8
7,HGST HUH721212ALN604,4
8,TOSHIBA MQ01ABF050M,3
9,HGST HUH728080ALE600,3


In [62]:
model_count

Unnamed: 0,model,count
0,ST12000NM0007,32245
1,ST4000DM000,22854
2,ST8000NM0055,14383
3,HGST HMS5C4040BLE640,12880
4,ST8000DM002,9875
5,HGST HMS5C4040ALE640,3653
6,HGST HUH721212ALN604,2475
7,ST6000DX000,1524
8,TOSHIBA MG07ACA14TA,1220
9,ST10000NM0086,1207


In [63]:
# Merge dataframes
# The on='model' argument in the pd.merge function indicates that the merge should be performed 
# based on the model column, and only rows with matching model values in both DataFrames will 
# be included in the result.
merged_df = pd.merge(drive_days, failures, on='model')
merged_df = pd.merge(merged_df, model_count, on='model')
merged_df

Unnamed: 0,model,drive_days,failures,count
0,ST12000NM0007,2954794,174,32245
1,ST4000DM000,1989331,105,22854
2,ST8000NM0055,1294339,57,14383
3,HGST HMS5C4040BLE640,1172724,11,12880
4,ST8000DM002,888712,27,9875
5,HGST HMS5C4040ALE640,313365,2,3653
6,HGST HUH721212ALN604,259370,4,2475
7,ST6000DX000,135832,1,1524
8,ST10000NM0086,108555,3,1207
9,HGST HUH728080ALE600,93598,3,1045


In [64]:
# Calculate annual failure rate
failure_rates = merged_df.copy()
failure_rates['drive_years'] = failure_rates['drive_days'] / 365
failure_rates['annual_failure_rate (%)'] = (failure_rates['failures'] / failure_rates['drive_years']) * 100
failure_rates

Unnamed: 0,model,drive_days,failures,count,drive_years,annual_failure_rate (%)
0,ST12000NM0007,2954794,174,32245,8095.326027,2.149388
1,ST4000DM000,1989331,105,22854,5450.221918,1.926527
2,ST8000NM0055,1294339,57,14383,3546.134247,1.607384
3,HGST HMS5C4040BLE640,1172724,11,12880,3212.942466,0.342365
4,ST8000DM002,888712,27,9875,2434.827397,1.108908
5,HGST HMS5C4040ALE640,313365,2,3653,858.534247,0.232955
6,HGST HUH721212ALN604,259370,4,2475,710.60274,0.562902
7,ST6000DX000,135832,1,1524,372.142466,0.268714
8,ST10000NM0086,108555,3,1207,297.410959,1.008705
9,HGST HUH728080ALE600,93598,3,1045,256.432877,1.169897


### Testing out rule 2 on table 6 from https://www.kdd.org/kdd2016/papers/files/adf0849-botezatuA.pdf
### What it says is that if the hard drive is a Seagate model and the smart_197_raw sensor has a value >= 2, then the drive should be replaced with 100% confidence rate.
### After some research, it seems that the models that start with 'ST' are the Seagate models

In [70]:
filtered_df = df[(df['model'].str.startswith('ST') & (df['smart_197_raw'] >= 2))]
filtered_df[['date', 'model', 'failure', 'smart_197_raw']].tail(10)

Unnamed: 0,date,model,failure,smart_197_raw
9575257,2019-03-31,ST12000NM0007,0,8.0
9575543,2019-03-31,ST8000DM002,0,16.0
9575590,2019-03-31,ST12000NM0007,0,8.0
9575732,2019-03-31,ST8000NM0055,0,8.0
9575797,2019-03-31,ST12000NM0007,0,24.0
9576059,2019-03-31,ST12000NM0007,0,8.0
9576061,2019-03-31,ST8000NM0055,0,8.0
9576466,2019-03-31,ST8000NM0055,0,8.0
9576491,2019-03-31,ST12000NM0007,0,8.0
9576565,2019-03-31,ST8000NM0055,0,8.0


In [78]:
filtered_df.shape

(45593, 33)

### The failures show 0 above because when a hard drive has failed, it gets restored and goes back in operation and the 'failure' will go back to 0. So checking the end date will not do

# Finding out the first time these specific models fail

In [79]:
# Convert 'date' to datetime format if not already
filtered_df['date'] = pd.to_datetime(filtered_df['date'])  

# Group by 'model' and find the first occurrence of failure using idxmin()
first_failure_index = filtered_df[filtered_df['failure'] == 1].groupby('model')['date'].idxmin()

# Filter the original DataFrame using the indices of the first failure
first_failure_df = filtered_df.loc[first_failure_index]
first_failure_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['date'] = pd.to_datetime(filtered_df['date'])


Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_193_normalized,smart_193_raw,smart_194_normalized,smart_194_raw,smart_197_normalized,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw
2483822,2019-01-24,PL2331LAGPKGSJ,HGST HMS5C4040ALE640,4000787030016,1,99.0,196608.0,100.0,554.0,100.0,...,100.0,589.0,230.0,26.0,100.0,32.0,100.0,0.0,200.0,0.0
752883,2019-01-08,PL2331LAH9R55J,HGST HMS5C4040BLE640,4000787030016,1,100.0,0.0,100.0,441.0,100.0,...,100.0,234.0,171.0,35.0,100.0,56.0,100.0,0.0,200.0,0.0
8085177,2019-03-17,8HJM5LKH,HGST HUH721212ALN604,12000138625024,1,100.0,0.0,100.0,0.0,100.0,...,100.0,107.0,181.0,33.0,100.0,11.0,100.0,0.0,200.0,0.0
1633112,2019-01-16,ZA21CC6L,ST10000NM0086,10000831348736,1,83.0,196290136.0,99.0,0.0,100.0,...,100.0,653.0,27.0,27.0,100.0,16.0,100.0,16.0,200.0,0.0
1074106,2019-01-10,ZCH06MRJ,ST12000NM0007,12000138625024,1,75.0,30439440.0,96.0,0.0,100.0,...,99.0,3363.0,35.0,35.0,100.0,40.0,100.0,40.0,200.0,0.0
71537,2019-01-01,S300ZREZ,ST4000DM000,4000787030016,1,97.0,137747432.0,92.0,0.0,100.0,...,83.0,35093.0,25.0,25.0,100.0,24.0,100.0,24.0,200.0,0.0
9055269,2019-03-27,ZDEABF9Y,ST500LM030,500107862016,1,58.0,78059592.0,100.0,0.0,100.0,...,100.0,10.0,32.0,32.0,100.0,16.0,100.0,16.0,200.0,0.0
6805009,2019-03-06,Z4D05TMX,ST6000DX000,6001175126016,1,118.0,180663320.0,89.0,0.0,100.0,...,7.0,187255.0,25.0,25.0,100.0,8.0,100.0,8.0,200.0,0.0
813957,2019-01-08,ZA13ZS5A,ST8000DM002,8001563222016,1,69.0,233845112.0,91.0,0.0,100.0,...,92.0,16213.0,38.0,38.0,100.0,8.0,100.0,8.0,200.0,0.0
413731,2019-01-04,ZA18145R,ST8000NM0055,8001563222016,1,82.0,161723602.0,92.0,0.0,100.0,...,95.0,10426.0,47.0,47.0,100.0,8.0,100.0,8.0,200.0,0.0


### Only 10 of the 49,593 drives failed within the first 3 months of 2019 that were Seagate drives and had a smart_197_raw value of 2 or more. So it seems like rule 2 on Table 6 is inaccurate.