### Importing initial modules

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Display the current working directory; relative file paths are resolved against it.
os.getcwd()

### Changing to the project directory if we are not already there

In [None]:
# Absolute, user-specific location of the project folder. The CSV loaded
# afterwards is read relative to the working directory.
path = '/Users/baari/Desktop/hard-drive-predictive-maintenance'

# Switch only when the folder actually exists. On a machine without this path
# an unconditional os.chdir() raises FileNotFoundError; staying in the current
# directory lets the notebook run when it is started from the project folder.
if os.path.isdir(path):
    os.chdir(path)
else:
    print(f'{path} not found; staying in {os.getcwd()}')
os.getcwd()

In [None]:
# Load Q1_2019.csv (resolved relative to the current working directory) into a DataFrame.
df = pd.read_csv('Q1_2019.csv')

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Peek at the last five rows.
df.tail(n=5)

In [None]:
# How many rows carry each value of the 'failure' column.
df.loc[:, 'failure'].value_counts()

### Handling missing values

In [None]:
# Total number of cells = rows * columns.
total_cells = np.prod([len(df.index), len(df.columns)])
total_cells

In [None]:
# Number of null cells in each column; show the first ten columns only.
missing_values_count = df.isna().sum()
missing_values_count.head(10)

In [None]:
# Add up the per-column null counts to get the number of missing cells overall.
total_missing = missing_values_count.sum()
total_missing

In [None]:
# Share of all cells that are null, expressed as a percentage.
percent_missing = total_missing / total_cells * 100
print(f'{percent_missing}% of the data is missing')

In [None]:
# Snapshot the column labels so the dropped ones can be reported afterwards
columns_before = df.columns

# Drop every column in which all values are null
df = df.dropna(axis='columns', how='all')

columns_after = df.columns

# Labels that were present before the drop but are absent after it
removed_columns = set(columns_before).difference(columns_after)
print(f'Columns removed: {removed_columns}')

In [None]:
# Recompute the share of missing cells on the current dataframe
total_cells = np.prod(df.shape)
total_missing = df.isna().sum().sum()
percent_missing = total_missing / total_cells * 100
print(f'{percent_missing}% of the data is missing')

# Computing failure rates

### Creating a dataframe with the number of drive days for each model. A drive day is one day of operation of one hard drive, so a model's drive days equal the number of rows in the main dataframe for that model.

### Written in SQL like this:

CREATE TABLE drive_days AS 
    SELECT model, count(*) AS drive_days 
    FROM drive_stats 
    GROUP BY model;

In [None]:
# One row per model: count the rows belonging to each model (its drive days),
# then order the models from most to fewest drive days with a fresh 0..n-1 index.
drive_days = (
    df.groupby('model')
    .size()
    .reset_index(name='drive_days')
    .sort_values(by='drive_days', ascending=False)
    .reset_index(drop=True)
)

drive_days

In [None]:
# Every row is one drive day, so the row count is the total across all models
total_drive_days = df.shape[0]
total_drive_days

### Creating a table that has the number of failures for each model.

### Written in SQL like this:

CREATE TABLE failures AS
    SELECT model, count(*) AS failures
    FROM drive_stats
    WHERE failure = 1
    GROUP BY model;

In [None]:
# Keep only the rows flagged as failures, count them per model, and list the
# models with the most failures first.
failures = (
    df[df['failure'] == 1]
    .groupby('model')
    .size()
    .reset_index(name='failures')
    .sort_values(by='failures', ascending=False)
    .reset_index(drop=True)
)
failures

In [None]:
# Number of rows whose 'failure' flag equals 1
total_failures = df['failure'].eq(1).sum()
print(f'Total failures: {total_failures}')

# Number of distinct values in the 'date' column
total_unique_days = df['date'].nunique()
print(f'Total unique days: {total_unique_days}')

# total_drive_days was computed in an earlier cell
print(f'Total drive days: {total_drive_days}')

### So we have 428 drive failures in 9,577,046 drive days of operation.
### The daily failure rate is the drive failures / drive days.
### The annual failure rate would be the daily failure rate * 365 (assuming the rest of the year would have similar results to the first 3 months)

In [None]:
# Failures per drive day, expressed as a percentage.
daily_failure_rate = (total_failures / total_drive_days) * 100
# Scale the daily percentage to a 365-day year.
annual_failure_rate = daily_failure_rate * 365
print(f'Daily failure rate is {daily_failure_rate}%')
print(f'Annual failure rate is {annual_failure_rate}%')

### Creating a table that has the number of drives for each model as of January 31st 2019

### Written in SQL like this:

CREATE TABLE model_count AS
    SELECT model, count(*) AS count
    FROM drive_stats
    WHERE date = '2019-01-31'
    GROUP BY model;

In [None]:
# Make sure 'date' holds datetimes (a no-op if it already does)
df['date'] = pd.to_datetime(df['date'])

# Rows recorded on 2019-01-31, counted per model and ordered from the most
# common model to the least common one
model_count = (
    df[df['date'] == '2019-01-31']
    .groupby('model')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
model_count

### On a specific day (e.g. 1st Jan, 1st Feb, 1st Mar), how many hard drives are there for each model?

In [None]:
# Disabled alternative: per-model drive counts on the 1st of January, February and March.
# NOTE(review): if re-enabled, this rebinds `model_count`, which the merge cell further down reads.
#
# model_count_jan = df[df['date'] == '2019-01-01'].groupby('model').size().reset_index(name='1st Jan')
# model_count_feb = df[df['date'] == '2019-02-01'].groupby('model').size().reset_index(name='1st Feb')
# model_count_mar = df[df['date'] == '2019-03-01'].groupby('model').size().reset_index(name='1st Mar')
# 
# # Merge the DataFrames on the 'model' column
# model_count = pd.merge(model_count_jan, model_count_feb, on='model', how='outer')
# model_count = pd.merge(model_count, model_count_mar, on='model', how='outer')
# 
# # Fill NaN values with 0 (models that didn't have data for a specific month)
# model_count = model_count.fillna(0)
# 
# # Sort the dataframe by '1st Jan' in descending order
# model_count = model_count.sort_values(by='1st Jan', ascending=False).reset_index(drop=True)
# 
# model_count

### Join the tables together and compute the annual failure rate
### drive_years = drive_days / 365
### Annual failure rate = (number of failures / number of drive years) * 100

### Written in SQL like this:

CREATE TABLE failure_rates AS
    SELECT drive_days.model AS model,
           drive_days.drive_days AS drive_days,
           failures.failures AS failures, 
           100.0 * (1.0 * failures) / (drive_days / 365.0) AS annual_failure_rate
    FROM drive_days, failures, model_count
    WHERE drive_days.model = failures.model
      AND model_count.model = failures.model
    ORDER BY model;

In [None]:
# Display the per-model drive-day table before joining.
drive_days

In [None]:
# Display the per-model failure table before joining.
failures

In [None]:
# Display the per-model drive-count table before joining.
model_count

In [None]:
# Combine the three per-model tables on the 'model' column.
# Both joins are inner joins, so a model appears in the result only if it has
# a row in drive_days, in failures and in model_count.
merged_df = (
    drive_days
    .merge(failures, on='model', how='inner')
    .merge(model_count, on='model', how='inner')
)
merged_df

In [None]:
# Work on a new dataframe so merged_df stays untouched.
# drive_years = drive_days / 365
failure_rates = merged_df.assign(drive_years=merged_df['drive_days'] / 365)
# Annual failure rate (%) = failures per drive year * 100
failure_rates['annual_failure_rate (%)'] = (
    failure_rates['failures'].div(failure_rates['drive_years']).mul(100)
)
failure_rates

### Testing rule 2 of Table 6 from https://www.kdd.org/kdd2016/papers/files/adf0849-botezatuA.pdf
### The rule does not seem to hold on this data

In [None]:
# Rows whose smart_197_raw value is at least 2; show the three columns of
# interest for the first ten of them.
filtered_df = df.loc[df['smart_197_raw'].ge(2)]
filtered_df.loc[:, ['model', 'failure', 'smart_197_raw']].head(10)