All imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import redshift_connector
import seaborn as sns
import os
from datetime import datetime
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM


Setting the context

In [None]:
# Build a per-run output directory name from the current timestamp plus a
# random suffix, so successive runs do not write into the same folder.
random_number = random.randint(1, 1000)

# Get the current date and time
now = datetime.now()
formatted_now = now.strftime("%Y_%m_%d_%H_%M_%S")
print("Random Number:", random_number)
print("Formatted date and time:", formatted_now)

# Directory (relative to the working directory) that receives this run's CSV files
directory = f'EDA_{formatted_now}_{random_number}'

# exist_ok=True makes creation idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(directory, exist_ok=True)

Read Input Data from CSVs

In [None]:
# Input extracts and the Failed_Drive label assigned to every row of each file.
# The first file holds the failed drives; the remaining files are the monthly
# extracts and are labelled 'NO'.
input_csvs = [
    ("df_data_with_max_poh_failed_drives.csv", 'YES'),
    ("df_data_with_max_poh_april_2021.csv", 'NO'),
    ("df_data_with_max_poh_may_2021.csv", 'NO'),
    ("df_data_with_max_poh_june_2021.csv", 'NO'),
    ("df_data_with_max_poh_july_2021.csv", 'NO'),
    ("df_data_with_max_poh_august_2021.csv", 'NO'),
    ("df_data_with_max_poh_sep_2021.csv", 'NO'),
]

# Read and label every file, then concatenate once. Growing a frame with
# repeated pd.concat calls re-copies the accumulated rows each time, and
# concatenating onto an empty DataFrame is deprecated in recent pandas.
labelled_frames = [
    pd.read_csv(csv_path).assign(Failed_Drive=failed_label)
    for csv_path, failed_label in input_csvs
]
df_data_with_max_poh = pd.concat(labelled_frames)

#cleaning
# Replace every missing value with 0 and renumber the rows 0..n-1, because each
# source file contributes its own 0-based index.
df_data_with_max_poh = df_data_with_max_poh.fillna(0)
df_data_with_max_poh = df_data_with_max_poh.reset_index(drop=True)
df_data_with_max_poh

Adding Failed drives info: save the combined data (including the `Failed_Drive` label) to CSV

In [None]:
# Write the combined frame to a CSV file inside this run's output directory.
file_path = os.path.join(
    directory,
    'df_data_with_max_poh_combined.csv',
)
df_data_with_max_poh.to_csv(path_or_buf=file_path)

Compute `nunique()` to show the number of unique values in each column, and plot it as a percentage of the row count

In [None]:
# Count the distinct values in each column. DataFrame.nunique() returns a
# Series indexed by column name, so the labels are attached with
# rename_axis()/rename(); assigning `.columns` on a Series only sets a stray
# attribute and never reaches the CSV header.
nunique = (
    df_data_with_max_poh.nunique()
    .rename_axis('column')
    .rename('uniquevalues')
)
nunique.to_csv('data_uniqueness.csv')

# Distinct-value count expressed as a percentage of the number of rows.
unique_percentages = (nunique / df_data_with_max_poh.shape[0]) * 100

plt.figure(figsize=(10, 6))
unique_percentages.plot(kind='bar', color='skyblue')
plt.title('Percentage of Unique Values in Each Column')
plt.xlabel('Columns')
plt.ylabel('Percentage of Unique Values')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

EDA

In [None]:
# Summary statistics of the combined frame; the last expression is displayed.
summary_stats = df_data_with_max_poh.describe()
summary_stats

In [None]:
# Distinct-value count per column. The result is a Series, so the axis and
# value labels are set with rename_axis()/rename() (assigning `.columns` on a
# Series has no effect). A later cell writes `nunique` to features_info.csv,
# where these labels become the CSV header.
nunique = (
    df_data_with_max_poh.nunique()
    .rename_axis('column')
    .rename('uniquevalues')
)

# Distinct-value count expressed as a percentage of the number of rows.
unique_percentages = (nunique / df_data_with_max_poh.shape[0]) * 100

plt.figure(figsize=(10, 6))
unique_percentages.plot(kind='bar', color='skyblue')
plt.title('Percentage of Unique Values in Each Column')
plt.xlabel('Columns')
plt.ylabel('Percentage of Unique Values')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Store the per-column unique-value counts next to the other outputs of this run.
file_path = os.path.join(
    directory,
    'features_info.csv',
)
nunique.to_csv(path_or_buf=file_path)

In [None]:
# (rows, columns) of the combined frame; the tuple is displayed as the cell output.
row_count, column_count = df_data_with_max_poh.shape
(row_count, column_count)

Limiting to columns listed on Confluence

In [None]:
# Columns kept for the analysis (the subset listed on Confluence). One name per
# line so no string literal can be broken across a line boundary.
confluence_columns = [
    'power_on_hours',
    'recycle_data_recovered_count',
    'recycle_lpages_read',
    'recycle_lpages_written',
    'recycled_sb_due_to_fault',
    'recycled_sb_due_to_read_disturb_audit_closed_block',
    'recycled_sb_due_to_read_disturb_no_audit_when_only_WL0_written',
    'recycled_sb_due_to_read_disturb_open_block_audit_on_alternate_WL',
    'recycled_sb_due_to_read_error_recovery_deep_retry',
    'recycled_sb_that_received_pre_program_erase_and_had_DI',
    'recycled_sb_that_were_open_and_had_DI',
    'recycled_sb_that_were_open_subset_that_received_pre_program_erase',
    'recycled_sb_that_were_open_when_recycled',
    'a2f_op_read_recovery_count',
    'host_lpages_recovered_with_frame_FP_Worker0',
    'host_lpages_recovered_with_frame_FP_Worker1',
    'host_lpages_recovered_with_read_retries_FP_Worker0',
    'host_lpages_recovered_with_read_retries_FP_Worker1',
    'host_read_recovered_count',
    'read_recovery_fail_count_recipe_01_proceed_to_FRAME',
    'read_recovery_fm0_sl0_ch0',
    'read_recovery_fm0_sl0_ch1',
    'read_recovery_fm0_sl0_ch2',
    'read_recovery_fm0_sl0_ch3',
    'read_recovery_fm0_sl1_ch0',
    'read_recovery_fm0_sl1_ch1',
    'read_recovery_fm0_sl1_ch2',
    'read_recovery_lower_page_lpage_retry_count',
    'read_recovery_middle_page_lpage_retry_count',
    'read_recovery_pMLC_lpage_retry_count',
    'read_recovery_page_count_1st_highest',
    'read_recovery_page_count_2nd_highest',
    'read_recovery_page_number_1st_highest',
    'read_recovery_page_number_2nd_highest',
    'read_recovery_page_number_3rd_highest',
    'read_recovery_retry_attempt_count_during_erase_suspend',
    'read_recovery_success_count_recipe_01_bes_hard_read',
    'read_recovery_success_count_recipe_01_cmd_CF',
    'read_recovery_success_count_recipe_01_dynamic_read_shallow',
    'read_recovery_success_count_recipe_01_soft_read',
    'read_recovery_total_retry_attempt_count',
    'read_recovery_upper_page_lpage_retry_count',
    'glist_grown_blocks_for_frame',
    'glist_grown_blocks_for_program_fail',
    'grown_blocks_for_erase_fail',
    'grown_blocks_for_frame',
    'grown_blocks_for_program_fail',
    'drive_average_ber',
    'lower_page_average_ber',
    'middle_page_average_ber',
    'upper_page_average_ber',
    'ifs_read_error_counts_nand',
    'p0_smart_error_logs',
    'ifs_erase_counts_nand',
    'read_fbc_warning_threshold_exceeded',
    'smart_warning_composite_temperature_time',
    'case_composite_temperature',
    'max_case_composite_temperature',
    'drive_life_remaining',
    'fe_core_voltage',
    'fe_process',
    'fe_temperature_sensor',
    'life_used_percentage_x100',
    'life_used_percentage_x100_internal',
    'p0_smart_safe_shutdowns',
    'num_discharge_tests_run_since_factory',
    'zq_calibration_issued_to_die_count',
    'pfail',
]

# NOTE(review): the cells below read `final_data`, which is not assigned
# anywhere in the visible notebook. It is bound here to the Confluence column
# subset so the notebook runs top to bottom on a fresh kernel — confirm that
# this is the intended frame.
final_data = df_data_with_max_poh[confluence_columns].copy()

# The float format only applies while the context manager is active, so the
# summary is printed inside the `with` block.
with pd.option_context('display.float_format', lambda x: '%.2f' % x):
    df_data_with_max_poh_desc = final_data.describe()
    print (df_data_with_max_poh_desc)

In [None]:
# Largest value found in each column of final_data.
max_values = final_data.max()
print(max_values)

# Bar chart of the per-column maxima, labelled through the returned Axes.
max_axes = max_values.plot(kind='bar')
max_axes.set_title('Maximum Values in Each Column')
max_axes.set_ylabel('Max Value')
max_axes.set_xlabel('Columns')
plt.xticks(rotation=0)  # column names stay horizontal
plt.show()

In [None]:
# Distinct-value count per column of final_data. nunique() returns a Series, so
# the labels are attached with rename_axis()/rename(); assigning `.columns` on
# a Series has no effect.
nuniquef_data = (
    final_data.nunique()
    .rename_axis('column')
    .rename('uniquevalues')
)

# Distinct-value count expressed as a percentage of the number of rows.
unique_percentages = (nuniquef_data / final_data.shape[0]) * 100

plt.figure(figsize=(10, 6))
unique_percentages.plot(kind='bar', color='skyblue')
plt.title('Percentage of Unique Values in Each Column on FINAL DATA')
plt.xlabel('Columns')
plt.ylabel('Percentage of Unique Values')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix of final_data (DataFrame.corr() with its default
# arguments), rendered as an image with a colour scale.
f_data_corr = final_data.corr()
corr_image = plt.imshow(
    f_data_corr,
    cmap='viridis',
    interpolation='nearest',
)
plt.colorbar(corr_image)
plt.title('Correlation Matrix')
plt.show()

Correlation between features

In [None]:
# Annotated heatmap of f_data_corr; both axes are labelled with its column names.
feature_labels = f_data_corr.columns
plt.figure(figsize=(50, 50))
heatmap_axes = sns.heatmap(
    f_data_corr,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    xticklabels=feature_labels,
    yticklabels=feature_labels,
)
heatmap_axes.set_title('Correlation Matrix using data from April-2021 to Sep-2021 and data of failed drives')
plt.show()

In [None]:
# Histograms of final_data via DataFrame.hist: 30 bins, 20x20 figure.
histogram_options = {'bins': 30, 'figsize': (20, 20)}
final_data.hist(**histogram_options)
plt.show()