# 📊 IoT Dataset – EDA Notebook

In [5]:

# --- Imports ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



# --- Load Dataset ---

In [6]:

# Location of the raw CSV export, relative to the notebook -- adjust if needed.
csv_path = '../data/smart_manufacturing_data.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) size, then preview the first rows as the cell output.
print("Shape:", df.shape)
df.head()



Shape: (100000, 13)


Unnamed: 0,timestamp,machine_id,temperature,vibration,humidity,pressure,energy_consumption,machine_status,anomaly_flag,predicted_remaining_life,failure_type,downtime_risk,maintenance_required
0,2025-01-01 00:00:00,39,78.61,28.65,79.96,3.73,2.16,1,0,106,Normal,0.0,0
1,2025-01-01 00:01:00,29,68.19,57.28,35.94,3.64,0.69,1,0,320,Normal,0.0,0
2,2025-01-01 00:02:00,15,98.94,50.2,72.06,1.0,2.49,1,1,19,Normal,1.0,1
3,2025-01-01 00:03:00,43,90.91,37.65,30.34,3.15,4.96,1,1,10,Normal,1.0,1
4,2025-01-01 00:04:00,8,72.32,40.69,56.71,2.68,0.63,2,0,65,Vibration Issue,0.0,1


In [7]:
# --- Basic Info ---
# info() prints the per-column dtype and non-null count directly to stdout.
df.info()

# describe() summarises the numeric columns; keep it as the last expression so
# the notebook renders it as a table.
summary_stats = df.describe()
summary_stats



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   timestamp                 100000 non-null  object 
 1   machine_id                100000 non-null  int64  
 2   temperature               100000 non-null  float64
 3   vibration                 100000 non-null  float64
 4   humidity                  100000 non-null  float64
 5   pressure                  100000 non-null  float64
 6   energy_consumption        100000 non-null  float64
 7   machine_status            100000 non-null  int64  
 8   anomaly_flag              100000 non-null  int64  
 9   predicted_remaining_life  100000 non-null  int64  
 10  failure_type              100000 non-null  object 
 11  downtime_risk             100000 non-null  float64
 12  maintenance_required      100000 non-null  int64  
dtypes: float64(6), int64(5), object(2)
memory usa

Unnamed: 0,machine_id,temperature,vibration,humidity,pressure,energy_consumption,machine_status,anomaly_flag,predicted_remaining_life,downtime_risk,maintenance_required
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,25.49933,75.015625,50.01227,54.995401,3.000405,2.747064,1.00205,0.08916,234.26916,0.089155,0.19697
std,14.389439,10.031884,14.985444,14.43796,1.152399,1.297865,0.446193,0.284976,150.063062,0.284961,0.397711
min,1.0,35.55,-17.09,30.0,1.0,0.5,0.0,0.0,1.0,0.0,0.0
25%,13.0,68.2675,39.97,42.52,2.0,1.63,1.0,0.0,97.0,0.0,0.0
50%,25.0,75.06,49.96,54.98,3.01,2.74,1.0,0.0,230.0,0.0,0.0
75%,38.0,81.75,60.1,67.5,4.0,3.87,1.0,0.0,365.0,0.0,0.0
max,50.0,121.94,113.8,80.0,5.0,5.0,2.0,1.0,499.0,1.0,1.0


In [None]:
# --- Check for Missing Values ---
# Per-column count of null entries.
null_counts = df.isnull().sum()

print("\nMissing values:")
print(null_counts)



In [None]:
# --- Class Balance ---
# The label column is called 'machine_status' in this dataset; 'status' is
# still accepted first so the cell keeps working on files that use that name.
label_col = next(
    (name for name in ('status', 'machine_status') if name in df.columns),
    None,
)

if label_col is None:
    # Say so explicitly instead of silently producing no output.
    print("\nNo status column found; skipping class distribution.")
else:
    print("\nClass distribution:")
    print(df[label_col].value_counts())



In [None]:
# --- Timestamp Check (optional) ---
# Parse the timestamp strings into datetimes and promote the column to the
# frame's index. set_index drops the column, so the membership guard also makes
# a second run of this cell a no-op.
ts_col = 'timestamp'
if ts_col in df.columns:
    df[ts_col] = pd.to_datetime(df[ts_col])
    df.set_index(ts_col, inplace=True)



# --- Visualize Sensor Trends (First 3 Sensors) ---

In [None]:

# Pick the columns to plot. Columns named like "sensor..." are preferred; when
# none exist (as in this dataset) fall back to the physical reading columns
# that are present in the frame.
KNOWN_READING_COLS = ['temperature', 'vibration', 'humidity', 'pressure', 'energy_consumption']

sensor_cols = [col for col in df.columns if 'sensor' in col]
if not sensor_cols:
    sensor_cols = [col for col in KNOWN_READING_COLS if col in df.columns]
sensor_cols = sensor_cols[:3]  # only the first three, to keep the output short

# Colour the lines by machine state when such a column exists; this dataset
# calls it 'machine_status', while 'status' is kept for other files.
hue_col = next(
    (name for name in ('status', 'machine_status') if name in df.columns),
    None,
)

if not sensor_cols:
    # Make the "nothing to draw" case visible instead of an empty cell output.
    print("No sensor columns found; nothing to plot.")

for col in sensor_cols:
    # One figure per reading, plotted against the frame's index.
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.lineplot(data=df, x=df.index, y=col, hue=hue_col, ax=ax)
    ax.set_title(f"{col} over Time")
    ax.set_xlabel("Time")
    ax.set_ylabel("Reading")
    fig.tight_layout()
    plt.show()



In [None]:
# --- Correlation Heatmap (Optional) ---
# Pairwise correlations of the numeric columns only.
corr_matrix = df.corr(numeric_only=True)

# Draw the annotated matrix on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()