In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem at the mount point below
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global random generator so the np.random draws in this notebook
# are reproducible from one run to the next
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Number of production runs to simulate; this is the length of every generated
# column and therefore the number of rows in the final dataset
num_runs: int = 4161

In [None]:
# Generate the clean simulated columns (one array of length num_runs per column).
# Every np.random call below draws from the global NumPy generator, so the order
# of the entries determines which values each column receives: reordering or
# inserting entries changes the generated data even with the same seed.
data = {
    # Identifier and timestamp: sequential IDs starting at 1, hourly timestamps
    # starting at 2023-01-01
    "Production Run ID": range(1, num_runs + 1),
    "Date": pd.date_range(start='2023-01-01', periods=num_runs, freq='h'),
    # Continuous measurements drawn from normal distributions (loc = mean, scale = std dev)
    "Temperature (°C)": np.random.normal(loc=200, scale=10, size=num_runs),
    "Pressure (Pa)": np.random.normal(loc=5000, scale=200, size=num_runs),
    "Cooling Rate (°C/min)": np.random.normal(loc=5, scale=0.5, size=num_runs),
    "Machine Speed (RPM)": np.random.normal(loc=1500, scale=50, size=num_runs),
    "Raw Material Quality (Score)": np.random.normal(loc=85, scale=5, size=num_runs),
    "Humidity (%)": np.random.normal(loc=50, scale=5, size=num_runs),
    "Ambient Temperature (°C)": np.random.normal(loc=25, scale=3, size=num_runs),
    # Count-valued and categorical columns (Poisson counts, uniform random choice of labels)
    "Maintenance (Days Since)": np.random.poisson(lam=30, size=num_runs),
    "Operator Shift": np.random.choice(['Day', 'Night'], size=num_runs),
    "Batch Size (Units)": np.random.poisson(lam=100, size=num_runs),
    "Energy Consumption (kWh)": np.random.normal(loc=50, scale=5, size=num_runs),
    "Production Line": np.random.choice(['Line 1', 'Line 2', 'Line 3'], size=num_runs),
    "Production Output (Units)": np.random.poisson(lam=1000, size=num_runs),
    # Defect rate is clipped to [0, 100] so it stays a valid percentage
    "Defect Rate (%)": np.random.normal(loc=2, scale=0.5, size=num_runs).clip(0, 100),
    "Downtime (Minutes)": np.random.poisson(lam=30, size=num_runs)
}

# Column Descriptions

* `Production Run ID`: Unique identifier for each production run.
* `Date`: Timestamp of the production run; runs are spaced one hour apart, starting at 2023-01-01 00:00.
* `Temperature (°C)`: Temperature setting during the extrusion process.
* `Pressure (Pa)`: Pressure setting during the extrusion process.
* `Cooling Rate (°C/min)`: Rate at which the plastic is cooled.
* `Machine Speed (RPM)`: Operating speed of the production machine.
* `Raw Material Quality (Score)`: Quality score of the raw material used.
* `Humidity (%)`: The humidity level in the production environment.
* `Ambient Temperature (°C)`: The temperature of the surrounding environment during production.
* `Maintenance (Days Since)`: The number of days since the last maintenance activity on the production equipment.
* `Operator Shift`: Indicates whether the production run occurred during the 'Day' or 'Night' shift.
* `Batch Size (Units)`: The size of the production batch in units, representing the quantity of material processed in one run.
* `Energy Consumption (kWh)`: The amount of energy consumed during the production run.
* `Production Line`: Identifier for the production line (e.g., 'Line 1', 'Line 2', 'Line 3') where the run occurred.
* `Production Output (Units)`: Number of units produced in the run.
* `Defect Rate (%)`: Percentage of units with defects.
* `Downtime (Minutes)`: Duration of machine downtime during the run.

In [None]:
# Cast the numeric measurement columns to float so that NaN can be stored in them
# (the columns produced by np.random.poisson are integer arrays, which cannot hold NaN)
float_columns = (
    "Temperature (°C)",
    "Pressure (Pa)",
    "Cooling Rate (°C/min)",
    "Machine Speed (RPM)",
    "Raw Material Quality (Score)",
    "Humidity (%)",
    "Ambient Temperature (°C)",
    "Maintenance (Days Since)",
    "Batch Size (Units)",
    "Energy Consumption (kWh)",
    "Defect Rate (%)",
    "Downtime (Minutes)",
)
data.update({name: data[name].astype(float) for name in float_columns})

In [None]:
# Introduce missing values
# For each numeric column, an independent random 5% of the rows (sampled without
# replacement) is set to NaN. One np.random.choice call is made per column, in the
# order listed, so the affected rows differ from column to column.
# The arrays must already be float: assigning NaN into an integer array raises.
missing_fraction = 0.05
n_missing = int(num_runs * missing_fraction)  # hoisted: identical for every column
for key in ["Temperature (°C)", "Pressure (Pa)", "Cooling Rate (°C/min)", "Machine Speed (RPM)",
            "Raw Material Quality (Score)", "Humidity (%)", "Ambient Temperature (°C)",
            "Maintenance (Days Since)", "Batch Size (Units)", "Energy Consumption (kWh)",
            "Defect Rate (%)", "Downtime (Minutes)"]:
    nan_indices = np.random.choice(num_runs, size=n_missing, replace=False)
    # Index-array assignment blanks all selected rows at once instead of looping
    # over them one element at a time in Python
    data[key][nan_indices] = np.nan

In [None]:
# Introduce outliers
# A single random 2% of the rows is selected (without replacement) and the SAME rows
# are scaled in every column listed below, so an affected run is extreme across all
# of its numeric measurements. Values already set to NaN stay NaN after scaling.
outlier_indices = np.random.choice(num_runs, size=int(num_runs * 0.02), replace=False)

# Multiplier applied to the selected rows of each column
outlier_factors = {
    "Temperature (°C)": 2,
    "Pressure (Pa)": 3,
    "Cooling Rate (°C/min)": 0.1,
    "Machine Speed (RPM)": 1.5,
    "Raw Material Quality (Score)": 0.5,
    "Humidity (%)": 1.5,
    "Ambient Temperature (°C)": 1.5,
    "Maintenance (Days Since)": 5,
    "Batch Size (Units)": 5,
    "Energy Consumption (kWh)": 2,
    "Defect Rate (%)": 5,
    "Downtime (Minutes)": 10,
}
for column, factor in outlier_factors.items():
    # The indices are unique (replace=False), so in-place scaling through an index
    # array touches each selected row exactly once
    data[column][outlier_indices] *= factor

In [None]:
# Introduce inconsistent data entry formats
# A random 1% of the rows is selected (without replacement); in those rows BOTH the
# temperature and the pressure are re-expressed in a different unit while the column
# names keep claiming °C and Pa. The rows are drawn independently of the outlier rows,
# so a row may carry both kinds of issue. NaN entries remain NaN.
inconsistent_indices = np.random.choice(num_runs, size=int(num_runs * 0.01), replace=False)

# Celsius -> Fahrenheit
data["Temperature (°C)"][inconsistent_indices] = (
    data["Temperature (°C)"][inconsistent_indices] * 9 / 5 + 32
)
# Pa -> kPa
data["Pressure (Pa)"][inconsistent_indices] /= 1000

In [None]:
# Assemble the column dictionary (including the injected quality issues) into one table
production_data_with_issues = pd.DataFrame(data=data)

In [None]:
# Preview the first five rows of the generated dataset
production_data_with_issues.head(n=5)

Unnamed: 0,Production Run ID,Date,Temperature (°C),Pressure (Pa),Cooling Rate (°C/min),Machine Speed (RPM),Raw Material Quality (Score),Humidity (%),Ambient Temperature (°C),Maintenance (Days Since),Operator Shift,Batch Size (Units),Energy Consumption (kWh),Production Line,Production Output (Units),Defect Rate (%),Downtime (Minutes)
0,1,2023-01-01 00:00:00,199.587579,5187.346865,4.19702,1467.853751,77.249494,54.419352,22.91629,36.0,Night,104.0,50.311084,Line 1,963,0.925473,25.0
1,2,2023-01-01 01:00:00,224.800678,5214.943372,4.561051,1522.273687,76.70063,56.171668,22.872015,28.0,Night,,46.691092,Line 1,1043,2.0712,37.0
2,3,2023-01-01 02:00:00,224.027162,4856.441687,4.936852,1423.536278,80.54151,49.527377,29.149278,25.0,Day,97.0,41.040091,Line 1,969,2.387125,30.0
3,4,2023-01-01 03:00:00,204.448237,5105.486001,5.156183,1553.104489,85.953493,51.710811,27.805151,24.0,Day,,49.862947,Line 3,956,1.419953,26.0
4,5,2023-01-01 04:00:00,200.254929,4831.980194,5.514289,1467.818958,,56.596582,27.507715,37.0,Day,112.0,47.542142,Line 3,1012,,28.0


In [None]:
# Preview the last five rows of the generated dataset
production_data_with_issues.tail(n=5)

Unnamed: 0,Production Run ID,Date,Temperature (°C),Pressure (Pa),Cooling Rate (°C/min),Machine Speed (RPM),Raw Material Quality (Score),Humidity (%),Ambient Temperature (°C),Maintenance (Days Since),Operator Shift,Batch Size (Units),Energy Consumption (kWh),Production Line,Production Output (Units),Defect Rate (%),Downtime (Minutes)
4156,4157,2023-06-23 04:00:00,189.638452,5151.881571,5.669565,1551.407802,,40.912097,,23.0,Day,82.0,49.278325,Line 1,993,2.347394,30.0
4157,4158,2023-06-23 05:00:00,193.731484,4988.178684,4.647619,1391.685802,76.094594,52.875764,25.623205,20.0,Night,91.0,50.066738,Line 2,1044,2.070173,24.0
4158,4159,2023-06-23 06:00:00,193.582285,4596.611084,5.004593,1598.928917,86.500505,40.742736,,25.0,Night,95.0,46.368262,Line 1,986,1.84811,24.0
4159,4160,2023-06-23 07:00:00,200.747451,5154.837387,6.148276,1488.179566,81.996272,50.838028,23.821743,22.0,Night,101.0,51.086368,Line 1,1007,1.891983,34.0
4160,4161,2023-06-23 08:00:00,190.912541,5017.791949,,1466.632689,84.916164,47.198246,22.993114,29.0,Night,101.0,,Line 3,1017,2.359293,31.0


In [None]:
# Shape of the dataset as (number of rows, number of columns)
production_data_with_issues.shape

(4161, 17)

In [None]:
# Save the generated dataset to Google Drive as CSV.
output_path = Path("/content/drive/MyDrive/data/smart_phone_surface_plastic_manufacture.csv")
# to_csv raises OSError when the target folder does not exist, so create the final
# "data" folder if needed. parents=True is deliberately NOT used: if Drive is not
# mounted, the missing parent makes this fail loudly instead of silently writing
# to the local, ephemeral filesystem.
output_path.parent.mkdir(exist_ok=True)
# With the default index=True the row index is written as an unnamed first column.
production_data_with_issues.to_csv(output_path)