In [15]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [18]:
import pandas as pd
import numpy as np

In [19]:
# Set random seed for reproducibility
np.random.seed(42)

In [30]:
# Number of production runs
num_runs = 4161

In [37]:
# Generate simulated data with quality issues
data = {
    "Production Run ID": range(1, num_runs + 1),
    "Date": pd.date_range(start='2023-01-01', periods=num_runs, freq='h'),
    "Temperature (°C)": np.random.normal(loc=200, scale=10, size=num_runs),
    "Pressure (Pa)": np.random.normal(loc=5000, scale=200, size=num_runs),
    "Cooling Rate (°C/min)": np.random.normal(loc=5, scale=0.5, size=num_runs),
    "Machine Speed (RPM)": np.random.normal(loc=1500, scale=50, size=num_runs),
    "Raw Material Quality (Score)": np.random.normal(loc=85, scale=5, size=num_runs),
    "Production Output (Units)": np.random.poisson(lam=1000, size=num_runs),
    "Defect Rate (%)": np.random.normal(loc=2, scale=0.5, size=num_runs).clip(0, 100),
    "Downtime (Minutes)": np.random.poisson(lam=30, size=num_runs)
}

# Column Descriptions

* `Production Run ID`: Unique identifier for each production run.
* `Date`: The date of the production run.
* `Temperature (°C)`: Temperature setting during extrusion process.
* `Pressure (Pa)`: Pressure setting during extrusion process.
* `Cooling Rate (°C/min)`: Rate at which the plastic is cooled.
* `Machine Speed (RPM)`: Operating speed of the production machine.
* `Raw Material Quality (Score)`: Quality score of the raw material used.
* `Production Output (Units)`: Number of units produced in the run.
* `Defect Rate (%)`: Percentage of units with defects.
* `Downtime (Minutes)`: Duration of machine downtime during the run.

In [38]:
# Convert relevant columns to float to handle NaN values
for key in ["Temperature (°C)", "Pressure (Pa)", "Cooling Rate (°C/min)", "Machine Speed (RPM)", "Raw Material Quality (Score)", "Defect Rate (%)", "Downtime (Minutes)"]:
    data[key] = data[key].astype(float)

In [39]:
# Introduce missing values
for key in ["Temperature (°C)", "Pressure (Pa)", "Cooling Rate (°C/min)", "Machine Speed (RPM)", "Raw Material Quality (Score)", "Defect Rate (%)", "Downtime (Minutes)"]:
    # Randomly set 5% of values to NaN
    nan_indices = np.random.choice(num_runs, size=int(num_runs * 0.05), replace=False)
    for idx in nan_indices:
        data[key][idx] = np.nan

In [40]:
# Introduce outliers
# Randomly set 2% of values to extreme outliers
outlier_indices = np.random.choice(num_runs, size=int(num_runs * 0.02), replace=False)
for idx in outlier_indices:
    data["Temperature (°C)"][idx] *= 2
    data["Pressure (Pa)"][idx] *= 3
    data["Cooling Rate (°C/min)"][idx] *= 0.1
    data["Machine Speed (RPM)"][idx] *= 1.5
    data["Raw Material Quality (Score)"][idx] *= 0.5
    data["Defect Rate (%)"][idx] *= 5
    data["Downtime (Minutes)"][idx] *= 10

In [41]:
# Introduce inconsistent data entry formats
# Randomly switch units for 1% of values
inconsistent_indices = np.random.choice(num_runs, size=int(num_runs * 0.01), replace=False)
for idx in inconsistent_indices:
    data["Temperature (°C)"][idx] = data["Temperature (°C)"][idx] * 9/5 + 32  # Convert to Fahrenheit
    data["Pressure (Pa)"][idx] = data["Pressure (Pa)"][idx] / 1000  # Convert to kPa

In [42]:
# Create DataFrame
production_data = pd.DataFrame(data)

In [43]:
# Display the first few rows of the dataset
production_data.head()

Unnamed: 0,Production Run ID,Date,Temperature (°C),Pressure (Pa),Cooling Rate (°C/min),Machine Speed (RPM),Raw Material Quality (Score),Production Output (Units),Defect Rate (%),Downtime (Minutes)
0,1,2023-01-01 00:00:00,202.765807,4978.960092,5.832693,1477.709715,87.429532,983,,32.0
1,2,2023-01-01 01:00:00,178.779437,5035.787242,4.50632,1480.537623,82.774781,1020,1.834823,31.0
2,3,2023-01-01 02:00:00,197.269216,4731.804161,5.467583,1558.95934,74.362611,968,1.457479,39.0
3,4,2023-01-01 03:00:00,204.148492,5131.614042,4.944273,1513.837044,76.464372,974,2.219941,35.0
4,5,2023-01-01 04:00:00,197.390749,,4.780475,1472.712423,84.449217,1043,,39.0


In [44]:
production_data.tail()

Unnamed: 0,Production Run ID,Date,Temperature (°C),Pressure (Pa),Cooling Rate (°C/min),Machine Speed (RPM),Raw Material Quality (Score),Production Output (Units),Defect Rate (%),Downtime (Minutes)
4156,4157,2023-06-23 04:00:00,196.876796,,5.489889,1482.076725,86.043356,984,1.594721,24.0
4157,4158,2023-06-23 05:00:00,197.106062,4787.299216,4.803101,1510.250956,86.281271,1070,1.668975,35.0
4158,4159,2023-06-23 06:00:00,194.162314,4891.326855,4.811696,1393.414666,79.343297,1004,2.112016,37.0
4159,4160,2023-06-23 07:00:00,210.275823,5088.102827,4.36326,1553.205264,83.907424,1008,1.987073,24.0
4160,4161,2023-06-23 08:00:00,205.897066,4967.932648,5.268304,1466.779235,79.475221,1035,2.184708,34.0


In [45]:
# Shape of data
production_data.shape

(4161, 10)

In [46]:
production_data.to_csv("/content/drive/MyDrive/data/smart_phone_surface_plastic_manufacture.csv")