In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# File names used by this notebook (relative paths).
original_csv = 'ai4i2020.csv'  # not referenced by any cell shown in this notebook
copied_csv = 'ai4i2020_copy.csv'  # input: loaded below as the clean baseline frame
dirty_csv = 'ai4i2020_dirty.csv'  # output: target of the final to_csv call

In [3]:
# Load the clean working copy; the corruption pipeline further down starts from
# a copy of this frame, so `df` itself stays as the untouched baseline.
df = pd.read_csv(copied_csv)
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
# Baseline dimensions (rows, columns) before any corruption is applied.
df.shape

(10000, 14)

In [5]:
# Baseline dtypes and non-null counts, for comparison with the dirtied frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [6]:
# Baseline summary statistics of the numeric columns (useful later for judging
# the injected negatives and outliers).
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
def insert_missing_values(df, frac=0.05, random_state=None):
    """Return a copy of ``df`` in which randomly chosen cells are set to NaN.

    One uniform number in [0, 1) is drawn per cell; a cell is blanked when its
    draw falls below ``frac``, so every cell is hit independently with
    probability ``frac``. Integer columns that receive a NaN come back as
    float columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    frac : float, default 0.05
        Per-cell probability of being replaced by NaN.
    random_state : int or None, default None
        Seed for a dedicated NumPy generator, making the result reproducible.
        With ``None`` the draws come from NumPy's global random state
        (``np.random.rand``), exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and columns as ``df``.
    """
    if random_state is None:
        draws = np.random.rand(*df.shape)
    else:
        draws = np.random.default_rng(random_state).random(df.shape)
    # True marks the cells that become NaN.
    return df.mask(draws < frac)

In [8]:
def add_duplicated_rows(df, n_dup=67, random_state=None):
    """Append ``n_dup`` randomly chosen rows of ``df`` to its end as exact duplicates.

    Rows are sampled without replacement, so each original row is duplicated
    at most once. The result gets a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    n_dup : int, default 67
        Number of rows to duplicate. ``DataFrame.sample`` rejects a value
        larger than ``len(df)`` because sampling is without replacement.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``. Pass an integer for a reproducible
        choice of rows; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        New frame with ``len(df) + n_dup`` rows.
    """
    duplicates = df.sample(n=n_dup, replace=False, random_state=random_state)
    return pd.concat([df, duplicates], ignore_index=True)

In [9]:
def insert_typos(df, colname, prob=0.05, random_state=None):
    """Randomly corrupt the values of one column with simple formatting typos.

    Each non-missing value is, with probability ``prob``, converted to ``str``
    and passed through one of three corruptions chosen at random:
    lower-casing, a prepended space, or wrapping in double quotes.
    Lower-casing leaves a value unchanged when it has no upper-case
    characters. Missing values are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt. The column is overwritten in place.
    colname : str
        Column to corrupt. If it is absent, ``df`` is returned unchanged,
        matching the other corruption helpers in this notebook.
    prob : float, default 0.05
        Per-value probability of receiving a typo.
    random_state : int or None, default None
        Seed for a private ``random.Random`` instance, making the typos
        reproducible. With ``None`` the module-level ``random`` generator is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if colname not in df.columns:
        return df

    # The `random` module exposes the same random()/choice() API as a Random
    # instance, so it can stand in as the unseeded default.
    rng = random if random_state is None else random.Random(random_state)
    corruptions = [
        lambda text: text.lower(),
        lambda text: ' ' + text,  # leading space
        lambda text: '"' + text + '"',  # stray quoting
    ]

    def typo(value):
        if pd.isna(value):
            return value  # keep missing values missing
        if rng.random() < prob:  # random() is uniform on [0, 1)
            return rng.choice(corruptions)(str(value))
        return value

    df[colname] = df[colname].apply(typo)
    return df

In [10]:
def mix_temp_units(df, colname, frac=0.07, random_state=3):
    """Rewrite a random subset of a Kelvin column as Celsius strings.

    A fraction ``frac`` of the rows is sampled; for those rows the value is
    converted with ``C = K - 273.15``, rounded to two decimals and stored as
    text such as ``"24.95 °C"``. The column therefore ends up with mixed
    numeric and string content.

    Missing values in the sampled rows are left missing. Previously they were
    stringified into ``"nan °C"``, which silently removed them from the set of
    detectable missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt. The column is overwritten in place.
    colname : str
        Temperature column holding Kelvin values. If it is absent, ``df`` is
        returned unchanged.
    frac : float, default 0.07
        Fraction of rows to sample for conversion.
    random_state : int or None, default 3
        Forwarded to ``DataFrame.sample``. Because sampling is deterministic
        for a given seed, calling this on two columns of the same frame with
        the same seed converts the same rows in both columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if colname not in df.columns:
        return df

    sampled = df.sample(frac=frac, random_state=random_state).index
    # Only convert sampled rows that actually hold a value.
    to_convert = df.index.isin(sampled) & df[colname].notna().to_numpy()
    if not to_convert.any():
        return df

    # Kelvin -> Celsius: C = K - 273.15
    celsius = (df.loc[to_convert, colname].astype(float) - 273.15).round(2)

    # Switch the column to object dtype explicitly before writing strings into
    # it, instead of relying on implicit upcasting of a float column.
    df[colname] = df[colname].astype(object)
    df.loc[to_convert, colname] = celsius.astype(str) + " °C"
    return df

In [11]:
def insert_negative_values(df, colname, frac=0.05, random_state=4):
    """Flip the sign of a random subset of a numeric column to negative.

    A fraction ``frac`` of the rows is sampled and the value in ``colname`` is
    replaced by ``-abs(value)``. Values that are already negative keep their
    magnitude, zeros stay zero, and missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt. The column is modified in place.
    colname : str
        Numeric column to corrupt. If it is absent, ``df`` is returned
        unchanged.
    frac : float, default 0.05
        Fraction of rows to sample.
    random_state : int or None, default 4
        Forwarded to ``DataFrame.sample``; the default reproduces the
        previously hard-coded seed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if colname not in df.columns:
        return df
    idx = df.sample(frac=frac, random_state=random_state).index
    df.loc[idx, colname] = -df.loc[idx, colname].astype(float).abs()
    return df

In [12]:
def insert_outliers(df, colname='Rotational speed [rpm]', frac=0.01,
                    random_state=5, offset=2000):
    """Push a random subset of a numeric column far above its normal range.

    A fraction ``frac`` of the rows is sampled and the value in ``colname`` is
    replaced by ``abs(value) + offset``. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt. The column is modified in place.
    colname : str, default 'Rotational speed [rpm]'
        Numeric column to corrupt. If it is absent, ``df`` is returned
        unchanged.
    frac : float, default 0.01
        Fraction of rows to sample.
    random_state : int or None, default 5
        Forwarded to ``DataFrame.sample``; the default reproduces the
        previously hard-coded seed.
    offset : float, default 2000
        Amount added to the absolute value of each sampled entry; the default
        reproduces the previously hard-coded constant.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if colname not in df.columns:
        return df
    idx = df.sample(frac=frac, random_state=random_state).index
    df.loc[idx, colname] = df.loc[idx, colname].astype(float).abs() + offset
    return df

In [13]:
def add_useless_column(df):
    """Attach a constant, information-free 'Temperature Comment' column.

    Every row receives the placeholder '-'. The frame is modified in place
    and the same object is returned.
    """
    column_name, placeholder = 'Temperature Comment', '-'
    df[column_name] = placeholder
    return df

In [14]:
# Seed both global generators so that Restart & Run All reproduces the same
# dirty dataset: the missing-value mask and the unseeded row sampling draw from
# NumPy's global state, the typo injection from the `random` module.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Work on a copy so the clean baseline `df` stays untouched.
df_dirty = df.copy()

# Random NaNs
df_dirty = insert_missing_values(df_dirty)

# Typos in the categorical columns
if 'Product ID' in df_dirty.columns:
    df_dirty = insert_typos(df_dirty, 'Product ID')
if 'Type' in df_dirty.columns:
    df_dirty = insert_typos(df_dirty, 'Type')

# Mix temperature units (Kelvin numbers vs. Celsius strings)
df_dirty = mix_temp_units(df_dirty, colname='Air temperature [K]')
df_dirty = mix_temp_units(df_dirty, colname='Process temperature [K]')

# Insert some negative values where they should not occur
df_dirty = insert_negative_values(df_dirty, colname='Tool wear [min]')

# Insert some outliers where they should not occur
df_dirty = insert_outliers(df_dirty)

# Add a useless column
df_dirty = add_useless_column(df_dirty)

# Add full-row duplicates (done last so the duplicates carry all corruptions)
df_dirty = add_duplicated_rows(df_dirty)

# Preview
print("Nowy rozmiar (po dodaniu duplikatów):", df_dirty.shape)
df_dirty.head(10)

Nowy rozmiar (po dodaniu duplikatów): (10067, 15)


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Temperature Comment
0,,M14860,M,24.95 °C,35.45 °C,1551.0,42.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
1,2.0,L47181,L,298.2,308.7,1408.0,46.3,3.0,0.0,,0.0,0.0,0.0,0.0,-
2,3.0,L47182,L,298.1,308.5,1498.0,49.4,5.0,0.0,0.0,0.0,0.0,0.0,0.0,-
3,4.0,l47183,L,,308.6,1433.0,39.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,-
4,5.0,L47184,L,298.2,308.7,1408.0,40.0,,0.0,0.0,0.0,0.0,0.0,0.0,-
5,6.0,,M,298.1,308.6,1425.0,41.9,11.0,0.0,0.0,0.0,0.0,0.0,0.0,-
6,7.0,L47186,L,298.1,308.6,1558.0,42.4,14.0,0.0,0.0,,0.0,0.0,0.0,-
7,8.0,L47187,L,298.1,,1527.0,40.2,16.0,0.0,,0.0,0.0,0.0,0.0,-
8,9.0,M14868,m,298.3,308.7,1667.0,28.6,18.0,,0.0,0.0,0.0,0.0,0.0,-
9,10.0,M14869,M,298.5,309.0,1741.0,28.0,21.0,0.0,0.0,,0.0,0.0,0.0,-


In [15]:
# Dimensions after corruption: the duplicated rows and the extra comment column
# are included.
df_dirty.shape

(10067, 15)

In [16]:
# Final look at the first rows of the corrupted frame before it is written out.
df_dirty.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Temperature Comment
0,,M14860,M,24.95 °C,35.45 °C,1551.0,42.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
1,2.0,L47181,L,298.2,308.7,1408.0,46.3,3.0,0.0,,0.0,0.0,0.0,0.0,-
2,3.0,L47182,L,298.1,308.5,1498.0,49.4,5.0,0.0,0.0,0.0,0.0,0.0,0.0,-
3,4.0,l47183,L,,308.6,1433.0,39.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,-
4,5.0,L47184,L,298.2,308.7,1408.0,40.0,,0.0,0.0,0.0,0.0,0.0,0.0,-


In [17]:
# Persist the corrupted dataset; index=False keeps the row index out of the CSV.
df_dirty.to_csv(dirty_csv, index=False)