In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import os 

path = os.getcwd()
dataframe = pandas.read_excel(path+'\\dane.xls', header = 9, usecols='K:AM,AT:BF')
test_dataframe = pandas.read_excel(path+'\\dane_test.xlsx', header = 9, usecols='K:AK,AR,AV')

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [4]:
pl_eng_dict = {'Nr źródła': 'Id',
 'Wydzielenia grafitu [mm-2]': 'Graphite emission [mm-2]',
 'Udział wydzieleń grafitu [%]': 'Graphite emission share [%]',
 'Średnica sferoidów [μm]': "Spheroid diameter [μm]",
 'Wielkość sferoidów': 'Spheroid size',
 'Udział perlitu [%]': 'Perlite share [%]',
 'Udział ferrytu [%]': 'Ferrite share [%]',
 'Minimalna grubość ścianki [mm]': 'Min wall thickness [mm]',
 'Temperatura austenityzacji [˚C]': 'Austenitization temp [C]',
 'Czas austenityzacji [min.]': 'Austenitization time [min.]',
 'Temperatura przemiany izotermicznej [˚C]': 'Isothermal process temp [C]',
 'Czas przemiany izotermicznej [min.]': 'Isothermal process time [min.]',
 'Twardość Brinella [HB]': 'HB',
 'Twardość Rockwella [HRC]': 'HRC',
 'Twardość Rockwella [HRA]': 'HRA',
 'Twardość Rockwella [HRB]': 'HRB',
 'Twardość Vickersa [HV]': 'HV',
 'Udarność Charpy [J]': 'Charpy impact strength [J]',
 'Temperatura pomiaru udarności [˚C]': 'Impact test temp [C]',
 'Udział austenitu %': 'Austenite share [%]',
 "Martensite volume fraction Xα'": 'Martensite volume',
 'Retained austenite volume fraction XγR ': 'Retained austenite volume'}

In [5]:
dataframe.rename(columns=pl_eng_dict, inplace=True)
dataframe.head()

Unnamed: 0,Id,C [%],Si [%],S [%],P [%],Mg [%],Mn [%],Ni [%],Cu [%],Mo [%],...,HB,HRC,HRA,HRB,HV,Charpy impact strength [J],Impact test temp [C],Austenite share [%],Martensite volume,Retained austenite volume
0,1,3.4,2.41,0.017,0.015,0.064,0.15,0.001,0.0,0.001,...,,,,,,,,28.7,,
1,1,3.4,2.41,0.017,0.015,0.064,0.15,0.001,0.0,0.001,...,,,,,,,,37.5,,
2,1,3.4,2.41,0.017,0.015,0.064,0.15,0.001,0.0,0.001,...,,,,,,,,32.0,,
3,1,3.4,2.41,0.017,0.015,0.064,0.15,0.001,0.0,0.001,...,,,,,,,,28.5,,
4,1,3.4,2.41,0.017,0.015,0.064,0.15,0.001,0.0,0.001,...,,,,,,,,26.2,,


In [6]:
test_dataframe.rename(columns=pl_eng_dict, inplace=True)
test_dataframe['Min wall thickness [mm]'] = np.nan

df_X, df_y = dataframe.iloc[:, 0:29].copy(), dataframe.iloc[:, 29:].copy()
tdf_X = test_dataframe.copy()

In [7]:
tdf_y = tdf_X[['Rm [MPa]', 'Charpy impact strength [J]']]
tdf_y

Unnamed: 0,Rm [MPa],Charpy impact strength [J]
0,921.0,
1,847.0,
2,812.0,
3,772.0,
4,952.0,
...,...,...
87,953.0,41.0
88,1027.5,21.0
89,1021.4,31.0
90,995.8,35.0


In [8]:
df_X=df_X.drop(['Id', 'Spheroid size'], axis=1)
tdf_X=tdf_X.drop(['Id', 'Rm [MPa]', 'Charpy impact strength [J]'], axis=1)

In [9]:
tdf_X.head()

Unnamed: 0,C [%],Si [%],S [%],P [%],Mg [%],Mn [%],Ni [%],Cu [%],Mo [%],Cr [%],...,Nodularity [%],Perlite share [%],Ferrite share [%],Rm [MPa] (as cast),A5 [%] (as cast),Austenitization temp [C],Austenitization time [min.],Isothermal process temp [C],Isothermal process time [min.],Min wall thickness [mm]
0,3.63,2.52,0.013,0.028,0.0043,0.162,0.024,0.215,0.002,0.011,...,,,,,,925,60,350,60,
1,3.63,2.52,0.013,0.028,0.0043,0.162,0.024,0.215,0.002,0.011,...,,,,,,925,60,375,60,
2,3.63,2.52,0.013,0.028,0.0043,0.162,0.024,0.215,0.002,0.011,...,,,,,,925,60,400,60,
3,3.63,2.52,0.013,0.028,0.0043,0.162,0.024,0.215,0.002,0.011,...,,,,,,925,60,425,60,
4,3.63,2.52,0.013,0.028,0.0043,0.162,0.024,0.215,0.002,0.011,...,,,,,,925,60,350,60,


In [10]:
tdf_X['Min wall thickness [mm]'] = np.nan
tdf_X = tdf_X.replace(to_replace='bd', value=np.nan)
tdf_X.columns

Index(['C [%]', 'Si [%]', 'S [%]', 'P [%]', 'Mg [%]', 'Mn [%]', 'Ni [%]',
       'Cu [%]', 'Mo [%]', 'Cr [%]', 'Al [%]', 'Sn [%]', 'B [%]', 'V [%]',
       'Graphite emission [mm-2]', 'Graphite emission share [%]',
       'Spheroid diameter [μm]', 'Nodularity [%]', 'Perlite share [%]',
       'Ferrite share [%]', 'Rm [MPa] (as cast)', 'A5 [%]     (as cast)',
       'Austenitization temp [C]', 'Austenitization time [min.]',
       'Isothermal process temp [C]', 'Isothermal process time [min.]',
       'Min wall thickness [mm]'],
      dtype='object')

## Imputacja brakujących danych / Missing data imputation

In [11]:
# from sklearn.impute import SimpleImputer

# SI_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# SI_median = SimpleImputer(missing_values=np.nan, strategy='median')


# df_X['C [%]'] = SI_mean.fit_transform(df_X['C [%]'].values.reshape(-1,1))
# tdf_X['C [%]'] = SI_mean.transform(tdf_X['C [%]'].values.reshape(-1,1))

# df_X['Si [%]'] = SI_mean.fit_transform(df_X['Si [%]'].values.reshape(-1,1))
# tdf_X['Si [%]'] = SI_mean.transform(tdf_X['Si [%]'].values.reshape(-1,1))

# df_X['S [%]'] = SI_median.fit_transform(df_X['S [%]'].values.reshape(-1,1))
# tdf_X['S [%]'] = SI_median.transform(tdf_X['S [%]'].values.reshape(-1,1))

# df_X['P [%]'] = SI_median.fit_transform(df_X['P [%]'].values.reshape(-1,1))
# tdf_X['P [%]'] = SI_median.transform(tdf_X['P [%]'].values.reshape(-1,1))

# df_X['Mg [%]'] = SI_median.fit_transform(df_X['Mg [%]'].values.reshape(-1,1))
# tdf_X['Mg [%]'] = SI_median.transform(tdf_X['Mg [%]'].values.reshape(-1,1))

# df_X['Mn [%]'] = SI_median.fit_transform(df_X['Mn [%]'].values.reshape(-1,1))
# tdf_X['Mn [%]'] = SI_median.transform(tdf_X['Mn [%]'].values.reshape(-1,1))

# df_X['Ni [%]'] = SI_median.fit_transform(df_X['Ni [%]'].values.reshape(-1,1))
# tdf_X['Ni [%]'] = SI_median.transform(tdf_X['Ni [%]'].values.reshape(-1,1))


# df_X['Cu [%]'] = SI_median.fit_transform(df_X['Cu [%]'].values.reshape(-1,1))
# tdf_X['Cu [%]'] = SI_median.transform(tdf_X['Cu [%]'].values.reshape(-1,1))


# df_X['Mo [%]'] = SI_median.fit_transform(df_X['Mo [%]'].values.reshape(-1,1))
# tdf_X['Mo [%]'] = SI_median.transform(tdf_X['Mo [%]'].values.reshape(-1,1))


# df_X['Austenitization temp [C]'] = SI_mean.fit_transform(df_X['Austenitization temp [C]'] .values.reshape(-1,1))
# tdf_X['Austenitization temp [C]'] = SI_mean.transform(tdf_X['Austenitization temp [C]'] .values.reshape(-1,1))


# df_X['Austenitization time [min.]'] = SI_median.fit_transform(df_X['Austenitization time [min.]'].values.reshape(-1,1))
# tdf_X['Austenitization time [min.]'] = SI_median.fit_transform(tdf_X['Austenitization time [min.]'].values.reshape(-1,1))

# df_X['Isothermal process temp [C]'] = SI_mean.fit_transform(df_X['Isothermal process temp [C]'].values.reshape(-1,1))
# tdf_X['Isothermal process temp [C]'] = SI_mean.fit_transform(tdf_X['Isothermal process temp [C]'].values.reshape(-1,1))

# df_X['Isothermal process time [min.]'] = SI_median.fit_transform(df_X['Isothermal process time [min.]'].values.reshape(-1,1))
# tdf_X['Isothermal process time [min.]'] = SI_median.fit_transform(tdf_X['Isothermal process time [min.]'].values.reshape(-1,1))

In [12]:
SI_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
SI_median = SimpleImputer(missing_values=np.nan, strategy='median')
iter_imp = IterativeImputer(random_state=0, estimator=RandomForestRegressor())


colt = ColumnTransformer([('mean', SI_mean, ['C [%]', 'Si [%]', 'Austenitization temp [C]', 'Isothermal process temp [C]']), 
                        ('median', SI_median, ['S [%]', 'P [%]', 'Mg [%]', 'Mn [%]', 'Ni [%]', 'Cu [%]', 'Mo [%]', 'Austenitization time [min.]', 'Isothermal process time [min.]']), 
                        ('iter_imp', iter_imp, ['Cr [%]','Al [%]','Sn [%]','B [%]','V [%]','Graphite emission [mm-2]','Graphite emission share [%]','Spheroid diameter [μm]','Nodularity [%]','Perlite share [%]','Ferrite share [%]','Rm [MPa] (as cast)','A5 [%]     (as cast)','Min wall thickness [mm]'])],
                        remainder='passthrough')

colt.set_output(transform="pandas")

In [13]:
df_X_trans = colt.fit_transform(df_X)



In [14]:
tdf_X_trans = colt.transform(tdf_X)

In [15]:
col_list = list(df_X_trans.columns)

## Zmiana nazw kolumn / Renaming columns

In [16]:
res = {col_list[i]: col_list[i] for i in range(len(col_list))}
res

{'mean__C [%]': 'mean__C [%]',
 'mean__Si [%]': 'mean__Si [%]',
 'mean__Austenitization temp [C]': 'mean__Austenitization temp [C]',
 'mean__Isothermal process temp [C]': 'mean__Isothermal process temp [C]',
 'median__S [%]': 'median__S [%]',
 'median__P [%]': 'median__P [%]',
 'median__Mg [%]': 'median__Mg [%]',
 'median__Mn [%]': 'median__Mn [%]',
 'median__Ni [%]': 'median__Ni [%]',
 'median__Cu [%]': 'median__Cu [%]',
 'median__Mo [%]': 'median__Mo [%]',
 'median__Austenitization time [min.]': 'median__Austenitization time [min.]',
 'median__Isothermal process time [min.]': 'median__Isothermal process time [min.]',
 'iter_imp__Cr [%]': 'iter_imp__Cr [%]',
 'iter_imp__Al [%]': 'iter_imp__Al [%]',
 'iter_imp__Sn [%]': 'iter_imp__Sn [%]',
 'iter_imp__B [%]': 'iter_imp__B [%]',
 'iter_imp__V [%]': 'iter_imp__V [%]',
 'iter_imp__Graphite emission [mm-2]': 'iter_imp__Graphite emission [mm-2]',
 'iter_imp__Graphite emission share [%]': 'iter_imp__Graphite emission share [%]',
 'iter_imp__

In [17]:
dict2 = {'mean__C [%]': 'C [%]',
 'mean__Si [%]': 'Si [%]',
 'mean__Austenitization temp [C]': 'Austenitization temp [C]',
 'mean__Isothermal process temp [C]': 'Isothermal process temp [C]',
 'median__S [%]': 'S [%]',
 'median__P [%]': 'P [%]',
 'median__Mg [%]': 'Mg [%]',
 'median__Mn [%]': 'Mn [%]',
 'median__Ni [%]': 'Ni [%]',
 'median__Cu [%]': 'Cu [%]',
 'median__Mo [%]': 'Mo [%]',
 'median__Austenitization time [min.]': 'Austenitization time [min.]',
 'median__Isothermal process time [min.]': 'Isothermal process time [min.]',
 'iter_imp__Cr [%]': 'Cr [%]',
 'iter_imp__Al [%]': 'Al [%]',
 'iter_imp__Sn [%]': 'Sn [%]',
 'iter_imp__B [%]': 'B [%]',
 'iter_imp__V [%]': 'V [%]',
 'iter_imp__Graphite emission [mm-2]': 'Graphite emission [mm-2]',
 'iter_imp__Graphite emission share [%]': 'Graphite emission share [%]',
 'iter_imp__Spheroid diameter [μm]': 'Spheroid diameter [μm]',
 'iter_imp__Nodularity [%]': 'Nodularity [%]',
 'iter_imp__Perlite share [%]': 'Perlite share [%]',
 'iter_imp__Ferrite share [%]': 'Ferrite share [%]',
 'iter_imp__Rm [MPa] (as cast)': 'Rm [MPa] (as cast)',
 'iter_imp__A5 [%]     (as cast)': 'A5 [%]     (as cast)',
 'iter_imp__Min wall thickness [mm]': 'Min wall thickness [mm]'}

In [18]:
df_X_trans.rename(columns=dict2, inplace=True)
df_X_trans.columns

Index(['C [%]', 'Si [%]', 'Austenitization temp [C]',
       'Isothermal process temp [C]', 'S [%]', 'P [%]', 'Mg [%]', 'Mn [%]',
       'Ni [%]', 'Cu [%]', 'Mo [%]', 'Austenitization time [min.]',
       'Isothermal process time [min.]', 'Cr [%]', 'Al [%]', 'Sn [%]', 'B [%]',
       'V [%]', 'Graphite emission [mm-2]', 'Graphite emission share [%]',
       'Spheroid diameter [μm]', 'Nodularity [%]', 'Perlite share [%]',
       'Ferrite share [%]', 'Rm [MPa] (as cast)', 'A5 [%]     (as cast)',
       'Min wall thickness [mm]'],
      dtype='object')

In [19]:
tdf_X_trans.rename(columns=dict2, inplace=True)
tdf_X_trans.columns

Index(['C [%]', 'Si [%]', 'Austenitization temp [C]',
       'Isothermal process temp [C]', 'S [%]', 'P [%]', 'Mg [%]', 'Mn [%]',
       'Ni [%]', 'Cu [%]', 'Mo [%]', 'Austenitization time [min.]',
       'Isothermal process time [min.]', 'Cr [%]', 'Al [%]', 'Sn [%]', 'B [%]',
       'V [%]', 'Graphite emission [mm-2]', 'Graphite emission share [%]',
       'Spheroid diameter [μm]', 'Nodularity [%]', 'Perlite share [%]',
       'Ferrite share [%]', 'Rm [MPa] (as cast)', 'A5 [%]     (as cast)',
       'Min wall thickness [mm]'],
      dtype='object')

## Zapis danych / Saving the transformed data

### Rm

In [20]:
y=df_y.copy()
y_new = y[~y['Rm [MPa]'].isnull()]
wiersze=y_new.index 
x_new=df_X_trans.iloc[wiersze]
x_new['y_RM'] = y_new['Rm [MPa]']

import csv

field = [
    "C [%]",
    "Si [%]",
    "S [%]",
    "P [%]",
    "Mg [%]",
    "Mn [%]",
    "Ni [%]",
    "Cu [%]",
    "Mo [%]",
    "Cr [%]",
    "Al [%]",
    "Sn [%]",
    "B [%]",
    "V [%]",
    "Graphite emission [mm-2]",
    "Graphite emission share [%]",
    "Nodularity [%]",
    "Perlite share [%]",
    "Ferrite share [%]",
    "Rm [MPa] (as cast)",
    "A5 [%]     (as cast)",
    "Min wall thickness [mm]",
    "Austenitization temp [C]",
    "Austenitization time [min.]",
    "Isothermal process temp [C]",
    "Isothermal process time [min.]",
    "y_RM"]
file_name = 'transdata_1.csv'
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(field)
    for index, row in x_new.iterrows():
        values = [row[column] for column in field]
        writer.writerow(values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_new['y_RM'] = y_new['Rm [MPa]']


In [21]:
y=tdf_y.copy()
y_new = y[~y['Rm [MPa]'].isnull()]
wiersze=y_new.index 
x_new=tdf_X_trans.iloc[wiersze]
x_new['y_RM'] = y_new['Rm [MPa]']

import csv

field = [
    "C [%]",
    "Si [%]",
    "S [%]",
    "P [%]",
    "Mg [%]",
    "Mn [%]",
    "Ni [%]",
    "Cu [%]",
    "Mo [%]",
    "Cr [%]",
    "Al [%]",
    "Sn [%]",
    "B [%]",
    "V [%]",
    "Graphite emission [mm-2]",
    "Graphite emission share [%]",
    "Nodularity [%]",
    "Perlite share [%]",
    "Ferrite share [%]",
    "Rm [MPa] (as cast)",
    "A5 [%]     (as cast)",
    "Min wall thickness [mm]",
    "Austenitization temp [C]",
    "Austenitization time [min.]",
    "Isothermal process temp [C]",
    "Isothermal process time [min.]",
    "y_RM"]
file_name = 'train__transdata_1.csv'
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(field)
    for index, row in x_new.iterrows():
        values = [row[column] for column in field]
        writer.writerow(values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_new['y_RM'] = y_new['Rm [MPa]']


### Charpy

In [22]:
y=df_y.copy()
y_new = y[~y['Charpy impact strength [J]'].isnull()]
wiersze=y_new.index 
x_new=df_X_trans.iloc[wiersze]
x_new['y_C'] = y_new['Charpy impact strength [J]']

import csv

field = [
    "C [%]",
    "Si [%]",
    "S [%]",
    "P [%]",
    "Mg [%]",
    "Mn [%]",
    "Ni [%]",
    "Cu [%]",
    "Mo [%]",
    "Cr [%]",
    "Al [%]",
    "Sn [%]",
    "B [%]",
    "V [%]",
    "Graphite emission [mm-2]",
    "Graphite emission share [%]",
    "Nodularity [%]",
    "Perlite share [%]",
    "Ferrite share [%]",
    "Rm [MPa] (as cast)",
    "A5 [%]     (as cast)",
    "Min wall thickness [mm]",
    "Austenitization temp [C]",
    "Austenitization time [min.]",
    "Isothermal process temp [C]",
    "Isothermal process time [min.]",
    "y_C"]
file_name = 'transdata_2.csv'
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(field)
    for index, row in x_new.iterrows():
        values = [row[column] for column in field]
        writer.writerow(values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_new['y_C'] = y_new['Charpy impact strength [J]']


In [23]:
y=tdf_y.copy()
y_new = y[~y['Charpy impact strength [J]'].isnull()]
wiersze=y_new.index 
x_new=tdf_X_trans.iloc[wiersze]
x_new['y_C'] = y_new['Charpy impact strength [J]']

import csv

field = [
    "C [%]",
    "Si [%]",
    "S [%]",
    "P [%]",
    "Mg [%]",
    "Mn [%]",
    "Ni [%]",
    "Cu [%]",
    "Mo [%]",
    "Cr [%]",
    "Al [%]",
    "Sn [%]",
    "B [%]",
    "V [%]",
    "Graphite emission [mm-2]",
    "Graphite emission share [%]",
    "Nodularity [%]",
    "Perlite share [%]",
    "Ferrite share [%]",
    "Rm [MPa] (as cast)",
    "A5 [%]     (as cast)",
    "Min wall thickness [mm]",
    "Austenitization temp [C]",
    "Austenitization time [min.]",
    "Isothermal process temp [C]",
    "Isothermal process time [min.]",
    "y_C"]
file_name = 'train__transdata_2.csv'
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(field)
    for index, row in x_new.iterrows():
        values = [row[column] for column in field]
        writer.writerow(values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_new['y_C'] = y_new['Charpy impact strength [J]']
