In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

### Data finding

In [2]:
DATA_DIR = "../data/welding_wave"
filenames = os.listdir(DATA_DIR)
df_list = []
type_list = []
for filename in filenames:
    hz = filename.split("_")[0]
    gap_id = filename.split("_")[1]
    gap_type = gap_id[:4]
#     welding_order = gap_id.split("mm")[-1].split(".")[0]
    
    type_list.append([hz, gap_type])
    df_list.append(
        pd.read_csv(os.path.join(DATA_DIR, filename), delimiter="\t", header=None,
                   names =["time_order", "ampere", "volt"] ))

In [3]:
modifed_df_list = []
for df, (hz, gap_type)in zip(df_list, type_list):
    df["hz"] = hz
    df["gap_type"] = gap_type
    df["gap_type"] = gap_type
    
    modifed_df_list.append(df)
all_df = pd.concat(modifed_df_list).reset_index(drop=True)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X_df = all_df.copy()
X_df
X_df["hz"] = X_df.hz.replace({"2000Hz" : 0, "4000Hz" : 1})
X_df = X_df[['ampere', 'volt', 'hz']]

le = preprocessing.LabelEncoder()
le.fit(all_df.gap_type.unique())
y_df = all_df.gap_type


X = X_df.values
y = le.transform(all_df.gap_type.values) 
X_dev, X_test, y_dev, y_test = train_test_split(X, y)

In [5]:
X_df = all_df.copy()

shift_big_list = []
rolling_big_list = []
# window_size = 11
window_size = 11

for hz in ["2000Hz", "4000Hz"]:
    for gap_type in ["Gap0", "Gap2", "Gap4"]:
        target_df = X_df[(X_df["hz"] == hz) & (X_df["gap_type"] == gap_type)][['ampere', 'volt']]
        shift_small_list = []
        rolling_small_list = []
        for i in range(1, window_size):
            names = target_df.columns
            df = target_df.shift(i)
            df.columns = [name + "_shift_" + str(i) for name in names]
            shift_small_list.append(df)

            af = target_df.rolling(i).mean()
            af.columns = [name + "_rolling_" + str(i) for name in names]
            rolling_small_list.append(af)

        shift_big_list.append(pd.concat(shift_small_list, axis=1))
        rolling_big_list.append(pd.concat(rolling_small_list, axis=1))
        

X_df = pd.merge(X_df, pd.concat(shift_big_list).dropna(), how="inner" , left_index=True, right_index=True)
X_df = pd.merge(X_df, pd.concat(rolling_big_list).dropna(), how="inner" , left_index=True, right_index=True)


X_df["hz"] = X_df.hz.replace({"2000Hz" : 0, "4000Hz" : 1})
X_df["log_am"] = np.log(X_df['ampere'] + 100)
X_df["log_volt"] = np.log(X_df['volt'] + 100)



le = preprocessing.LabelEncoder()
le.fit(X_df.gap_type.unique())
y = le.transform(X_df.pop("gap_type").values) 
X = X_df.values

X_dev, X_test, y_dev, y_test = train_test_split(X, y)

In [6]:
def all_column(X):
    return X

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

In [None]:
def all_column(X):
    return X

In [24]:

combined_features = FeatureUnion([
    ("pca", PCA()),
    ("svd", TruncatedSVD()),
    ("all", FunctionTransformer(all_column)),
])

In [None]:
pipeline = Pipeline([("features", combined_features), ("rf_clf", RandomForestClassifier())])

param_grid = dict(features__pca__n_components=[1, 2, 3,4,5],
                  features__svd__n_components=[5,10,15,20],
                 )

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring="accuracy")
grid_search.fit(X_dev, y_dev)

### Add time-series analysis

In [8]:
X_df = all_df.copy()

shift_big_list = []
rolling_big_list = []
# window_size = 11
window_size = 11

for hz in ["2000Hz", "4000Hz"]:
    for gap_type in ["Gap0", "Gap2", "Gap4"]:
        target_df = X_df[(X_df["hz"] == hz) & (X_df["gap_type"] == gap_type)][['ampere', 'volt']]
        shift_small_list = []
        rolling_small_list = []
        for i in range(1, window_size):
            names = target_df.columns
            df = target_df.shift(i)
            df.columns = [name + "_shift_" + str(i) for name in names]
            shift_small_list.append(df)

            af = target_df.rolling(i).mean()
            af.columns = [name + "_rolling_" + str(i) for name in names]
            rolling_small_list.append(af)

        shift_big_list.append(pd.concat(shift_small_list, axis=1))
        rolling_big_list.append(pd.concat(rolling_small_list, axis=1))
        

X_df = pd.merge(X_df, pd.concat(shift_big_list).dropna(), how="inner" , left_index=True, right_index=True)
X_df = pd.merge(X_df, pd.concat(rolling_big_list).dropna(), how="inner" , left_index=True, right_index=True)


X_df["hz"] = X_df.hz.replace({"2000Hz" : 0, "4000Hz" : 1})
X_df["log_am"] = np.log(X_df['ampere'] + 100)
X_df["log_volt"] = np.log(X_df['volt'] + 100)



le = preprocessing.LabelEncoder()
le.fit(X_df.gap_type.unique())
y = le.transform(X_df.pop("gap_type").values) 
X = X_df.values

X_dev, X_test, y_dev, y_test = train_test_split(X, y)

In [12]:
from sklearn.decomposition import TruncatedSVD 
from sklearn.linear_model import LogisticRegression

data = X_dev
tsvd = TruncatedSVD(n_components=10)
tsvd.fit(data)
data_embedded = tsvd.transform(data)

get_welding_wave_classification_result(
    LogisticRegression(), data_embedded, y_dev
)

(0.4089639368256062, 0.4087673213925072)

In [None]:
get_welding_wave_classification_result(
    LogisticRegression(), X_dev, y_dev
)

In [20]:
X_dev.shape

(686618, 46)

In [21]:
data_embedded.shape

(686618, 10)

In [22]:
X_full = np.concatenate((X_dev, data_embedded), axis=1)
X_full.shape

(686618, 56)

In [33]:
get_welding_wave_classification_result(
    LogisticRegression(), X_full, y_dev
)

(0.4314225661913836, 0.43141018901845946)

In [26]:
from sklearn.ensemble import RandomForestClassifier 

get_welding_wave_classification_result(
    RandomForestClassifier(n_jobs = 6), 
    X_full, y_dev, n_polynomial=1
)

(0.9907160896335871, 0.640581809420044)

In [27]:
from hpsklearn import HyperoptEstimator, decision_tree
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_full, y_dev)

estim = HyperoptEstimator(classifier=decision_tree('myDT'))
estim.fit(X_train, y_train)

print(estim.score(X_val, y_val))

100%|██████████| 1/1 [00:01<00:00,  1.68s/it, best loss: 0.5034031439029837]
100%|██████████| 1/1 [00:53<00:00, 53.70s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:06<00:00,  6.60s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:08<00:00,  8.03s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:07<00:00,  7.74s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:06<00:00,  6.44s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:07<00:00,  7.58s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:06<00:00,  6.54s/it, best loss: 0.16507918013845602]
100%|██████████| 1/1 [00:12<00:00, 12.09s/it, best loss: 0.0007184954317284165]
100%|██████████| 1/1 [00:05<00:00,  5.57s/it, best loss: 0.0007184954317284165]
0.5359878826716379


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_full, y_dev)

import autosklearn.classification
automl = autosklearn.classification.AutoSklearnClassifier( include_estimators=["random_forest", "xgradient_boosting", ], 
    include_preprocessors=["no_preprocessing", ], 
    seed = 42,ml_memory_limit = 30720,
    resampling_strategy = 'holdout',
    n_jobs=8)
                                                          
automl.fit(X_train.copy(), y_train.copy())
y_hat = automl.predict(X_val)
print("Accuracy score", sklearn.metrics.accuracy_score(y_val, y_hat))

  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)








  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)


In [33]:
from sklearn.metrics import accuracy_score
X_train, X_val, y_train, y_val = train_test_split(X_full, y_dev)

print("Accuracy score", accuracy_score(y_val, y_hat))

NameError: name 'X_full' is not defined

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev)

import autosklearn.classification
automl = autosklearn.classification.AutoSklearnClassifier( ml_memory_limit=30720,)
automl.fit(X_train.copy(), y_train.copy())
y_hat = automl.predict(X_val)

In [29]:
print("Accuracy score", sklearn.metrics.accuracy_score(y_val, y_hat))

NameError: name 'sklearn' is not defined

In [35]:
automl.show_models()

NameError: name 'automl' is not defined