In [1]:
import pandas as pd
import numpy as np
from numpy.random import randn

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_importance
from lightgbm import LGBMClassifier
import xgboost as xgb
from lightgbm import plot_importance
from lightgbm import plot_importance

from scipy.stats import chi2_contingency
import scipy.stats as stats 
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.api import qqplot, add_constant
from statsmodels.api import Logit
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.proportion import proportions_ztest

from subprocess import call
from IPython.display import Image
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from yellowbrick.cluster import SilhouetteVisualizer

import graphviz
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.inspection import permutation_importance

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import silhouette_samples, silhouette_score
from imblearn.over_sampling import SMOTE

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import warnings
warnings.filterwarnings('ignore')

matplotlib.rc('font', family = 'NanumBarunGothic')
matplotlib.rc('axes', unicode_minus = False)

In [2]:
df_merged = pd.read_csv("/home/piai/merged_with_2.csv", encoding = 'euc-kr')

In [3]:
df_merged.shape

(14760, 57)

In [4]:
df_merged['Ox_Chamber'] = df_merged['Ox_Chamber'].astype(str)
df_merged['Lot_Num'] = df_merged['Lot_Num'].astype(str)
df_merged['Wafer_Num'] = df_merged['Wafer_Num'].astype(str)

df_merged['photo_soft_Chamber'] = df_merged['photo_soft_Chamber'].astype(str)

df_merged['lithography_Chamber'] = df_merged['lithography_Chamber'].astype(str)
df_merged['Wavelength'] = df_merged['Wavelength'].astype(str)

df_merged['Etching_Chamber'] = df_merged['Etching_Chamber'].astype(str)

# Flux480s, Flux840s, RTA_Temp 이산형 확인 .astype('category')
df_merged['Chamber_Num'] = df_merged['Chamber_Num'].astype(str)

df_merged['wafer_defect'] = df_merged['wafer_defect'].astype('category')
df_merged['Line_CD_state'] = df_merged['Line_CD_state'].astype('category')
df_merged['thickness_state'] = df_merged['thickness_state'].astype('category')

In [5]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14760 entries, 0 to 14759
Data columns (total 57 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Ox_Chamber            14760 non-null  object  
 1   type                  14760 non-null  object  
 2   Temp_OXid             14760 non-null  float64 
 3   Vapor                 14760 non-null  object  
 4   ppm                   14760 non-null  float64 
 5   Pressure              14760 non-null  float64 
 6   Oxid_time             14760 non-null  int64   
 7   thickness             14760 non-null  float64 
 8   Lot_Num               14760 non-null  object  
 9   Wafer_Num             14760 non-null  object  
 10  photo_soft_Chamber    14760 non-null  object  
 11  resist_target         14760 non-null  float64 
 12  N2_HMDS               14760 non-null  float64 
 13  pressure_HMDS         14760 non-null  float64 
 14  temp_HMDS             14760 non-null  float64 
 15  te

In [6]:
# 숫자형 피처만 필터링
numerical_features = df_merged.select_dtypes(include=['int64', 'float64']).columns

anova_results = {}
for feature in numerical_features:
    feature_values = df_merged[feature]
    
    p_values = []
    for other_feature in numerical_features:
        if other_feature != feature:
            other_values = df_merged[other_feature]
            try:
                # ANOVA 수행
                f_stat, p_value = f_oneway(feature_values, other_values)
                p_values.append((min(feature, other_feature), max(feature, other_feature), p_value))
            except Exception as e:
                print(f"Error processing feature {feature} with {other_feature}: {e}")
                p_values.append((min(feature, other_feature), max(feature, other_feature), None))
    
    # 피처별 ANOVA 결과를 데이터프레임에 추가
    anova_results[feature] = p_values

# 유의하지 않은 결과만 필터링 (p-value > 0.05)
non_significant_results = []
for feature, results in anova_results.items():
    for f1, f2, p_value in results:
        if p_value is not None and p_value > 0.05:
            non_significant_results.append((f1, f2, p_value))

# 중복 제거 (정렬된 피처 쌍에 대한 중복 제거)
non_significant_results = list(set(non_significant_results))

# 결과를 p-value 기준으로 오름차순 정렬
non_significant_results_sorted = sorted(non_significant_results, key=lambda x: x[2])

# 결과 데이터프레임 생성 및 출력
non_significant_df = pd.DataFrame(non_significant_results_sorted, columns=['Feature 1', 'Feature 2', 'p-value'])
print("\nANOVA - Non-significant Results (p-value > 0.05):")
print(non_significant_df)


ANOVA - Non-significant Results (p-value > 0.05):
  Feature 1          Feature 2   p-value
0    Target  Temp_implantation  0.255976


In [7]:
# 숫자형 피처만 필터링
numerical_features = df_merged.select_dtypes(include=['int64', 'float64']).columns

t_test_results = {}
for feature in numerical_features:
    feature_values = df_merged[feature]
    
    p_values = []
    for other_feature in numerical_features:
        if other_feature != feature:
            other_values = df_merged[other_feature]
            try:
                # t-검정 수행
                t_stat, p_value = ttest_ind(feature_values, other_values, nan_policy='omit')
                p_values.append((min(feature, other_feature), max(feature, other_feature), p_value))
            except Exception as e:
                print(f"Error processing feature {feature} with {other_feature}: {e}")
                p_values.append((min(feature, other_feature), max(feature, other_feature), None))
    
    # 피처별 t-검정 결과를 데이터프레임에 추가
    t_test_results[feature] = p_values

# 유의하지 않은 결과만 필터링 (p-value > 0.05)
non_significant_results = []
for feature, results in t_test_results.items():
    for f1, f2, p_value in results:
        if p_value is not None and p_value > 0.05:
            non_significant_results.append((f1, f2, p_value))

# 중복 제거 (정렬된 피처 쌍에 대한 중복 제거)
non_significant_results = list(set(non_significant_results))

# 결과를 p-value 기준으로 오름차순 정렬
non_significant_results_sorted = sorted(non_significant_results, key=lambda x: x[2])

# 결과 데이터프레임 생성 및 출력
non_significant_df = pd.DataFrame(non_significant_results_sorted, columns=['Feature 1', 'Feature 2', 'p-value'])
print("\nT-test - Non-significant Results (p-value > 0.05) sorted by p-value:")
print(non_significant_df)


T-test - Non-significant Results (p-value > 0.05) sorted by p-value:
  Feature 1          Feature 2   p-value
0    Target  Temp_implantation  0.255976


In [8]:
def cramers_v(confusion_matrix):
    chi2_stat, _, _, _ = chi2_contingency(confusion_matrix)
    n = confusion_matrix.sum().sum()
    k = min(confusion_matrix.shape) - 1
    return np.sqrt(chi2_stat / (n * k))

# 범주형 피처만 필터링
categorical_features = df_merged.select_dtypes(include=['object', 'category']).columns

cramers_v_results = []
for feature in categorical_features:
    feature_values = df_merged[feature].dropna()
    
    for other_feature in categorical_features:
        if other_feature != feature:
            other_values = df_merged[other_feature].dropna()
            
            # 교차표 생성
            contingency_table = pd.crosstab(feature_values, other_values)
            
            if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
                try:
                    # Cramér's V 계산
                    v = cramers_v(contingency_table)
                    cramers_v_results.append((feature, other_feature, v))
                except Exception as e:
                    print(f"Error processing feature {feature} with {other_feature}: {e}")
                    cramers_v_results.append((feature, other_feature, None))
            else:
                cramers_v_results.append((feature, other_feature, None))

# 결과를 Cramér's V 값 기준으로 오름차순 정렬
cramers_v_df = pd.DataFrame(cramers_v_results, columns=['Feature 1', 'Feature 2', 'Cramér\'s V'])
cramers_v_df = cramers_v_df.dropna(subset=['Cramér\'s V'])
cramers_v_df['Feature Pair'] = cramers_v_df.apply(lambda x: tuple(sorted([x['Feature 1'], x['Feature 2']])), axis=1)
cramers_v_df = cramers_v_df.drop_duplicates(subset=['Feature Pair'])
cramers_v_df = cramers_v_df.sort_values(by='Cramér\'s V')

print("\nCramér's V - Results sorted by Cramér's V:")
print(cramers_v_df)


Cramér's V - Results sorted by Cramér's V:
              Feature 1            Feature 2  Cramér's V  \
195       Line_CD_state      thickness_state    0.001908   
79   photo_soft_Chamber          Chamber_Num    0.011383   
78   photo_soft_Chamber      Etching_Chamber    0.011383   
33                Vapor  lithography_Chamber    0.014439   
19                 type  lithography_Chamber    0.014439   
..                  ...                  ...         ...   
65            Wafer_Num          Chamber_Num    0.726066   
15                 type                Vapor    0.999861   
135     Etching_Chamber          Chamber_Num    1.000000   
105             UV_type           Wavelength    1.000000   
165       Error_message         wafer_defect    1.000000   

                              Feature Pair  
195       (Line_CD_state, thickness_state)  
79       (Chamber_Num, photo_soft_Chamber)  
78   (Etching_Chamber, photo_soft_Chamber)  
33            (Vapor, lithography_Chamber)  
19        

In [9]:
df_merged.drop(columns=['Temp_implantation', 'photo_soft_Chamber', 'Line_CD_state', 'Chamber_Num', 'Etching_Chamber', 'Defective_Rate_wafer', 'Error_message', 'Vapor', 'type'], inplace=True)

In [10]:
df_char=df_merged.select_dtypes(include="object")
df_numeric=df_merged.select_dtypes(exclude="object")

scaler = StandardScaler()
np_numeric_scaled = scaler.fit_transform(df_numeric)
df_numeric_scaled = pd.DataFrame(np_numeric_scaled, columns=df_numeric.columns)
df_encoded = pd.get_dummies(df_char, drop_first=True)

df_final = df_numeric_scaled.join(df_encoded)

In [11]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14760 entries, 0 to 14759
Columns: 134 entries, Temp_OXid to Wavelength_436
dtypes: float64(42), uint8(92)
memory usage: 6.0 MB


In [12]:
df_train, df_test = train_test_split(df_final, test_size = 0.3, random_state = 1234) 
print('train data size : {}'.format(df_train.shape)) 
print('test data size : {}'.format(df_test.shape))

train data size : (10332, 134)
test data size : (4428, 134)


In [13]:
# 특성과 타겟 변수 정의
# 여기서는 'target'이 회귀를 수행할 타겟 변수를 나타낸다고 가정합니다.
X = df_merged.drop(columns='Target')  # 'target' 열을 실제 타겟 변수 이름으로 바꿉니다.
y = df_merged['Target']  # 'target' 열을 실제 타겟 변수 이름으로 바꿉니다.

# 상수 항 추가 (절편을 포함시키기 위해)
X = sm.add_constant(X)

# OLS 회귀 모델 생성
model = sm.OLS(y, X)

# 모델 학습
results = model.fit()

# 회귀 분석 결과 출력
print(results.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data). The types seen wereNone and const                   float64
Ox_Chamber               object
Temp_OXid               float64
ppm                     float64
Pressure                float64
Oxid_time                 int64
thickness               float64
Lot_Num                  object
Wafer_Num                object
resist_target           float64
N2_HMDS                 float64
pressure_HMDS           float64
temp_HMDS               float64
temp_HMDS_bake          float64
time_HMDS_bake          float64
spin1                   float64
spin2                   float64
spin3                   float64
photoresist_bake        float64
temp_softbake           float64
time_softbake           float64
lithography_Chamber      object
Line_CD                 float64
UV_type                  object
Wavelength               object
Resolution              float64
Energy_Exposure         float64
Thin F4                 float64
Thin F3                 float64
Thin F2                 float64
Thin F1                 float64
Temp_Etching            float64
Source_Power            float64
Selectivity             float64
Flux60s                 float64
Flux90s                 float64
Flux160s                float64
Flux480s                float64
input_Energy            float64
Furance_Temp            float64
RTA_Temp                  int64
merged_Chamber            int64
path                      int64
wafer_defect           category
Month                     int64
IsWeekend                 int64
thickness_state        category
Defective_Rate_chip     float64
dtype: object. The data was
0        141
1         55
2         96
3        105
4         79
        ... 
14755    134
14756    129
14757     78
14758     60
14759     74
Name: Target, Length: 14760, dtype: int64
and
        const Ox_Chamber  Temp_OXid    ppm  Pressure  Oxid_time  thickness  \
0        1.0          2    1214.33  26.78      0.30        120     712.98   
1        1.0          2     977.98  30.90      0.14        137     714.41   
2        1.0          2    1175.95  31.11      0.25        116     710.27   
3        1.0          2     933.47  31.20      0.29        143     710.62   
4        1.0          2    1140.60  31.38      0.20         76     711.70   
...      ...        ...        ...    ...       ...        ...        ...   
14755    1.0          3    1119.37  27.37      0.13         50     711.92   
14756    1.0          3    1263.81  31.62      0.18        112     696.98   
14757    1.0          3     964.04  38.56      0.08         78     706.11   
14758    1.0          3    1058.80  32.84      0.38         80     707.33   
14759    1.0          3    1046.03  44.90      0.31         89     715.99   

      Lot_Num Wafer_Num  resist_target  ...  input_Energy  Furance_Temp  \
0          13        28          1.426  ...     30795.856         854.0   
1          13        29          0.730  ...     32135.659         895.0   
2          13        30          0.903  ...     31057.876         898.0   
3          13        31          0.510  ...     32140.435         879.0   
4          13        32          1.696  ...     31985.989         882.0   
...       ...       ...            ...  ...           ...           ...   
14755      13        23          1.154  ...     31877.793         914.0   
14756      13        24          0.876  ...     30107.476         899.0   
14757      13        25          0.485  ...     32048.298         903.0   
14758      13        26          1.672  ...     31221.150         879.0   
14759      13        27          1.202  ...     31645.170         890.0   

       RTA_Temp  merged_Chamber  path  wafer_defect  Month  IsWeekend  \
0           154            2111    28             0      1          1   
1           156            2112    29             0      1          1   
2           152            2113    30             0      1          1   
3           155            2111    28             0      1          1   
4           155            2112    29             0      1          1   
...         ...             ...   ...           ...    ...        ...   
14755       155            3222    68             0      6          0   
14756       156            3231    70             0      6          0   
14757       157            3311    73             0      6          0   
14758       155            3322    77             0      6          0   
14759       157            3333    81             0      6          0   

       thickness_state  Defective_Rate_chip  
0                    0               0.0705  
1                    0               0.0275  
2                    0               0.0480  
3                    0               0.0525  
4                    0               0.0395  
...                ...                  ...  
14755                0               0.0670  
14756                1               0.0645  
14757                0               0.0390  
14758                0               0.0300  
14759                0               0.0370  

[14760 rows x 48 columns]
before. After,
[141  55  96 ...  78  60  74]
[[1.0 '2' 1214.33 ... 1 0 0.0705]
 [1.0 '2' 977.98 ... 1 0 0.0275]
 [1.0 '2' 1175.95 ... 1 0 0.048]
 ...
 [1.0 '3' 964.04 ... 0 0 0.039]
 [1.0 '3' 1058.8 ... 0 0 0.03]
 [1.0 '3' 1046.03 ... 0 0 0.037]].