### 01 DAEUN

In [68]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler


# 파일 읽기
input_file = "C:\\Users\\kdecs\\Desktop\\VSCode\\dekmidterms\\1_adults.csv"
# 대부분 파일은 엔코딩 cp949, utf-8을 씀
def read_file(file_path, encoding='utf-8'):
    df = pd.read_csv(file_path, encoding='utf-8')
    df_original = df.copy() # 데이터 원본 보관
    return df, df_original

df, df_original = read_file(input_file)

# 데이터프레임 분석 및 확인
df.info()
df.describe()
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [69]:
#데이터 중복치 제거
def remove_dup(df):
    # 열 기준 중복치 제거
    df = df.drop_duplicates()
    return df

df = remove_dup(df)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [70]:
# 데이터 결측치 개수 확인 - 딕셔너리를 만들어 각 콜럼과 결측치 개수를 맵핑핑
def count_nan(df):
    count_nan_dict = {}
    for col in df:
        count_nan_dict[col] = df[col].isnull().sum()
    return count_nan_dict

nan_dict = count_nan(df)
print(nan_dict)


{'age': np.int64(0), 'workclass': np.int64(0), 'fnlwgt': np.int64(0), 'education': np.int64(0), 'education.num': np.int64(0), 'marital.status': np.int64(0), 'occupation': np.int64(0), 'relationship': np.int64(0), 'race': np.int64(0), 'sex': np.int64(0), 'capital.gain': np.int64(0), 'capital.loss': np.int64(0), 'hours.per.week': np.int64(0), 'native.country': np.int64(0), 'income': np.int64(0)}


In [71]:
# 문자 결측치를 NaN으로 처리하는 코드
def change_nan(df):
    df.replace(['ERROR', 'UNKNOWN', '-', 'N/A', '?'], np.nan, inplace=True)
    return df

df_nodup = change_nan(df)

df_nodup.tail()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [72]:
# 결측치가 너무 많은 행 & 모든 값이 같은 행, 그리고 모든 값이 다른 행 (ID등) 제거
def remove_nan_col(df, target):
    col_remove = []

    for col in df:
        emp_row = df[col].isna().sum() # 행의 결측치 개수
        total_row = df.shape[0] 
        unique_no = df[col].nunique() # 행의 고유값
        print(col, (emp_row/total_row))
        # 결측치 개수가 총 행의 50%가 넘으면 제거 --> 유의미한 데이터 추출 어려움
        if ((emp_row / total_row) * 100) >= 50:
            col_remove.append(col)
        # 행의 모든 값이 똑같은 경우도 제거
        elif unique_no == 1:
            col_remove.append(col)
        # 행의 모든 값이 다를 경우도 제거 --> ID와 같은 값들임
        elif ((unique_no/total_row) * 100) >= 99:
            col_remove.append(col)
    df_filtered = df.drop(columns=col_remove)
    df_filtered = df_filtered.drop(target, axis=1)
    df_target = df[target]
    # 제거된 행들의 이름을 담은 리스트도 리턴
    return df_filtered, col_remove, df_target

df, col_remove, df_target = remove_nan_col(df_nodup, 'income')
# print(col_remove)
# df.head()
# df. columns
df_target.head()

age 0.0
workclass 0.05642806650889756
fnlwgt 0.0
education 0.0
education.num 0.0
marital.status 0.0
occupation 0.05664320619602299
relationship 0.0
race 0.0
sex 0.0
capital.gain 0.0
capital.loss 0.0
hours.per.week 0.0
native.country 0.017887328272428313
income 0.0


0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: income, dtype: object

In [73]:
# 행 분류 (연속형, 범주형) & 데이터타입 변환
def num_cat_col_classifier(df, threshold):
    num_col_list = []  # 연속형
    cat_col_list = []  # 범주형 

    for col in df.columns:
        no_unique = df[col].nunique()
        print(col, no_unique)
        # 행 이름에 날짜가 들어가면 날짜값으로 변환 시도 + 연속형 행에 추가
        if "date" in col.lower():
                df[col] = pd.to_datetime(df[col], errors='raise')
                # 날짜값을 인코딩시 부정확한 데이터가 생성되어 1970-01-01을 기준으로 일수로 변환
                # 날짜값 사이의 기간을 유의미한 데이터로 변환 가능
                df[col] = (df[col] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')
                num_col_list.append(col)  
        #고유값이 지정한 개수보다 많으면 연속형 범주로 포함 --> 데이터를 보고 threshold 설정
        elif no_unique >= threshold:
            try:
                # 숫자인데 데이터타입이 float가 아닌 경우 숫자로 변환
                df[col] = pd.to_numeric(df[col], errors='raise') 
                num_col_list.append(col)
            except Exception:
                # 숫자 변환에 에러 발생 시 멈추고 범주형 데이터에 추가
                cat_col_list.append(col)
        else:
            cat_col_list.append(col)

    return num_col_list, cat_col_list

num_list, cat_list = num_cat_col_classifier(df, 4)
num_df = df[num_list]
cat_df = df[cat_list]

print(num_df.head())
num_df.info()

print(cat_df.head())

print(num_df.dtypes)
print(cat_df.dtypes)



age 73
workclass 8
fnlwgt 21648
education 16
education.num 16
marital.status 7
occupation 14
relationship 6
race 5
sex 2
capital.gain 119
capital.loss 92
hours.per.week 94
native.country 41
   age  fnlwgt  education.num  capital.gain  capital.loss  hours.per.week
0   90   77053              9             0          4356              40
1   82  132870              9             0          4356              18
2   66  186061             10             0          4356              40
3   54  140359              4             0          3900              40
4   41  264663             10             0          3900              40
<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32537 non-null  int64
 1   fnlwgt          32537 non-null  int64
 2   education.num   32537 non-null  int64
 3   capital.gain    32537 non-null  int64
 4   

In [74]:
le = LabelEncoder()

#범주형 데이터 인코딩 함수
def label_cat_col(df):
    for col in df:
        df.loc[:, col] = df[col].fillna(df[col].mode()[0])
        # 고유값이 지정한 값보다 많은 경우 labelencoder 실행 --> 차원의 저주 방지
        if df[col].nunique() >= 5:
            df.loc[:, col] = le.fit_transform(df[col])
        # 고유값의 개수가 적으면 onehot encoding 실행 
        else:
            dummies = pd.get_dummies(df[col], prefix=col).astype(int)
            df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
    return df

cat_df = label_cat_col(cat_df)
print(cat_df.head())
print(cat_df)
print(num_df.isna().sum())



  workclass education marital.status occupation relationship race  \
0         3        11              6          9            1    4   
1         3        11              6          3            1    4   
2         3        15              6          9            4    2   
3         3         5              0          6            4    4   
4         3        15              5          9            3    4   

  native.country  sex_Female  sex_Male  
0             38           1         0  
1             38           1         0  
2             38           1         0  
3             38           1         0  
4             38           1         0  
      workclass education marital.status occupation relationship race  \
0             3        11              6          9            1    4   
1             3        11              6          3            1    4   
2             3        15              6          9            4    2   
3             3         5              0       

In [75]:
# 이상치 탐지 & 제거 - IQR 방식
def check_outlier(df, threshold=1.5):
    for col in df:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        # IQR 값으로 생성한 상한선과 하한선을 데이터에 clip으로 적용
        df[col].clip(lower_bound, upper_bound)
    return df

num_df = check_outlier(num_df)
print(num_df.head())


   age  fnlwgt  education.num  capital.gain  capital.loss  hours.per.week
0   90   77053              9             0          4356              40
1   82  132870              9             0          4356              18
2   66  186061             10             0          4356              40
3   54  140359              4             0          3900              40
4   41  264663             10             0          3900              40


In [76]:
# 데이터 형태에 따라 정규화 혹은 표준화 적용
# 연속형 데이터 정규화
def num_normalized(df):
    for col in df:
        mms = MinMaxScaler()
        df.loc[:, col] = mms.fit_transform(df[[col]])
    return df

normal_num_df = num_normalized(num_df)
print(normal_num_df)


# 연속형 데이터 표준화
"""
def num_standardized(df):
    for col in df:
        ss = StandardScaler()
        df.loc[:, col] = ss.fit_transform(df[[col]])
    return df
standard_num_df = num_standardized(num_df)
print(standard_num_df)
"""

            age    fnlwgt  education.num  capital.gain  capital.loss  \
0      1.000000  0.043987       0.533333           0.0      1.000000   
1      0.890411  0.081896       0.533333           0.0      1.000000   
2      0.671233  0.118021       0.600000           0.0      1.000000   
3      0.506849  0.086982       0.200000           0.0      0.895317   
4      0.328767  0.171404       0.600000           0.0      0.895317   
...         ...       ...            ...           ...           ...   
32556  0.068493  0.202298       0.600000           0.0      0.000000   
32557  0.136986  0.166404       0.733333           0.0      0.000000   
32558  0.315068  0.096500       0.533333           0.0      0.000000   
32559  0.561644  0.094827       0.533333           0.0      0.000000   
32560  0.068493  0.128499       0.533333           0.0      0.000000   

       hours.per.week  
0            0.397959  
1            0.173469  
2            0.397959  
3            0.397959  
4            0.

  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])


'\ndef num_standardized(df):\n    for col in df:\n        ss = StandardScaler()\n        df.loc[:, col] = ss.fit_transform(df[[col]])\n    return df\nstandard_num_df = num_standardized(num_df)\nprint(standard_num_df)\n'

In [77]:
# 범주형과 연속형 결합하기
def add_dfs(df1, df2):
    final_df = pd.concat([df1, df2], axis=1)
    return final_df

final_df = add_dfs(normal_num_df, cat_df)
final_df.head()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass,education,marital.status,occupation,relationship,race,native.country,sex_Female,sex_Male
0,1.0,0.043987,0.533333,0.0,1.0,0.397959,3,11,6,9,1,4,38,1,0
1,0.890411,0.081896,0.533333,0.0,1.0,0.173469,3,11,6,3,1,4,38,1,0
2,0.671233,0.118021,0.6,0.0,1.0,0.397959,3,15,6,9,4,2,38,1,0
3,0.506849,0.086982,0.2,0.0,0.895317,0.397959,3,5,0,6,4,4,38,1,0
4,0.328767,0.171404,0.6,0.0,0.895317,0.397959,3,15,5,9,3,4,38,1,0


In [None]:
# CSV파일로 저장
def save_csv(df):
    #final_df = df.to_csv(f'output_file.csv', index=False)
    return final_df

save_csv(final_df)


In [82]:
# 위 모든 함수를 실행하는 함수 생성
def some_function(input_file):
    df, df_original = read_file(input_file)
    df = remove_dup(df)
    df_nodup = change_nan(df)
    df, col_remove, df_target = remove_nan_col(df_nodup, 'income')
    num_list, cat_list = num_cat_col_classifier(df, 4)
    num_df = df[num_list]
    cat_df = df[cat_list]
    cat_df = label_cat_col(cat_df)
    num_df = check_outlier(num_df)
    normal_num_df = num_normalized(num_df)
    # standard_num_df = num_standardized(num_df)
    num_col_df = add_dfs(normal_num_df, cat_df)
    final_df = add_dfs(num_col_df, df_target)
    output_file = save_csv(final_df)
    return output_file

output_file = some_function(input_file)

age 0.0
workclass 0.05642806650889756
fnlwgt 0.0
education 0.0
education.num 0.0
marital.status 0.0
occupation 0.05664320619602299
relationship 0.0
race 0.0
sex 0.0
capital.gain 0.0
capital.loss 0.0
hours.per.week 0.0
native.country 0.017887328272428313
income 0.0
age 73
workclass 8
fnlwgt 21648
education 16
education.num 16
marital.status 7
occupation 14
relationship 6
race 5
sex 2
capital.gain 119
capital.loss 92
hours.per.week 94
native.country 41


  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])


In [83]:
input_file = "C:\\Users\\kdecs\\Desktop\\VSCode\\dekmidterms\\1_adults.csv"
output_file = some_function(input_file)


age 0.0
workclass 0.05642806650889756
fnlwgt 0.0
education 0.0
education.num 0.0
marital.status 0.0
occupation 0.05664320619602299
relationship 0.0
race 0.0
sex 0.0
capital.gain 0.0
capital.loss 0.0
hours.per.week 0.0
native.country 0.017887328272428313
income 0.0
age 73
workclass 8
fnlwgt 21648
education 16
education.num 16
marital.status 7
occupation 14
relationship 6
race 5
sex 2
capital.gain 119
capital.loss 92
hours.per.week 94
native.country 41


  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])
  df.loc[:, col] = mms.fit_transform(df[[col]])


In [15]:
"""
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('label_encoder', LabelEncoder())
])
col_transfomer = ColumnTransformer(transformers = [
    ('num_pipeline', num_pipeline, num_col_classifier(df)),
    ('cat_pipeline', cat_pipeline, cat_col_classifier(df))
])

steps = [
    ('remove_dup', FunctionTransformer(remove_dup, validate=True)),
    ('count_nan', FunctionTransformer(count_nan, validate=True)),
    ('remove_nan_col', FunctionTransformer(remove_nan_col, validate=True)),
    ('col_classifier', FunctionTransformer(num_col_classifier, validate=True)),
    ('col_classifier', FunctionTransformer(cat_col_classifier, validate=True)),
    ('column_transform', col_transfomer)
]

pipe_final = Pipeline(steps)
preprocessed_df = pipe_final.fit_transform(df)

output_file = 'preprocessed_data.csv'
preprocessed_df.to_csv(output_file, index=False)

print("Preprocessed DataFrame:")
print(preprocessed_df)
print(f"\nSaved to {output_file}")
"""

'\nnum_pipeline = Pipeline(steps=[\n    (\'impute\', SimpleImputer(strategy=\'median\')),\n    (\'scale\', StandardScaler())\n])\n\ncat_pipeline = Pipeline(steps=[\n    (\'impute\', SimpleImputer(strategy=\'most_frequent\')),\n    (\'label_encoder\', LabelEncoder())\n])\ncol_transfomer = ColumnTransformer(transformers = [\n    (\'num_pipeline\', num_pipeline, num_col_classifier(df)),\n    (\'cat_pipeline\', cat_pipeline, cat_col_classifier(df))\n])\n\nsteps = [\n    (\'remove_dup\', FunctionTransformer(remove_dup, validate=True)),\n    (\'count_nan\', FunctionTransformer(count_nan, validate=True)),\n    (\'remove_nan_col\', FunctionTransformer(remove_nan_col, validate=True)),\n    (\'col_classifier\', FunctionTransformer(num_col_classifier, validate=True)),\n    (\'col_classifier\', FunctionTransformer(cat_col_classifier, validate=True)),\n    (\'column_transform\', col_transfomer)\n]\n\npipe_final = Pipeline(steps)\npreprocessed_df = pipe_final.fit_transform(df)\n\noutput_file = \'pre