# Drive Mount

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Library Definition

In [31]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Preprocess

In [32]:
X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model.csv"
Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
X_test_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_exam.csv"

X_model = pd.read_csv(X_model_file_path)
Y_model = pd.read_csv(Y_model_file_path)
X_test = pd.read_csv(X_test_file_path)

In [33]:
X_model_test = pd.concat([X_model, X_test], axis=0)
X_model_test.head()

Unnamed: 0,gender,age_code,region_code,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,...,t20220817,t20220818,t20220819,t20220820,t20220821,t20220822,t20220823,t20220824,t20220825,t20220826
0,1,13,7,,,,,,,,...,,,,,,,,,,1.0
1,1,5,1,,,,,1.0,,,...,,,,,,,,,,
2,2,6,2,,,,,,,,...,,,,,,,,,,
3,2,1,1,,,,,,,,...,,,,,,,,,,
4,2,5,1,,,,,,,,...,,,,,,,,,,


In [34]:
Y_model.head()

Unnamed: 0,business
0,0
1,0
2,0
3,0
4,0


## Missing Value Handling

In [35]:
X_model_test.info(verbose = True, null_counts = True)

  X_model_test.info(verbose = True, null_counts = True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 199999
Data columns (total 717 columns):
 #    Column       Non-Null Count    Dtype  
---   ------       --------------    -----  
 0    gender       1000000 non-null  int64  
 1    age_code     1000000 non-null  int64  
 2    region_code  1000000 non-null  int64  
 3    c20220101    118965 non-null   float64
 4    c20220102    114903 non-null   float64
 5    c20220103    183263 non-null   float64
 6    c20220104    164356 non-null   float64
 7    c20220105    180875 non-null   float64
 8    c20220106    158136 non-null   float64
 9    c20220107    154618 non-null   float64
 10   c20220108    111350 non-null   float64
 11   c20220109    109741 non-null   float64
 12   c20220110    195018 non-null   float64
 13   c20220111    167618 non-null   float64
 14   c20220112    162418 non-null   float64
 15   c20220113    159433 non-null   float64
 16   c20220114    166556 non-null   float64
 17   c20220115    117958 non-nu

In [36]:
X_model_test = X_model_test.fillna(0)

assert(X_model_test.isnull().sum().sum() == 0)

## Scaling

In [37]:
columns_scaled = [c for c in X_model_test.columns if c not in ['gender', 'age_code', 'region_code']]
 
scaler = MinMaxScaler()
scaler.fit(X_model_test[columns_scaled])

X_model_test[columns_scaled] = scaler.transform(X_model_test[columns_scaled])
X_model_test.head()

Unnamed: 0,gender,age_code,region_code,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,...,t20220817,t20220818,t20220819,t20220820,t20220821,t20220822,t20220823,t20220824,t20220825,t20220826
0,1,13,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032258
1,1,5,1,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,6,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Imputation

In [38]:
#gender code string encoding
X_model_test['gender'] = X_model_test['gender']-1
X_model_test['gender']

0         0
1         0
2         1
3         1
4         1
         ..
199995    1
199996    0
199997    0
199998    0
199999    1
Name: gender, Length: 1000000, dtype: int64

In [39]:
X_model_test = pd.get_dummies(X_model_test, columns = ['age_code', 'region_code'])
X_model_test.head()

Unnamed: 0,gender,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,c20220108,c20220109,...,region_code_9,region_code_10,region_code_11,region_code_12,region_code_13,region_code_14,region_code_15,region_code_16,region_code_17,region_code_18
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Undersampling

In [40]:
X_model_length = X_model.shape[0]

X_model = X_model_test.iloc[:X_model_length, :]
X_test = X_model_test.iloc[X_model_length:, :]

In [41]:
X_Y_model = pd.concat([X_model, Y_model], axis = 1)
X_Y_model_business =  X_Y_model[X_Y_model['business'] == 1]
X_Y_model_not_business = X_Y_model[X_Y_model['business'] == 0]

X_Y_model_not_business = X_Y_model_not_business.sample(n = X_Y_model_business.shape[0], replace = False, random_state = 100)
X_Y_model_undersampled = pd.concat([X_Y_model_business, X_Y_model_not_business], axis = 0)
X_Y_model_undersampled = X_Y_model_undersampled.sample(frac = 1, replace = False, random_state = 100)

X_model_undersampled = X_Y_model_undersampled.iloc[:, :-1]
Y_model_undersampled = X_Y_model_undersampled.iloc[:, -1]

print(f"X_model sample 개수: {X_model_undersampled.shape[0]}")
print(f"Y_model sample 개수: {Y_model_undersampled.shape[0]}")

X_model sample 개수: 104634
Y_model sample 개수: 104634


In [42]:
X_model_undersampled.to_csv("/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_undersampled.csv")

In [43]:
Y_model_undersampled.to_csv("/content/drive/Shareddrives/Intro-data-science/data/Y_model_undersampled.csv")

In [44]:
X_test.to_csv("/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess.csv")