### 01. 항공권 가격예측
- 타겟 : price
- 평가 : RMSE
- 제출 : result.csv (pred)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Flight-price dataset: the train split carries the `price` target, the test split does not.
flight_train_url = 'https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/flight_train.csv'
flight_test_url = 'https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/flight_test.csv'

train = pd.read_csv(flight_train_url)
test = pd.read_csv(flight_test_url)

# Quick sanity check: row/column counts and the first three rows of each split.
print(train.shape, test.shape)
display(train.head(3), test.head(3))

(10505, 11) (4502, 10)


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,Vistara,UK-776,Kolkata,Evening,one,Late_Night,Delhi,Economy,6.58,31,7056
1,Vistara,UK-852,Bangalore,Morning,zero,Morning,Mumbai,Business,1.92,37,20760
2,Indigo,6E-2348,Delhi,Evening,one,Late_Night,Bangalore,Economy,5.58,25,3671


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,Vistara,UK-778,Kolkata,Afternoon,one,Morning,Chennai,Business,18.58,35
1,Air_India,AI-764,Delhi,Evening,one,Late_Night,Bangalore,Economy,8.92,35
2,Air_India,AI-569,Chennai,Early_Morning,one,Evening,Bangalore,Economy,12.17,13


In [2]:
#################### EDA #######################################################
# Most variables other than duration and days_left are object dtype.
# No missing values.
# flight (flight number) has 1,000+ categories and the categories differ
# between train and test --> dropped.
################################################################################

# 1. Separate the target and drop the high-cardinality `flight` column
y = train['price']
X_train = train.drop(['price', 'flight'], axis=1)
X_test = test.drop('flight', axis=1)

# Standard-scale the numeric columns.
# The scaler is fitted on the training data only; the test data is transformed
# with the *same* mean/std so that both splits live in the same feature space.
# (Re-fitting on test would scale it with different statistics than the model saw.)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_cols = ['duration', 'days_left']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 2. One-hot encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
# Encoding the two splits separately can yield different column sets/orders.
# Align test to the training columns: categories unseen in train are dropped,
# categories missing from test become all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_test.shape)

# Hold out a validation set (default 75/25 split)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y, random_state=42)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

# Model and evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
model = RandomForestRegressor(random_state=120)
model.fit(X_train, y_train)

model_pred = model.predict(X_valid)
score = root_mean_squared_error(y_valid, model_pred)
print('RMSE = ', score)

###################### result ######################
# 1st rmse = 3804.941982412195
# 2nd (remove flight) =  4316.896945959934
# 3rd (numeric cols standard scaling) = 4317.575302583664
####################################################

(10505, 37) (4502, 37)
(7878, 37) (2627, 37) (7878,) (2627,)
RMSE =  4317.575302583664


-----
### 02. 노트북 가격예측
- 타겟 : Price
- 평가 : R2
- 제출 : result.csv (pred)

In [52]:
# Laptop-price dataset: the train split carries the `Price` target, the test split does not.
laptop_train_url = 'https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/laptop_train.csv'
laptop_test_url = 'https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/laptop_test.csv'

train = pd.read_csv(laptop_train_url)
test = pd.read_csv(laptop_test_url)

# Quick sanity check: row/column counts and the first three rows of each split.
print(train.shape, test.shape)
display(train.head(3), test.head(3))

(91, 10) (39, 9)


Unnamed: 0,Brand,Model,Series,Processor,Processor_Gen,RAM,Hard_Disk_Capacity,OS,Rating,Price
0,ASUS,VivoBook,15.0,i3,10th,8.0,512 GB SSD,Windows 11 Home,4.3,37940
1,DELL,Inspiron,,i3,11th,8.0,1 TB HDD,Windows 11 Home,3.7,39040
2,ASUS,VivoBook,15.0,i7,10th,16.0,512 GB SSD,Windows 11 Home,4.1,57940


Unnamed: 0,Brand,Model,Series,Processor,Processor_Gen,RAM,Hard_Disk_Capacity,OS,Rating
0,DELL,Vostro,,i3,10th,8.0,256 GB SSD,Windows 10 Home,4.3
1,Lenovo,IdeaPad,3.0,i3,10th,8.0,256 GB SSD,Windows 11 Home,4.3
2,HP,,,i5,11th,8.0,512 GB SSD,Windows 11 Home,4.4


In [53]:
###### EDA #######################################################################################
# Every variable except RAM and Rating is object dtype.
# Many missing values; Series in particular is more than 30% missing.
# Several object columns have different category sets in train and test.
##################################################################################################

# 1. Split off the target and discard the mostly-empty Series column
y = train['Price']
train = train.drop(columns=['Price', 'Series'])
test = test.drop(columns=['Series'])
print(train.shape, test.shape)

# 2. Missing values and encoding.
#    Train and test are stacked first so both splits receive identical dummy columns.
df = pd.concat([train, test])
cat_cols = df.select_dtypes('object').columns
num_cols = ['RAM']

# Sentinel fills: 'X' for categorical columns, -1 for the listed numeric columns.
fill_values = {name: 'X' for name in cat_cols}
fill_values.update({name: -1 for name in num_cols})
df = df.fillna(fill_values)

# One-hot encoding (a LabelEncoder variant was previously left commented out here).
df = pd.get_dummies(df)

# Undo the stacking: the first len(train) rows are train, the remainder is test.
n_train = len(train)
train = df.iloc[:n_train]
test = df.iloc[n_train:]

# 3. Hold out 20% of the training rows for validation
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    train, y, test_size=0.2, random_state=42
)

# 4. Model and evaluation (R2 on the validation split)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
model = RandomForestRegressor(random_state=120)
model.fit(X_train, y_train)

model_pred = model.predict(X_valid)
score = r2_score(y_valid, model_pred)

print('R2 score = ', score)

(91, 8) (39, 8)
R2 score =  0.7954255089795438


-----
### 03. 중고차 가격예측
- 타겟 : Price
- 평가 : RMSLE
- 제출 : result.csv (pred)

In [54]:
# Used-car price dataset: the train split carries the `Price` target, the test split does not.
car_train_url = 'https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/car_train.csv'
car_test_url = 'https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/car_test.csv'

train = pd.read_csv(car_train_url)
test = pd.read_csv(car_test_url)

# Quick sanity check: row/column counts and the first three rows of each split.
print(train.shape, test.shape)
display(train.head(3), test.head(3))

(6732, 17) (5772, 16)


Unnamed: 0,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,13956,603,LEXUS,RX 450,2015,Jeep,Yes,Hybrid,3.5,143619 km,6.0,Automatic,4x4,04-May,Left wheel,Black,12
1,26108,640,SSANGYONG,REXTON,2013,Jeep,Yes,Diesel,2.0,111307 km,4.0,Automatic,Front,04-May,Left wheel,White,4
2,549,1493,MERCEDES-BENZ,GLE 350,2016,Jeep,Yes,Petrol,3.5,91493 km,6.0,Automatic,Rear,04-May,Left wheel,Black,0


Unnamed: 0,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,730,SSANGYONG,Actyon,2016,Jeep,Yes,Petrol,1.6,70940 km,4.0,Automatic,Front,04-May,Left wheel,Black,4
1,609,TOYOTA,Camry,2018,Sedan,Yes,Hybrid,2.5,32000 km,4.0,Automatic,Front,04-May,Left wheel,Black,12
2,761,TOYOTA,Prius,2010,Hatchback,No,Hybrid,1.8,135797 km,4.0,Automatic,Front,04-May,Left wheel,Red,0


In [80]:
################ EDA ################################################
# Many object-dtype variables; no missing values.
# Category sets differ a lot between train and test.
# Very high cardinality: 55 manufacturers, 864 models.
# Mileage becomes numeric once the " km" suffix is removed.
# Engine volume splits into a displacement number and a Turbo flag.
#####################################################################

# Reload the raw files so this cell can be re-run on its own.
train = pd.read_csv('https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/car_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main/part2/ch8/car_test.csv')

# 1. Separate the target
y = train['Price']
train = train.drop(columns='Price')

# 2. Preprocessing on train+test stacked together
df = pd.concat([train, test])
engine_raw = df['Engine volume']  # original strings, e.g. a number optionally followed by "Turbo"

# Mileage: strip the " km" unit and convert to integer
df['Mileage'] = df['Mileage'].str.replace(" km", "").astype(int)
# Turbo: 1 when the engine string mentions Turbo, else 0
df['Turbo'] = engine_raw.str.contains('Turbo').astype(int)
# Engine volume: keep only the numeric displacement
df['Engine volume'] = engine_raw.str.replace("Turbo", "").astype(float)

# 3. Label-encode every remaining object column (fitted on train+test together,
#    so categories that appear only in test still get a code)
from sklearn.preprocessing import LabelEncoder
cat_cols = df.select_dtypes('object').columns
for col in cat_cols:
    le = LabelEncoder()
    encoded = le.fit_transform(df[col])
    df[col] = encoded

# 4. Undo the stacking, then hold out 20% of train for validation
n_train = len(train)
train = df.iloc[:n_train]
test = df.iloc[n_train:]

from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    train, y, test_size=0.2, random_state=42
)

# 5. Model and evaluation (RMSLE on the validation split)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_log_error
model = RandomForestRegressor(random_state=120)
model.fit(X_train, y_train)

model_pred = model.predict(X_valid)
score = root_mean_squared_log_error(y_valid, model_pred)
print('RMSLE score = ', score)

RMSLE score =  1.0859803442965978


In [82]:
# Submission: predict on the preprocessed test split and write a single
# `pred` column to result.csv (no index column).
pred = model.predict(test)
result = pd.Series(pred, name='pred').to_frame()
result.to_csv('result.csv', index=False)