### 회귀

#### 문제1 | 항공권 가격 예측

In [21]:
# 데이터 : flight_train.csv, flight_test.csv
# 예측값 : price 가격
# 평가 지표 : RMSE

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor ### 주목 ### 랜포도 회귀에 사용 가능한 모델 있음
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

train = pd.read_csv('flight_train.csv')
test = pd.read_csv('flight_test.csv')
print(train.shape, test.shape)

# EDA
display(train.head(2))
# display(test.head(2))
# train.info()
# test.info()
# train['price'].describe()
# 결측값 없고 오브젝트 타입이 많음
# display(train.describe(include='O'))
# display(test.describe(include='O'))

# 전처리
target = train.pop('price')
print(train.shape, test.shape)
df = pd.concat([train,test])
print(df.shape)

# 인코딩
for col_name in df.select_dtypes(include='O').columns.to_list():
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name])
    
# 데이터 분할
train = df.iloc[:len(train)].copy()
test = df.iloc[len(train):].copy()
# display(train.head(2))
# display(test.head(2))
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# 스케일링
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
test = scaler.transform(test)

# 모델 학습 및 검증
# model = SGDRegressor(random_state=0)
model = RandomForestRegressor(random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print(mean_squared_error(y_val, y_pred)**0.5)

# 예측 및 제출
pred = model.predict(test)
result = pd.DataFrame({'pred':pred})
result.to_csv('result.csv', index=False)
# pd.read_csv('result.csv')

(10505, 11) (4502, 10)


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,Vistara,UK-776,Kolkata,Evening,one,Late_Night,Delhi,Economy,6.58,31,7056
1,Vistara,UK-852,Bangalore,Morning,zero,Morning,Mumbai,Business,1.92,37,20760


(10505, 10) (4502, 10)
(15007, 10)
3782.986566193416


#### 문제2 | 노트북 가격 예측

In [23]:
# 데이터 : laptop_train.csv, laptop_test.csv
# 예측값 : price
# 평가 지표: R2 score

In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score

train = pd.read_csv('laptop_train.csv')
test = pd.read_csv('laptop_test.csv')
# print(train.shape, test.shape)

# EDA
display(train.head(2))
# display(test.head(2))
# Hard_Disk_Capacity 분리하면 좋을 것 같음
# 오브젝트 타입 인코딩 필요
# print(train.isna().sum())
# print(test.isna().sum())
# 결측값 있음
# train['Price'].describe()

# 전처리
target = train.pop('Price')
# print(train.shape, test.shape)
df = pd.concat([train, test])
# print(df.shape)

# 결측치 처리
# 결측치 많은 컬럼 삭제
df = df.drop(columns=['Model'])
# 'X'로 채우기
df['RAM'] = df['RAM'].astype(str)
df = df.fillna('X')

# Hard_Disk_Capacity 분리하기
df['Hard_Disk_Capacity_1'] = df['Hard_Disk_Capacity'].str.split(' ').str[:2].str.join(' ')
df['Hard_Disk_Capacity_2'] = df['Hard_Disk_Capacity'].str.split(' ').str[-1]
df = df.drop(columns='Hard_Disk_Capacity')

# 인코딩
for col_name in df.select_dtypes(include='O').columns.to_list():
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name].astype(object))

# 스케일링, 결과보고 다시

# 데이터 분리
train = df.iloc[:len(train)].copy()
test = df.iloc[len(train):].copy()
# print(train.shape, test.shape)
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
# print(x_train.shape, y_train.shape)
# print(x_val.shape, y_val.shape)


# 모델 학습 및 검증
model = RandomForestRegressor(random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print(r2_score(y_val, y_pred))

# 예측 및 제출
pred = model.predict(test)
result = pd.DataFrame({'pred':pred})
result.to_csv('result.csv', index=False)
# pd.read_csv('result.csv')

Unnamed: 0,Brand,Model,Series,Processor,Processor_Gen,RAM,Hard_Disk_Capacity,OS,Rating,Price
0,ASUS,VivoBook,15.0,i3,10th,8.0,512 GB SSD,Windows 11 Home,4.3,37940
1,DELL,Inspiron,,i3,11th,8.0,1 TB HDD,Windows 11 Home,3.7,39040


0.8002583947958681


#### 문제3 | 중고차 가격 예측

In [25]:
# 데이터: car_train.csv, car_test.csv
# 예측할 컬럼: price
# 평가 지표: RMSLE

In [26]:
import pandas as pd

train = pd.read_csv('car_train.csv')
test = pd.read_csv('car_test.csv')
print(train.shape, test.shape)

(6732, 17) (5772, 16)


In [27]:
# EDA

In [28]:
display(train.head(3))
display(test.head(3))

Unnamed: 0,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,13956,603,LEXUS,RX 450,2015,Jeep,Yes,Hybrid,3.5,143619 km,6.0,Automatic,4x4,04-May,Left wheel,Black,12
1,26108,640,SSANGYONG,REXTON,2013,Jeep,Yes,Diesel,2.0,111307 km,4.0,Automatic,Front,04-May,Left wheel,White,4
2,549,1493,MERCEDES-BENZ,GLE 350,2016,Jeep,Yes,Petrol,3.5,91493 km,6.0,Automatic,Rear,04-May,Left wheel,Black,0


Unnamed: 0,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,730,SSANGYONG,Actyon,2016,Jeep,Yes,Petrol,1.6,70940 km,4.0,Automatic,Front,04-May,Left wheel,Black,4
1,609,TOYOTA,Camry,2018,Sedan,Yes,Hybrid,2.5,32000 km,4.0,Automatic,Front,04-May,Left wheel,Black,12
2,761,TOYOTA,Prius,2010,Hatchback,No,Hybrid,1.8,135797 km,4.0,Automatic,Front,04-May,Left wheel,Red,0


In [29]:
# 결측치 확인
print(sum(train.isna().sum()))
print(sum(test.isna().sum()))

0
0


In [30]:
# 데이터 타입 확인
train.info()
test.info()
# 오브젝트 타입이 많아서 인코딩 필요

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6732 entries, 0 to 6731
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             6732 non-null   int64  
 1   Levy              6732 non-null   object 
 2   Manufacturer      6732 non-null   object 
 3   Model             6732 non-null   object 
 4   Prod. year        6732 non-null   int64  
 5   Category          6732 non-null   object 
 6   Leather interior  6732 non-null   object 
 7   Fuel type         6732 non-null   object 
 8   Engine volume     6732 non-null   object 
 9   Mileage           6732 non-null   object 
 10  Cylinders         6732 non-null   float64
 11  Gear box type     6732 non-null   object 
 12  Drive wheels      6732 non-null   object 
 13  Doors             6732 non-null   object 
 14  Wheel             6732 non-null   object 
 15  Color             6732 non-null   object 
 16  Airbags           6732 non-null   int64  


In [31]:
# 타겟값 분포
train['Price'].describe()
# 마이너스 이상치 없음

count      6732.000000
mean      17018.565954
std       17497.072247
min           3.000000
25%        5331.000000
50%       13172.000000
75%       21953.000000
max      228935.000000
Name: Price, dtype: float64

In [32]:
# 전처리

In [33]:
target = train.pop('Price')
print(train.shape, test.shape)

df = pd.concat([train, test])
print(df.shape)

(6732, 16) (5772, 16)
(12504, 16)


In [35]:
# 인코딩
from sklearn.preprocessing import LabelEncoder

for col_name in df.select_dtypes(include='O').columns.to_list():
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name])

In [36]:
# 데이터 나누기
train = df.iloc[:len(train)].copy()
test = df.iloc[len(train):].copy()
print(train.shape, test.shape)

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(6732, 16) (5772, 16)
(5385, 16) (5385,)
(1347, 16) (1347,)


In [37]:
# 스케일링
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
test = scaler.transform(test)

In [38]:
# 모델 학습 및 검증

In [58]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
import lightgbm as lgbm
from sklearn.metrics import mean_squared_log_error
model = RandomForestRegressor(random_state=0)
# model = SGDRegressor(random_state=0)
# model = lgbm.LGBMRegressor(random_state=0, force_row_wise=True)

model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print(mean_squared_log_error(y_val , y_pred))

1.2115452791726906


In [59]:
# 예측 및 제출

In [60]:
pred = model.predict(test)
result = pd.DataFrame({'pred':pred})
result.to_csv('result.csv', index=False)

pd.read_csv('result.csv')

Unnamed: 0,pred
0,38983.35
1,11606.19
2,16466.38
3,73556.18
4,47652.85
...,...
5767,12256.73
5768,6852.63
5769,10620.86
5770,26048.51
