# 1. 항공권 가격 예측 

### 1. 베이스라인

In [4]:
import pandas as pd
train = pd.read_csv('flight_train.csv')
test = pd.read_csv('flight_test.csv')

train.shape, test.shape

((10505, 11), (4502, 10))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10505 entries, 0 to 10504
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           10505 non-null  object 
 1   flight            10505 non-null  object 
 2   source_city       10505 non-null  object 
 3   departure_time    10505 non-null  object 
 4   stops             10505 non-null  object 
 5   arrival_time      10505 non-null  object 
 6   destination_city  10505 non-null  object 
 7   class             10505 non-null  object 
 8   duration          10505 non-null  float64
 9   days_left         10505 non-null  int64  
 10  price             10505 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 902.9+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           4502 non-null   object 
 1   flight            4502 non-null   object 
 2   source_city       4502 non-null   object 
 3   departure_time    4502 non-null   object 
 4   stops             4502 non-null   object 
 5   arrival_time      4502 non-null   object 
 6   destination_city  4502 non-null   object 
 7   class             4502 non-null   object 
 8   duration          4502 non-null   float64
 9   days_left         4502 non-null   int64  
dtypes: float64(1), int64(1), object(8)
memory usage: 351.8+ KB


In [7]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [8]:
cols = train.select_dtypes(include = 'object').columns
for col in cols: 
    set_train = set(train[col])
    set_test = set(test[col])
    same = (set_test == set_train)
    if same:
        print(col, "동일")
    else:
        print(col, "비동일")

airline 동일
flight 비동일
source_city 동일
departure_time 동일
stops 동일
arrival_time 동일
destination_city 동일
class 동일


In [9]:
# 데이터 전처리 
target = train.pop('price')

train = train.drop('flight', axis = 1)
test = test.drop('flight', axis = 1)

train = pd.get_dummies(train)
test = pd.get_dummies(test)

# 검증 데이터 나누기 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

# 머신러닝 학습 및 평가 
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

# RMSE
from sklearn.metrics import mean_squared_error
result= mean_squared_error(y_val, pred)
print(result**0.5)

pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result.csv', index = False)

4376.841613585934


### 2. 성능 개선

In [10]:
# 라이브러리 및 데이터 불러오기 
import pandas as pd
train = pd.read_csv('flight_train.csv')
test = pd.read_csv('flight_test.csv')

# 데이터 전처리 
target = train.pop('price')

# flight 칼럼 일부 사용
train['f2'] = train['flight'].str.split('-').str[1].astype(int)
test['f2'] = test['flight'].str.split('-').str[1].astype(int)

train = train.drop('flight', axis = 1)
test = test.drop('flight', axis = 1)

# 스케일링
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
cols = ['duration', 'days_left']
train[cols] = sc.fit_transform(train[cols])
test[cols] = sc.transform(test[cols])

#원-핫 인코딩
train = pd.get_dummies(train)
test = pd.get_dummies(test)

#검증 데이터 나누기 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state =0)

#머신러닝 학습 및 평가 
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth = 20, n_estimators = 200, random_state =0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

#RMSE
from sklearn.metrics import mean_squared_error
score = mean_squared_error(y_val, pred)
print(score**0.5)

pred = rf.predict(test)
submit= pd.DataFrame({'pred' : pred})
submit.to_csv('result.csv', index = False)

3675.155093297134


# 2. 노트북 가격 예측

### 1. 베이스라인

In [39]:
# 라이브러리 및 데이터 불러오기
import pandas as pd
train = pd.read_csv('laptop_train.csv')
test = pd.read_csv('laptop_test.csv')
train.shape, test.shape

((91, 10), (39, 9))

In [40]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Brand               91 non-null     object 
 1   Model               82 non-null     object 
 2   Series              55 non-null     object 
 3   Processor           86 non-null     object 
 4   Processor_Gen       86 non-null     object 
 5   RAM                 85 non-null     float64
 6   Hard_Disk_Capacity  85 non-null     object 
 7   OS                  85 non-null     object 
 8   Rating              91 non-null     float64
 9   Price               91 non-null     int64  
dtypes: float64(2), int64(1), object(7)
memory usage: 7.2+ KB


In [41]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Brand               39 non-null     object 
 1   Model               34 non-null     object 
 2   Series              25 non-null     object 
 3   Processor           37 non-null     object 
 4   Processor_Gen       37 non-null     object 
 5   RAM                 37 non-null     float64
 6   Hard_Disk_Capacity  37 non-null     object 
 7   OS                  37 non-null     object 
 8   Rating              39 non-null     float64
dtypes: float64(2), object(7)
memory usage: 2.9+ KB


In [42]:
train.isnull().sum()

Brand                  0
Model                  9
Series                36
Processor              5
Processor_Gen          5
RAM                    6
Hard_Disk_Capacity     6
OS                     6
Rating                 0
Price                  0
dtype: int64

In [43]:
test.isnull().sum()

Brand                  0
Model                  5
Series                14
Processor              2
Processor_Gen          2
RAM                    2
Hard_Disk_Capacity     2
OS                     2
Rating                 0
dtype: int64

In [44]:
cols = train.select_dtypes(include = 'object').columns
for col in cols:
    train_set = set(train[col])
    test_set = set(test[col])
    same = (train_set == test_set)
    if same : 
        print(col, '동일')
    else:
        print(col, "비동일")

Brand 비동일
Model 비동일
Series 비동일
Processor 비동일
Processor_Gen 비동일
Hard_Disk_Capacity 비동일
OS 비동일


In [45]:
# 데이터 전처리 
target = train.pop('Price')

#결측치 처리(범주형)
c_cols = ['Model', 'Series', 'Processor', 'Processor_Gen', 'Hard_Disk_Capacity', 'OS']
train[c_cols] = train[c_cols].fillna('X')
test[c_cols] = test[c_cols].fillna('X')

#결측치 처리(수치형)
n_cols = ['RAM']
train[n_cols] = train[n_cols].fillna(-1)
test[n_cols] = train[n_cols].fillna(-1)

#원-핫 인코딩
combined = pd.concat([train,test])
combined_dummies = pd.get_dummies(combined)
train = combined_dummies[:len(train)]
test = combined_dummies[len(train):]

#검증 데이터 나누기 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

#모델 학습 및 평가
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

from sklearn.metrics import r2_score
score = r2_score(y_val, pred)
print(score)

pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result2.csv', index = False)

0.6902268393874075


### 2. 성능 개선

In [4]:
# 라이브러리 및 데이터 불어오기
import pandas as pd
train = pd.read_csv('laptop_train.csv')
test = pd.read_csv('laptop_test.csv')

# 데이터 전처리
target = train.pop('Price')

# 결측치 삭제
train = train.drop('Series', axis = 1)
test = test.drop('Series', axis = 1)

# 결측치 삭제
train = train.drop('Model', axis = 1)
test = test.drop('Model', axis =1 )

# 결측치 처리(범주형)
c_cols = ['Processor', 'Processor_Gen', 'Hard_Disk_Capacity', 'OS']
train[c_cols] = train[c_cols].fillna("X")
test[c_cols] = test[c_cols].fillna("X")

# 결측치 처리(수치형)
n_cols = ['RAM']
train[n_cols] = train[n_cols].fillna(-1)
test[n_cols] = train[n_cols].fillna(-1)

# 원 - 핫 인코딩
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
train = combined_dummies[:len(train)]
test = combined_dummies[len(train):]

#검증 데이터 나누기
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

#머신러닝 학습 및 평가 
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

from sklearn.metrics import r2_score
score = r2_score(y_val, pred)
print(score)

pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result2.csv', index = False)

0.8042392429064131


# 3. 중고차 가격 예측

### 1. 베이스라인

In [30]:
# 라이브러리 및 데이터 불러오기
import pandas as pd
train = pd.read_csv('car_train.csv')
test = pd.read_csv('car_test.csv')
train.shape, test.shape

((6732, 17), (5772, 16))

In [31]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6732 entries, 0 to 6731
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             6732 non-null   int64  
 1   Levy              6732 non-null   object 
 2   Manufacturer      6732 non-null   object 
 3   Model             6732 non-null   object 
 4   Prod. year        6732 non-null   int64  
 5   Category          6732 non-null   object 
 6   Leather interior  6732 non-null   object 
 7   Fuel type         6732 non-null   object 
 8   Engine volume     6732 non-null   object 
 9   Mileage           6732 non-null   object 
 10  Cylinders         6732 non-null   float64
 11  Gear box type     6732 non-null   object 
 12  Drive wheels      6732 non-null   object 
 13  Doors             6732 non-null   object 
 14  Wheel             6732 non-null   object 
 15  Color             6732 non-null   object 
 16  Airbags           6732 non-null   int64  


In [32]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5772 entries, 0 to 5771
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Levy              5772 non-null   object 
 1   Manufacturer      5772 non-null   object 
 2   Model             5772 non-null   object 
 3   Prod. year        5772 non-null   int64  
 4   Category          5772 non-null   object 
 5   Leather interior  5772 non-null   object 
 6   Fuel type         5772 non-null   object 
 7   Engine volume     5772 non-null   object 
 8   Mileage           5772 non-null   object 
 9   Cylinders         5772 non-null   float64
 10  Gear box type     5772 non-null   object 
 11  Drive wheels      5772 non-null   object 
 12  Doors             5772 non-null   object 
 13  Wheel             5772 non-null   object 
 14  Color             5772 non-null   object 
 15  Airbags           5772 non-null   int64  
dtypes: float64(1), int64(2), object(13)
memory

In [33]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [34]:
cols = train.select_dtypes(include = 'object').columns
for col in cols: 
    set_train = set(train[col])
    set_test = set(test[col])
    same = (set_train == set_test)
    if same : 
        print(col, "카테고리 동일")
    else:
        print(col, "카테고리 비동일")

Levy 카테고리 비동일
Manufacturer 카테고리 비동일
Model 카테고리 비동일
Category 카테고리 동일
Leather interior 카테고리 동일
Fuel type 카테고리 비동일
Engine volume 카테고리 비동일
Mileage 카테고리 비동일
Gear box type 카테고리 동일
Drive wheels 카테고리 동일
Doors 카테고리 동일
Wheel 카테고리 동일
Color 카테고리 동일


In [35]:
# 데이터 전처리
target = train.pop('Price')

#레이블 인코딩
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
combined = pd.concat([train,test])
cols = combined.select_dtypes(include = 'object').columns

for col in cols:
    combined[col] = le.fit_transform(combined[col])
    
train = combined[:len(train)]
test = combined[len(train):]

# 검증 데이터 나누기 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

# 모델 학습 및 평가
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

from sklearn.metrics import mean_squared_log_error
score = mean_squared_log_error(y_val, pred)
print(score**0.5)

pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result3.csv', index = False)

1.1008952910276844


### 2. 성능 개선

In [37]:
train = pd.read_csv('car_train.csv')
train['Engine volume'].value_counts()

Engine volume
2            1342
2.5           823
1.8           623
1.6           533
1.5           453
             ... 
0.8 Turbo       1
3.1             1
4.6 Turbo       1
4.2 Turbo       1
4.8 Turbo       1
Name: count, Length: 91, dtype: int64

In [38]:
train['Mileage'].str.split().str[1].value_counts()

Mileage
km    6732
Name: count, dtype: int64

In [43]:
# 라이브러리 및 데이터 불러오기
import pandas as pd
train = pd.read_csv('car_train.csv')
test = pd.read_csv('car_test.csv')

# 데이터 전처리
target = train.pop('Price')

# Engine  volume 자료형 변경 및 Turbo 칼럼 생성 
train['Turbo'] = train['Engine volume'].str.contains('Turbo').astype(int)
train['Engine volume'] = train['Engine volume'].str.replace('Turbo', '').astype(float)

test['Turbo'] = test['Engine volume'].str.contains('Turbo').astype(int)
test['Engine volume'] = test['Engine volume'].str.replace('Turbo', '').astype(float)

# Mileage 자료형 변경
train['Mileage'] = train['Mileage'].str.split().str[0].astype(int)
test['Mileage'] = test['Mileage'].str.split().str[0].astype(int)

#레이블 인코딩 
combined = pd.concat([train,test])
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = train.select_dtypes(include = 'object').columns
for col in cols:
    combined[col] = le.fit_transform(combined[col])

train = combined[:len(train)]
test = combined[len(train):]

# 검증 데이터 나누기
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

# 머신러닝 학습 및 평가
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 200, random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)
from sklearn.metrics import mean_squared_log_error
score = mean_squared_log_error(y_val, pred) ** 0.5
print(score)

pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result3.csv', index= False)

1.082016203127291
