In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import catboost as cb
from sklearn.model_selection import train_test_split
from scipy import stats



In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# csv to parquet

In [3]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [4]:
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train Done.
test Done.


# Data load

In [5]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [6]:
print(train.columns)
print(test.columns)
print(sample_submission.columns)

Index(['ID', 'Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number', 'Delay'],
      dtype='object')
Index(['ID', 'Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number'],
      dtype='object')
Index(['Not_Delayed', 'Delayed'], dtype='object')


In [7]:
print(train.Delay.value_counts())
print()
print('train.csv의 Delay Column 결측치 비율:', round(train.Delay.isnull().sum()/len(train.Delay)*100,3),'%')

Delay
Not_Delayed    210001
Delayed         45000
Name: count, dtype: int64

train.csv의 Delay Column 결측치 비율: 74.5 %


### train의 결측치가 아닌 비율은 25.5%
### 25.5%로 학습 -> 나머지 74.5%를 예측

### 이렇게 label된 100만개 데이터로 test.csv 예측


# Data Pre-Processing

In [8]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [9]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [10]:
#레이블이 없는 데이터들을 제거합니다
train_label_is_null = train[train.Delay.isnull()]
train = train.dropna()

In [11]:
train_label_is_null

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,


In [12]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
5,TRAIN_000005,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,Delayed
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999962,TRAIN_999962,10,11,600.0,2003.0,0,0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,Not_Delayed
999963,TRAIN_999963,5,2,1759.0,1926.0,0,0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,Delayed
999969,TRAIN_999969,10,10,940.0,1056.0,0,0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,Delayed
999985,TRAIN_999985,8,8,1914.0,2039.0,0,0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,Not_Delayed


In [13]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [14]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
5,TRAIN_000005,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed,0
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999962,TRAIN_999962,10,11,600.0,2003.0,0,0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,Not_Delayed,0
999963,TRAIN_999963,5,2,1759.0,1926.0,0,0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,Delayed,1
999969,TRAIN_999969,10,10,940.0,1056.0,0,0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,Delayed,1
999985,TRAIN_999985,8,8,1914.0,2039.0,0,0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,Not_Delayed,0


In [15]:
train.drop(columns={'Cancelled','Diverted'}, inplace=True)
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
5,TRAIN_000005,4,13,1545.0,1900.0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed,0
6,TRAIN_000006,1,20,1742.0,1903.0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999962,TRAIN_999962,10,11,600.0,2003.0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,Not_Delayed,0
999963,TRAIN_999963,5,2,1759.0,1926.0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,Delayed,1
999969,TRAIN_999969,10,10,940.0,1056.0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,Delayed,1
999985,TRAIN_999985,8,8,1914.0,2039.0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,Not_Delayed,0


In [16]:
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [17]:
print(len(train_x))
train_y.value_counts()

255001


Delay_num
0    210001
1     45000
Name: count, dtype: int64

# Classification Model Fit

In [18]:
# 1. 레이블이 있는 데이터로 분류 모델을 훈련합니다.

# CatBoostClassifier 모델 초기화
model = cb.CatBoostClassifier(task_type="GPU", devices='0:1', verbose=200)

# 튜닝할 하이퍼파라미터 설정
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

# GridSearchCV 사용
grid_search_train = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=10)
grid_search_train.fit(train_x, train_y)

# 최적의 하이퍼파라미터 출력
print("Best Parameters: ", grid_search_train.best_params_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 1/3; 1/54] START depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01.
0:	learn: 0.6878435	total: 25.5ms	remaining: 12.7s
200:	learn: 0.4533653	total: 1.49s	remaining: 2.21s
400:	learn: 0.4467466	total: 2.83s	remaining: 700ms
499:	learn: 0.4453624	total: 3.49s	remaining: 0us
[CV 1/3; 1/54] END depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=-0.447 total time=   5.3s
[CV 2/3; 1/54] START depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01.
0:	learn: 0.6878600	total: 9.47ms	remaining: 4.72s
200:	learn: 0.4533153	total: 1.38s	remaining: 2.06s
400:	learn: 0.4466160	total: 2.75s	remaining: 679ms
499:	learn: 0.4452690	total: 3.43s	remaining: 0us
[CV 2/3; 1/54] END depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=-0.447 total time=   3.7s
[CV 3/3; 1/54] START depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01.
0:	learn: 0.6878394	total: 6.67ms	remaining: 3.33s
200:	learn

In [19]:
best_params_train = grid_search_train.best_params_

best_model_train = cb.CatBoostClassifier(
    iterations=best_params_train['iterations'],
    learning_rate=best_params_train['learning_rate'],
    depth=best_params_train['depth'],
    l2_leaf_reg=best_params_train['l2_leaf_reg'],
    task_type="GPU", 
    devices='0:1', 
    verbose=200
)

best_model_train.fit(train_x, train_y)

0:	learn: 0.6671172	total: 10.4ms	remaining: 10.4s
200:	learn: 0.4412630	total: 1.61s	remaining: 6.39s
400:	learn: 0.4360185	total: 3.23s	remaining: 4.83s
600:	learn: 0.4323136	total: 4.86s	remaining: 3.23s
800:	learn: 0.4290991	total: 6.48s	remaining: 1.61s
999:	learn: 0.4262582	total: 8.09s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2051e1512e0>

In [20]:
# 2. 훈련된 모델을 사용하여 레이블이 없는 데이터의 레이블을 예측합니다.
train_label_is_null_x = train_label_is_null.drop(['ID', 'Delay'], axis=1)
train_label_is_null_y = best_model_train.predict(train_label_is_null_x)

# TEST (train test split을 통한 정확도 측정)

In [21]:
train_x_, valid_x_, train_y_, valid_y_ = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

In [22]:
# 검증 데이터에 대한 예측
valid_predictions = best_model_train.predict(valid_x_)

# 예측값이 2D 배열로 반환될 수 있으므로, 1D로 변환합니다.
# valid_predictions = valid_predictions.reshape(-1)

# 정확도 계산
accuracy = accuracy_score(valid_y_, valid_predictions)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 82.39%


# TEST END

In [28]:
pd.Series(train_label_is_null_y).value_counts()

0    741309
1      3690
Name: count, dtype: int64

In [29]:
train.Delay_num.value_counts()

Delay_num
0    210001
1     45000
Name: count, dtype: int64

In [30]:
# 'ID', 'Delay_num'컬럼 생성 및 컬럼 병합
trained_label_is_null = train_label_is_null_x.copy()
trained_label_is_null['ID'] = train_label_is_null.ID
trained_label_is_null['Delay_num'] = train_label_is_null_y
trained_label_is_null['Delay'] = trained_label_is_null['Delay_num'].map(lambda x: 'Delayed' if x==1 else 'Not_Delayed')

# 컬럼 순서 변경
trained_label_is_null = trained_label_is_null[['ID','Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number', 'Delay',
       'Delay_num']]
trained_label_is_null

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
0,TRAIN_000000,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,Not_Delayed,0
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,Not_Delayed,0
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,Not_Delayed,0
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,Not_Delayed,0
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,Not_Delayed,0
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,Not_Delayed,0
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,Not_Delayed,0
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,Not_Delayed,0


In [31]:
# 기존 labeling된 DF와 예측된 DF 병합
combined_df = pd.concat([trained_label_is_null, train])
combined_df = combined_df.sort_index(ascending=True)
combined_df.head(10)

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
0,TRAIN_000000,4,15,600.0,1900.0,0.0,0.0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,Not_Delayed,0
1,TRAIN_000001,8,15,740.0,1024.0,0.0,0.0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,Not_Delayed,0
2,TRAIN_000002,9,6,1610.0,1805.0,0.0,0.0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,Not_Delayed,0
3,TRAIN_000003,7,10,905.0,1735.0,0.0,0.0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,Not_Delayed,0
4,TRAIN_000004,1,11,900.0,1019.0,0.0,0.0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,Not_Delayed,0
5,TRAIN_000005,4,13,1545.0,1900.0,,,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed,0
6,TRAIN_000006,1,20,1742.0,1903.0,,,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed,0
7,TRAIN_000007,4,20,1815.0,1955.0,0.0,0.0,256,13930,11,217,13198,23,403.0,23,8,20304.0,173,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,,,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed,0
9,TRAIN_000009,6,6,650.0,838.0,0.0,0.0,207,12992,3,169,12266,42,374.0,13,8,20366.0,468,Not_Delayed,0


In [33]:
# 모델 다시 학습
combined_df_x = combined_df.drop(['ID','Delay','Delay_num'], axis=1)
combined_df_y = combined_df['Delay_num']

# CatBoostClassifier 모델 초기화
model = cb.CatBoostClassifier(task_type="GPU", devices='0:1', verbose=200)

# 튜닝할 하이퍼파라미터 설정
param_grid = {
    'iterations': [200, 500, 700, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

# GridSearchCV 사용
# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=10, n_jobs=-1)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=10)
grid_search.fit(combined_df_x, combined_df_y)

# 최적의 하이퍼파라미터 출력
print("Best Parameters: ", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV 1/3; 1/108] START depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01
0:	learn: 0.6658329	total: 25.2ms	remaining: 5.02s
199:	learn: 0.1319805	total: 2.36s	remaining: 0us
[CV 1/3; 1/108] END depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01;, score=-0.133 total time=   3.4s
[CV 2/3; 1/108] START depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01
0:	learn: 0.6659409	total: 12.1ms	remaining: 2.4s
199:	learn: 0.1327324	total: 2.43s	remaining: 0us
[CV 2/3; 1/108] END depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01;, score=-0.132 total time=   3.0s
[CV 3/3; 1/108] START depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01
0:	learn: 0.6659824	total: 12.6ms	remaining: 2.52s
199:	learn: 0.1321782	total: 2.37s	remaining: 0us
[CV 3/3; 1/108] END depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01;, score=-0.132 total time=   2.9s
[CV 1/3; 2/108] START depth=6, iterations=200, l2_lea

In [None]:
# 최적의 하이퍼파라미터로 모델 학습
best_params = grid_search.best_params_

best_model = cb.CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    task_type="GPU", 
    devices='0:1', 
    verbose=200
)

best_model.fit(combined_df_x, combined_df_y)

# Inference

In [None]:
y_pred = best_model.predict_proba(test_x)

# Submit

In [None]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [None]:
submission.to_csv('baseline_submission.csv', index=True)

In [None]:
pd.read_csv('baseline_submission.csv')

In [None]:
pd.read_csv('sample_submission.csv')