In [1]:
import os
import pandas as pd
from datetime import datetime, timedelta
import operator
import numpy as np
import functools
from sklearn.tree import DecisionTreeRegressor as DTR

In [2]:
# Relative CSV paths used by the functions below are resolved against this
# working directory.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
os.chdir("C:\\Users\\xzou\\Downloads\\dataSets\\training\\KMSP_MTI")

In [28]:
def make_train_data(time_granularity):
    """Aggregate the raw training records into per-time-window counts.

    Reads "volume(table 6)_training.csv" from the working directory, splits it
    by (tollgate_id, direction) for tollgates 1-3 and directions 0-1, and for
    every non-empty combination writes a CSV named
    "T<tollgate_id><direction>ti_<time_granularity>.csv".

    Each output row is one time window labelled by ``get_time_window`` and
    holds the column sums of the records falling in that window ("volume" is
    the record count, "m_*" are one-hot counts of "vehicle_model"). Windows
    that were stepped over without containing any record are added as
    all-zero rows.

    time_granularity: window length in minutes.

    Returns nothing; the results are the CSV files written.

    NOTE(review): uses Python 2 only constructs (``dict.iteritems``, indexing
    the result of ``map``) and ``DataFrame.sort`` / ``DataFrame.append``,
    which newer pandas releases no longer provide -- confirm the target
    environment before reuse.
    """
    all_data = pd.read_csv("volume(table 6)_training.csv")
    # Start of the very first time window; later windows follow back to back.
    start_datetime = datetime(2016,9,19,0,0,0)
    tid_direction_dict = {}
    for tid in [1,2,3]:
        for direction in [0,1]:
            tid_direction_data = all_data[(all_data.tollgate_id == tid) & (all_data.direction == direction)] \
                                .drop(["tollgate_id", "direction"], axis = 1)
            # Only combinations that actually occur in the data are kept.
            if tid_direction_data.shape[0] > 0:
                tid_direction_dict[(tid, direction)] = tid_direction_data.sort(["time"])
    for key, tid_direction_df in tid_direction_dict.iteritems():
        time_list = tid_direction_df["time"].tolist()
        # last_pos remembers the value of pos when the previous non-empty
        # window was closed; pos is the index of the next record to place.
        last_pos = 0
        pos = 0
        time_window = []
        missing_timewindow = []
        missing_records_dict = {}
        current_datetime = start_datetime
        # Walk the time-sorted records and the consecutive windows in lockstep.
        # The loop ends with the last record, so windows after it are not
        # reported as missing.
        # NOTE(review): a record earlier than current_datetime never satisfies
        # the membership test below, so pos would stop advancing -- presumably
        # the data never starts before start_datetime; verify.
        while pos < tid_direction_df.shape[0]:
            this_time_interval_end = current_datetime + timedelta(minutes = time_granularity)
            # Parse "YYYY-MM-DD HH:MM:SS" by hand into a datetime.
            this_datetime_str = time_list[pos]
            this_ymd = this_datetime_str.split(" ")[0]
            this_hms = this_datetime_str.split(" ")[1]
            int_this_ymd = map(int, this_ymd.split("-"))
            int_this_hms = map(int, this_hms.split(":"))
            this_datetime = datetime(int_this_ymd[0], int_this_ymd[1], int_this_ymd[2], 
                                     int_this_hms[0], int_this_hms[1], int_this_hms[2])
            if this_datetime >= current_datetime and this_datetime < this_time_interval_end:
                # Record belongs to the current window [current, end).
                time_window.append(get_time_window(current_datetime))
                pos += 1
            else:
                # The current window is finished. If pos has not moved since
                # the last bookkeeping update, the window received no record.
                if last_pos == pos: # insert a blank record representing the missing time window
                    missing_timewindow.append(get_time_window(current_datetime))
                else:
                    last_pos = pos
                current_datetime = this_time_interval_end

        tid_direction_df["time_window"] = time_window
        # One unit of volume per record, so the group sum is the record count.
        tid_direction_df["volume"] = [1] * len(time_window)
        tid_direction_v_dummies = pd.get_dummies(tid_direction_df["vehicle_model"], prefix="m")
        tid_direction_v_sep = pd.concat([tid_direction_df, tid_direction_v_dummies], axis = 1)
        # key is (tollgate_id, direction): "vehicle_type" is dropped only for
        # direction 0 and is summed along with the other columns otherwise.
        if key[1] == 0:
            tid_direction_v_sep.drop(["time", "vehicle_model", "vehicle_type"], axis=1, inplace=True)
        else:
            tid_direction_v_sep.drop(["time", "vehicle_model"], axis=1, inplace=True)
        data_grouped_by_timewindow = tid_direction_v_sep.groupby(["time_window"]).sum().reset_index()
        data_grouped_by_timewindow.rename(columns={"vehicle_type": "vehicle_type_1"}, inplace=True)
        # Build all-zero rows, with the same columns, for the empty windows.
        missing_records_dict["time_window"] = missing_timewindow
        for column in data_grouped_by_timewindow.columns:
            if column != "time_window":
                missing_records_dict[column] = [0] * len(missing_timewindow)
        missing_df = pd.DataFrame(missing_records_dict)
        whole_tid_direction_data = data_grouped_by_timewindow.append(missing_df, ignore_index=True).sort(["time_window"])
        tid_direction_dict[key] = whole_tid_direction_data
        whole_tid_direction_data.to_csv("T" + str(key[0]) + str(key[1]) + "ti_" + str(time_granularity) + ".csv", index=False)

In [27]:
def get_time_window(datetime):
    """Return the window label ``"<date> HH:MM"`` for a datetime value.

    The date part is the date portion of ``isoformat(" ")``; hour and minute
    are zero-padded through ``time_wrapper``. The parameter name shadows the
    ``datetime`` class inside this function; only instance attributes are used.
    """
    date_part = datetime.isoformat(" ").split(" ")[0]
    clock_part = ":".join(time_wrapper(value) for value in (datetime.hour, datetime.minute))
    return "%s %s" % (date_part, clock_part)

In [3]:
def time_wrapper(num):
    """Render ``num`` as text, prefixing a "0" when it is below 10."""
    padding = "0" if num < 10 else ""
    return padding + str(num)

In [4]:
def get_min_mapping(time_granularity):
    """Map each window's start minute to its end minute within one hour.

    Keys and values are zero-padded strings produced by ``time_wrapper``; the
    window that ends exactly on the hour maps to "00".
    """
    mapping = {}
    slot = 0
    while True:
        window_end = (slot + 1) * time_granularity
        if window_end > 60:
            break
        start_label = time_wrapper(slot * time_granularity)
        # A window closing on the hour wraps around to minute "00".
        mapping[start_label] = time_wrapper(window_end) if window_end < 60 else "00"
        slot += 1
    return mapping

In [5]:
def get_remove_intervals(time_granularity):
    """Build the five lists of time-of-day intervals to strip from the data.

    Every interval is ``[[start_hour, 0], [end_hour, 60 - time_granularity]]``.
    The returned list holds, in order: select, predict (all day), predict AM,
    predict PM and irrelevant.
    """
    last_start_minute = 60 - time_granularity

    def span(first_hour, last_hour):
        # From the top of first_hour to the last window start of last_hour.
        return [[first_hour, 0], [last_hour, last_start_minute]]

    hour_ranges = [
        [(0, 5), (8, 14), (17, 23)],     # select
        [(0, 7), (10, 16), (19, 23)],    # predict, all day
        [(0, 7), (10, 23)],              # predict, AM
        [(0, 16), (19, 23)],             # predict, PM
        [(0, 5), (10, 14), (19, 23)],    # irrelevant
    ]
    return [[span(first, last) for first, last in group] for group in hour_ranges]

In [6]:
def read_data(in_path):
    """Load the CSV at ``in_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(in_path)

In [7]:
def remove_by_date(data, date_interval):
    """Return the rows of ``data`` whose date lies outside ``date_interval``.

    data: DataFrame with a "time_window" column whose values start with
        "YYYY-MM-DD" followed by a space.
    date_interval: ``[[year, month, day], [year, month, day]]``, the inclusive
        low and high bounds of the dates to remove.

    Returns a new DataFrame; ``data`` itself is not modified.
    """
    date_low_bound, date_high_bound = date_interval
    low_datetime = datetime(*list(date_low_bound)[:3])
    high_datetime = datetime(*list(date_high_bound)[:3])
    # Keep the mask on the side instead of storing it in a temporary
    # "selection" column: writing into ``data`` changed the caller's frame and
    # raised SettingWithCopyWarning whenever ``data`` was a filtered slice.
    keep_row = []
    for window in data["time_window"].tolist():
        year, month, day = [int(part) for part in window.split(" ")[0].split("-")][:3]
        keep_row.append(not (low_datetime <= datetime(year, month, day) <= high_datetime))
    # .copy() hands back an independent frame that is safe to modify later.
    return data.loc[np.array(keep_row, dtype=bool)].copy()

In [8]:
def remove_by_date_intervals(data, date_intervals):
    """Apply ``remove_by_date`` once per interval, feeding each result into the next."""
    remaining = data
    for interval in date_intervals:
        remaining = remove_by_date(remaining, interval)
    return remaining

In [9]:
def datetime_hm(hour, minute):
    """Wrap a time of day in a datetime on the fixed date 0001-01-01."""
    return datetime(year=1, month=1, day=1, hour=hour, minute=minute)

In [10]:
def remove_by_time(data, time_interval):
    """Return the rows of ``data`` whose time of day lies outside ``time_interval``.

    data: DataFrame with a "time_window" column of "<date> HH:MM" strings.
    time_interval: ``[[hour, minute], [hour, minute]]``, the inclusive low and
        high bounds of the times of day to remove.

    Returns a new DataFrame; ``data`` itself is not modified.
    """
    time_low_bound, time_high_bound = time_interval
    # (hour, minute) tuples compare in clock order, so no dummy date is needed.
    low_clock = (time_low_bound[0], time_low_bound[1])
    high_clock = (time_high_bound[0], time_high_bound[1])
    # Keep the mask on the side instead of storing it in a temporary
    # "selection" column: writing into ``data`` changed the caller's frame and
    # raised SettingWithCopyWarning whenever ``data`` was a filtered slice.
    keep_row = []
    for window in data["time_window"].tolist():
        hour, minute = [int(part) for part in window.split(" ")[1].split(":")][:2]
        keep_row.append(not (low_clock <= (hour, minute) <= high_clock))
    # .copy() hands back an independent frame that is safe to modify later.
    return data.loc[np.array(keep_row, dtype=bool)].copy()

In [11]:
def select_by_feature(data, feature_list):
    """Return ``data`` indexed by ``feature_list`` (the columns to keep)."""
    selected = data[feature_list]
    return selected

In [12]:
def remove_by_time_intervals(data, time_intervals):
    """Apply ``remove_by_time`` once per interval, feeding each result into the next."""
    remaining = data
    for interval in time_intervals:
        remaining = remove_by_time(remaining, interval)
    return remaining

In [13]:
def transform_time_window(row, time_interval):
    """Floor a "<date> HH:MM[:SS]" timestamp to the start of its time window.

    row: timestamp string with the date and the time separated by one space.
    time_interval: window length in minutes.

    Returns "<date> HH:MM", where MM is the zero-padded start minute of the
    window containing the timestamp; the hour text is passed through unchanged.
    """
    ymd, hms = row.split(" ")[:2]
    hour, minute = hms.split(":")[:2]
    # Floor division is explicit: a plain "/" only floors on Python 2 and
    # otherwise yields a float minute and a malformed label.
    window_start_minute = int(minute) // time_interval * time_interval
    return "%s %s:%02d" % (ymd, hour, window_start_minute)

In [14]:
def make_test_data(tollgate_id, direction, time_granularity):
    """Aggregate "volume_test1.csv" into per-time-window counts for one tollgate and direction.

    Rows are filtered to ``tollgate_id`` and ``direction``, "vehicle_model" is
    one-hot encoded with prefix "m", each record is assigned to its time
    window through ``transform_time_window`` and the remaining columns are
    summed per window ("volume" counts the records).
    """
    raw = pd.read_csv("volume_test1.csv")
    at_gate = raw.loc[raw["tollgate_id"] == tollgate_id]
    records = at_gate.loc[at_gate["direction"] == direction]
    model_dummies = pd.get_dummies(records["vehicle_model"], prefix="m")
    unused_columns = ["vehicle_type", "tollgate_id", "direction", "vehicle_model"]
    counts = pd.concat([records, model_dummies], axis=1).drop(unused_columns, axis=1)
    timestamps = counts["time"].tolist()
    counts["time_window"] = [transform_time_window(stamp, time_interval=time_granularity)
                             for stamp in timestamps]
    counts = counts.drop(["time"], axis=1)
    # One unit per record so that the group sum is the traffic volume.
    counts["volume"] = [1] * len(timestamps)
    return counts.groupby(["time_window"]).sum().reset_index()

In [15]:
def ind_ymd_mapping(data):
    """Number the distinct dates of ``data["time_window"]`` in order of first appearance.

    Returns a dict ``{0: first_date, 1: second_date, ...}`` where each date is
    the text before the first space of a time-window label.
    """
    mapping = {}
    known_days = set()
    for label in data["time_window"].tolist():
        day = label.split(" ")[0]
        if day in known_days:
            continue
        known_days.add(day)
        mapping[len(mapping)] = day
    return mapping

In [16]:
def columns_by_hm(data, transform_list):
    """Pivot per-window columns into one column per (feature, time of day).

    data: DataFrame with a "time_window" column of "<date> HH:MM" labels.
    transform_list: names of the columns to pivot; "time_window" and names
        absent from ``data`` are ignored.

    Returns a DataFrame with one column "<feature>^<HH:MM>" per pivoted
    feature and distinct time of day, holding that feature's values for the
    rows with that time of day, in row order. Columns are sorted by name, so
    all columns of one feature are adjacent and ordered by time of day.
    """
    available = set(data.columns.tolist())
    transform_columns = [column for column in transform_list
                         if column in available and column != "time_window"]
    hm = [label.split(" ")[1] for label in data["time_window"].tolist()]
    # Group the row positions by time of day once instead of rescanning every
    # row for each (column, time of day) pair.
    positions_by_hm = {}
    for position, some_hm in enumerate(hm):
        positions_by_hm.setdefault(some_hm, []).append(position)
    columns_hm_dict = {}
    for column in transform_columns:
        column_content = data[column].tolist()
        for some_hm, positions in positions_by_hm.items():
            columns_hm_dict[column + "^" + some_hm] = [column_content[i] for i in positions]
    # Callers slice these columns positionally, so the order must not depend
    # on dict/set iteration order: fix it explicitly by sorting the names.
    return pd.DataFrame(columns_hm_dict, columns=sorted(columns_hm_dict))

In [17]:
def hour_plus(hour, plus):
    """Add ``plus`` to ``hour`` (int or numeric text) and return it padded to two digits."""
    shifted = int(hour) + plus
    text = str(shifted)
    if shifted < 10:
        text = "0" + text
    return text

In [18]:
def format_datetime(hm_list, ind_ymd_dict, pred_interval, min_mapping, time_granularity):
    """Build the "[start,end)" labels of the predicted time windows.

    hm_list: "HH:MM" start times of the feature windows. Every predicted
        window starts two hours after its feature window.
    ind_ymd_dict: ``{index: "YYYY-MM-DD"}``; labels are emitted day by day in
        ascending index order.
    pred_interval: "AM" keeps the first half of ``hm_list``, "PM" the second
        half, any other value keeps all of it.
    min_mapping: start minute -> end minute, both zero-padded strings.
    time_granularity: window length in minutes.

    Returns a list of "[<date> HH:MM:00,<date> HH:MM:00)" strings.
    """
    half = len(hm_list) // 2
    if pred_interval == "AM":
        hm_list = hm_list[:half]
    if pred_interval == "PM":
        hm_list = hm_list[half:]
    # Start minute of the last window of an hour, padded like the minutes in
    # hm_list. Comparing against an unpadded str() missed the hour rollover
    # whenever 60 - time_granularity < 10 (e.g. "00" vs "0" for hourly windows).
    last_start_minute = "%02d" % (60 - time_granularity)
    bounds = []
    for hm in hm_list:
        hour_text, left_min = hm.split(":")[:2]
        left_hour = int(hour_text) + 2
        # The window that starts last in an hour ends in the following hour.
        right_hour = left_hour + 1 if left_min == last_start_minute else left_hour
        bounds.append(("%02d:%s:00" % (left_hour, left_min),
                       "%02d:%s:00" % (right_hour, min_mapping[left_min])))
    time_window = []
    # Iterate by sorted index rather than dict order so that days always come
    # out in the order they were numbered.
    for index in sorted(ind_ymd_dict):
        ymd = ind_ymd_dict[index]
        time_window.extend("[%s %s,%s %s)" % (ymd, left, ymd, right) for left, right in bounds)
    return time_window

In [19]:
def get_prediction(select_df, predict_df, test_df, top_k):
    """Predict each test row as the mean outcome of its ``top_k`` nearest training rows.

    select_df: training features, one row per training day.
    predict_df: training outcomes, row-aligned with ``select_df``; columns are
        named "<name>^<HH:MM>" with all columns of one name adjacent.
    test_df: test features, compared position by position with ``select_df``
        rows through ``dist``.
    top_k: number of nearest training rows to average.

    Returns ``{name: [values]}`` where the values of all test rows are
    concatenated in test-row order.

    Side effect: writes "select_df.csv", "predict_df.csv" and "test_df.csv"
    to the working directory.
    """
    # Snapshots of the three inputs, written to the working directory.
    select_df.to_csv("select_df.csv",index=False)
    predict_df.to_csv("predict_df.csv",index=False)
    test_df.to_csv("test_df.csv",index=False)

    # Distinct base names of the outcome columns, in column order.
    seen = set()
    columns = []
    for name in predict_df.columns.tolist():
        base = name.split("^")[0]
        if base not in seen:
            seen.add(base)
            columns.append(base)
    predictions = dict((column, []) for column in columns)
    n_columns = len(columns)

    # Extract the training rows once instead of once per test row.
    train_rows = [select_df.iloc[j].tolist() for j in range(select_df.shape[0])]
    for i in range(test_df.shape[0]):
        test_row = test_df.iloc[i].tolist()
        distance_list = [(j, dist(train_row, test_row)) for j, train_row in enumerate(train_rows)]
        # sorted() is stable, so equally distant rows keep their original order.
        nearest = [j for j, _ in sorted(distance_list, key=operator.itemgetter(1))[:top_k]]
        one_day_prediction = predict_df.iloc[nearest].mean().tolist()
        total = len(one_day_prediction)
        for ind, column in enumerate(columns):
            # Explicit floor division: slice bounds must be integers on both
            # Python 2 and Python 3.
            begin = (ind * total) // n_columns
            end = ((ind + 1) * total) // n_columns
            predictions[column].extend(one_day_prediction[begin:end])
    return predictions

In [20]:
def dist(a, b):
    """Mean of ``|a[i] - b[i]| / (a[i] + 1)`` over the positions of ``a``."""
    ratios = []
    for position in range(len(a)):
        reference = a[position]
        ratios.append(float(abs(reference - b[position])) / float(reference + 1))
    return np.mean(ratios)

In [21]:
def get_an_interval(interval_str):
    """Parse "Y-M-D,Y-M-D" into ``[[year, month, day], [year, month, day]]``.

    Both bounds are returned as real lists of ints so that callers can index
    them (a bare ``map`` is only a list on Python 2).
    """
    start_date, end_date = interval_str.split(",")
    return [[int(part) for part in start_date.split("-")],
            [int(part) for part in end_date.split("-")]]

def get_date_intervals(date_intervals):
    """Parse a list of "Y-M-D,Y-M-D" strings into a list of date intervals.

    An empty string anywhere in ``date_intervals`` means that no date is
    removed, and an empty list is returned.
    """
    if "" in date_intervals: # no date is removed
        return []
    return [get_an_interval(interval) for interval in date_intervals]

In [22]:
def data_preprocess(input_path, remove_dates, train_test_proportion, time_granularity):
    """Load a per-window table, drop the excluded dates and compute the train/test split row.

    input_path: CSV produced for one tollgate/direction.
    remove_dates: date intervals handed to ``remove_by_date_intervals``.
    train_test_proportion: fraction of the whole days used for training.
    time_granularity: window length in minutes.

    Returns ``(data, split_pos)`` where ``split_pos`` is the number of leading
    rows that form the training set: a whole number of days times the number
    of windows per day.
    """
    # Explicit floor division keeps these integers on Python 2 and Python 3;
    # the split position is later used as an iloc bound.
    total_intervals = 24 * 60 // time_granularity
    data = read_data(input_path)
    data_removing_dates = remove_by_date_intervals(data.copy(deep=True), remove_dates)
    whole_days = data_removing_dates.shape[0] // total_intervals
    train_days = int(whole_days * train_test_proportion)
    return data_removing_dates, train_days * total_intervals

In [25]:
def KMSP_alpha(data, tollgate_id, direction, k, train_test_split_pos, pred_interval, 
               min_mapping, remove_time_intervals, time_granularity, selection_list,
               prediction_list, feature_list):
    """
    Evaluate the k-nearest-day prediction on a train/test split of ``data``.

    The first ``train_test_split_pos`` rows are the training set, the rest the
    test set. A regression tree is fitted on the training rows left after
    removing the "irrelevant" intervals; the nearest-day prediction for the
    test days is then passed through the tree and scored against the truth.

    remove_time_intervals: 0 for select, 1 for ad, 2 for am, 3 for pm, 4 for irrelevant

    Returns ``(output_df, mape_fake, mape_real, rt)``:
    output_df -- time_window / tollgate_id / direction columns plus the
                 nearest-day prediction columns;
    mape_fake -- MAPE of the tree applied to the predicted features;
    mape_real -- MAPE of the tree applied to the actual test features;
    rt        -- the fitted tree.

    NOTE(review): both MAPE values divide by the true volume, so a window
    with zero true volume fails here -- confirm that cannot occur.
    """
    data = select_by_feature(data.copy(True), feature_list)
    train_data = data.iloc[:train_test_split_pos]
    test_data = data.iloc[train_test_split_pos:]
    features = [x for x in prediction_list if x not in ["time_window"]]

    train_data_for_tree = remove_by_time_intervals(train_data.copy(deep=True), remove_time_intervals[4])
    rt = DTR(min_impurity_split = 1)
    rt.fit(train_data_for_tree[features], train_data_for_tree["volume"])

    if pred_interval == "AM":
        remove_time_intervals_predict = remove_time_intervals[2]
    elif pred_interval == "PM":
        remove_time_intervals_predict = remove_time_intervals[3]
    else:
        remove_time_intervals_predict = remove_time_intervals[1]

    train_data_selection = remove_by_time_intervals(train_data.copy(deep=True), remove_time_intervals[0])
    train_data_prediction = remove_by_time_intervals(train_data.copy(deep=True), remove_time_intervals_predict)
    test_data_feature = remove_by_time_intervals(test_data.copy(deep=True), remove_time_intervals[0])
    test_data_response = remove_by_time_intervals(test_data.copy(deep=True), remove_time_intervals_predict)

    test_data_ind_ymd = ind_ymd_mapping(test_data_response)

    selection_columns_by_hm = columns_by_hm(train_data_selection.copy(True), selection_list)
    prediction_columns_by_hm = columns_by_hm(train_data_prediction.copy(True), prediction_list)
    test_columns_by_hm = columns_by_hm(test_data_feature, selection_list)

    # The columns of the first selection feature carry every time of day once;
    # selection_list also contains "time_window", hence the "- 1". Floor
    # division keeps the slice bound an integer on Python 2 and Python 3.
    column_list = test_columns_by_hm.columns.tolist()
    selected_column_list = column_list[:len(column_list) // (len(selection_list) - 1)]
    hm_list = [x.split("^")[1] for x in selected_column_list]

    output_info = {}
    output_info["time_window"] = format_datetime(hm_list, test_data_ind_ymd, pred_interval, min_mapping, time_granularity)
    prediction_df = pd.DataFrame(get_prediction(selection_columns_by_hm, prediction_columns_by_hm, test_columns_by_hm, k))
    output_info["tollgate_id"] = [tollgate_id] * len(output_info["time_window"])
    output_info["direction"] = [direction] * len(output_info["time_window"])
    output_df = pd.concat([pd.DataFrame(output_info), prediction_df], axis=1)

    truth = test_data_response["volume"].tolist()

    def mape(predicted):
        # Mean absolute percentage error against the true test volumes.
        return np.mean([float(abs(predicted[i] - truth[i]))/(truth[i]) for i in range(len(truth))])

    rt_pred_real = rt.predict(test_data_response[features]).tolist()
    rt_pred_fake = rt.predict(prediction_df[features]).tolist()
    return output_df, mape(rt_pred_fake), mape(rt_pred_real), rt

In [24]:
def get_best_k_mape(input_file, pred_interval, time_granularity):
    """Search, for every listed table, the k with the lowest "fake" MAPE.

    input_file: text file with one line per table, formatted as
        "<csv path> <tollgate_id> <direction>:<date intervals>" where the
        date intervals are space-separated "Y-M-D,Y-M-D" items (empty for
        none).
    pred_interval: "AM", "PM" or anything else for the whole day.
    time_granularity: window length in minutes.

    For each table, every k from 1 to the number of training days is
    evaluated with ``KMSP_alpha`` on a 75% / 25% train/test split. The
    per-k errors and the best k per table are printed; the predictions of the
    best k of all tables are concatenated, written to
    "T2_submission_test.csv" and returned.
    """
    # Read the parameters up front and close the file right away; blank lines
    # (e.g. a trailing newline) describe no table and are skipped.
    with open(input_file, "r") as input_params:
        input_lines = [line for line in input_params.readlines() if line.strip()]
    line_fields = [line.split("\n")[0].split(":") for line in input_lines]
    input_tables_info = [fields[0].split(" ") for fields in line_fields]
    remove_dates_list = [get_date_intervals(fields[1].split(" ")) for fields in line_fields]
    min_mapping = get_min_mapping(time_granularity)
    feature_list = ["time_window", "m_1", "m_2", "m_3", "has_etc", "volume"]
    selection_list = ["time_window", "m_1", "m_2", "m_3", "has_etc"]
    prediction_list = ["time_window", "volume"]
    remove_time_intervals = get_remove_intervals(time_granularity)
    # Floor division: this must stay an integer to be usable in range().
    intervals_per_day = 24 * 60 // time_granularity

    best_k_pred_mape_rt = []
    for table_info, remove_dates in zip(input_tables_info, remove_dates_list):
        input_path = table_info[0]
        data, train_test_split_pos = data_preprocess(input_path, remove_dates, 0.75, time_granularity)
        print("%s \n" % input_path)
        k_pred_mape_rt = []
        for k in range(1, train_test_split_pos // intervals_per_day + 1):
            df, error_fake, error_real, rt = KMSP_alpha(data, int(table_info[1]), int(table_info[2]),
                                   k, train_test_split_pos, pred_interval, min_mapping, remove_time_intervals,
                                   time_granularity, selection_list, prediction_list, feature_list)
            print("k = %d, fake MAPE = %f, real MAPE = %f \n" % (k, error_fake, error_real))
            k_pred_mape_rt.append((k, df, error_fake, rt))
        # Lowest fake MAPE wins; on ties the smallest k is kept.
        best_k_pred_mape_rt.append(min(k_pred_mape_rt, key=operator.itemgetter(2)))

    print("Best k-MAPE pairs for all tollgate-direction combinations:")
    for table_info, best in zip(input_tables_info, best_k_pred_mape_rt):
        print(table_info[0] + ": " + str((best[0], best[2])) + "\n")
    print("Final MAPE: %f \n" % np.mean([best[2] for best in best_k_pred_mape_rt]))
    final_output = pd.concat([best[1] for best in best_k_pred_mape_rt])
    final_output.to_csv("T2_submission_test.csv", index=False)
    return final_output

In [26]:
# Search the best k for every table listed in "input_file_for_look.txt", using
# the "AD" prediction interval and 20-minute time windows.
get_best_k_mape("input_file_for_look.txt", "AD", 20)

T10ti_20.csv 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


k = 1, fake MAPE = 0.229803, real MAPE = 0.017523 

k = 2, fake MAPE = 0.192480, real MAPE = 0.017523 

k = 3, fake MAPE = 0.180343, real MAPE = 0.017523 

k = 4, fake MAPE = 0.180241, real MAPE = 0.017523 

k = 5, fake MAPE = 0.166010, real MAPE = 0.017523 

k = 6, fake MAPE = 0.170200, real MAPE = 0.017523 

k = 7, fake MAPE = 0.177078, real MAPE = 0.017523 

k = 8, fake MAPE = 0.178830, real MAPE = 0.017523 

k = 9, fake MAPE = 0.174899, real MAPE = 0.017523 

k = 10, fake MAPE = 0.173464, real MAPE = 0.017523 

k = 11, fake MAPE = 0.176772, real MAPE = 0.017523 

k = 12, fake MAPE = 0.177395, real MAPE = 0.017523 

k = 13, fake MAPE = 0.181957, real MAPE = 0.017523 

k = 14, fake MAPE = 0.183743, real MAPE = 0.017523 

T11ti_20.csv 

k = 1, fake MAPE = 0.181268, real MAPE = 0.009329 

k = 2, fake MAPE = 0.176603, real MAPE = 0.009329 

k = 3, fake MAPE = 0.167997, real MAPE = 0.009329 

k = 4, fake MAPE = 0.163997, real MAPE = 0.009329 

k = 5, fake MAPE = 0.156553, real MAPE = 0.0

Unnamed: 0,direction,time_window,tollgate_id,volume
0,0,"[2016-10-13 08:00:00,2016-10-13 08:20:00)",1,51.600000
1,0,"[2016-10-13 08:20:00,2016-10-13 08:40:00)",1,51.800000
2,0,"[2016-10-13 08:40:00,2016-10-13 09:00:00)",1,52.200000
3,0,"[2016-10-13 09:00:00,2016-10-13 09:20:00)",1,49.800000
4,0,"[2016-10-13 09:20:00,2016-10-13 09:40:00)",1,56.200000
5,0,"[2016-10-13 09:40:00,2016-10-13 10:00:00)",1,50.400000
6,0,"[2016-10-13 17:00:00,2016-10-13 17:20:00)",1,44.600000
7,0,"[2016-10-13 17:20:00,2016-10-13 17:40:00)",1,44.200000
8,0,"[2016-10-13 17:40:00,2016-10-13 18:00:00)",1,35.000000
9,0,"[2016-10-13 18:00:00,2016-10-13 18:20:00)",1,29.800000


In [240]:
def KMSP_release(input_path, tollgate_id, direction, k, remove_dates, pred_interval, time_granularity, 
                remove_time_intervals, selection_list, prediction_list, feature_list, min_mapping):
    """Predict the volumes of the test days for one tollgate and direction.

    The table at ``input_path`` (minus ``remove_dates``) is the training set;
    the test features come from ``make_test_data``. The ``k`` nearest
    training days provide a raw prediction, which is passed through a
    regression tree fitted on the training rows left after removing the
    "irrelevant" intervals, and rounded to integers.

    remove_time_intervals: 0 for select, 1 for ad, 2 for am, 3 for pm,
        4 for irrelevant.
    feature_list is accepted but not used by this function.

    Returns ``(output_df, data_ind_ymd)``: the frame with time_window,
    volume, tollgate_id and direction columns, and the index -> date mapping
    of the training selection data.

    Side effect: writes "test_data.csv" and "for_look.csv" (plus the files
    written by ``get_prediction``) to the working directory.
    """
    data = read_data(input_path)
    data_removing_dates = remove_by_date_intervals(data.copy(deep=True), remove_dates)
    data_selection = remove_by_time_intervals(data_removing_dates.copy(deep=True), remove_time_intervals[0])

    features = [x for x in prediction_list if x not in ["time_window"]]

    train_data_for_tree = remove_by_time_intervals(data_removing_dates.copy(deep=True), remove_time_intervals[4])
    rt = DTR(min_samples_split=5)
    rt.fit(train_data_for_tree[features], train_data_for_tree["volume"])

    if pred_interval == "AM":
        remove_time_intervals_predict = remove_time_intervals[2]
    elif pred_interval == "PM":
        remove_time_intervals_predict = remove_time_intervals[3]
    else:
        remove_time_intervals_predict = remove_time_intervals[1]
    data_prediction = remove_by_time_intervals(data_removing_dates.copy(deep=True), remove_time_intervals_predict)
    data_ind_ymd = ind_ymd_mapping(data_selection.copy(True))
    selection_columns_by_hm = columns_by_hm(data_selection.copy(True), selection_list)
    prediction_columns_by_hm = columns_by_hm(data_prediction.copy(True), prediction_list)

    test_data = make_test_data(tollgate_id, direction, time_granularity)
    test_data.to_csv("test_data.csv", index=False)
    test_data_ind_ymd = ind_ymd_mapping(test_data)
    test_columns_by_hm = columns_by_hm(test_data.copy(True), selection_list)

    # The columns of the first selection feature carry every time of day once;
    # selection_list also contains "time_window", hence the "- 1". Floor
    # division keeps the slice bound an integer on Python 2 and Python 3.
    column_list = test_columns_by_hm.columns.tolist()
    selected_column_list = column_list[:len(column_list) // (len(selection_list) - 1)]
    hm_list = [x.split("^")[1] for x in selected_column_list]

    output_info = {}
    output_info["time_window"] = format_datetime(hm_list, test_data_ind_ymd, pred_interval, min_mapping, time_granularity)
    prediction_df = pd.DataFrame(get_prediction(selection_columns_by_hm, prediction_columns_by_hm, test_columns_by_hm, k))
    prediction_df.to_csv("for_look.csv", index=False)
    rt_pred = rt.predict(prediction_df[features]).tolist()
    # A real list (not a lazy map object) so that DataFrame() can consume it.
    output_info["volume"] = [int(round(value)) for value in rt_pred]
    output_info["tollgate_id"] = [tollgate_id] * len(output_info["time_window"])
    output_info["direction"] = [direction] * len(output_info["time_window"])
    output_df = pd.DataFrame(output_info)
    return output_df, data_ind_ymd

In [232]:
def get_submission(input_file, time_granularity):
    """Produce the submission table for every tollgate/direction listed in ``input_file``.

    input_file: text file with one line per table, formatted as
        "<csv path> <tollgate_id> <direction>:<date intervals>:<k values>"
        where the date intervals are space-separated "Y-M-D,Y-M-D" items
        (empty for none) and the k values are space-separated integers, one
        per prediction interval.
    time_granularity: window length in minutes.

    Runs ``KMSP_release`` for each table with the "AD" prediction interval,
    concatenates the results, keeps the tollgate_id / time_window /
    direction / volume columns, writes them to "T2_submission_test.csv" and
    returns them.
    """
    # Read the parameters up front and close the file right away; blank lines
    # (e.g. a trailing newline) describe no table and are skipped.
    with open(input_file, "r") as input_params:
        input_lines = [line for line in input_params.readlines() if line.strip()]
    line_fields = [line.split("\n")[0].split(":") for line in input_lines]
    input_tables_info = [fields[0].split(" ") for fields in line_fields]
    remove_dates_list = [get_date_intervals(fields[1].split(" ")) for fields in line_fields]
    k_list = [fields[2].split(" ") for fields in line_fields]
    remove_time_intervals = get_remove_intervals(time_granularity)
    min_mapping = get_min_mapping(time_granularity)
    feature_list = ["time_window", "m_1", "m_2", "m_3", "has_etc", "volume"]
    selection_list = ["time_window", "m_1", "m_2", "m_3", "has_etc"]
    prediction_list = ["time_window", "volume"]
    dataframes = []
    # ``ind`` selects which of a table's k values belongs to the interval.
    for ind, time_interval in enumerate(["AD"]):
        for table_info, remove_dates, table_ks in zip(input_tables_info, remove_dates_list, k_list):
            df, data_ind_ymd = KMSP_release(table_info[0],
                                            int(table_info[1]),
                                            int(table_info[2]),
                                            int(table_ks[ind]),
                                            remove_dates,
                                            time_interval, time_granularity, remove_time_intervals,
                                            selection_list, prediction_list,
                                            feature_list, min_mapping)
            dataframes.append(df)
    final_output = pd.concat(dataframes)
    final_output = final_output[["tollgate_id", "time_window", "direction", "volume"]]
    final_output.to_csv("T2_submission_test.csv", index=False)
    return final_output

In [244]:
# Build the submission from the tables and k values listed in "input_file.txt",
# using 20-minute time windows.
final_result = get_submission("input_file.txt", 20)

[['5'], ['7'], ['3'], ['1'], ['14']]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
