In [1]:
import numpy as np
import pandas as pd

# Locations of the input CSV files (absolute paths under /kaggle/input).
TRAIN_FILE = "/kaggle/input/data-for-codeproject-contest/our_contest_train_2.csv"
TEST_FILE = "/kaggle/input/data-for-codeproject-contest/our_contest_test.csv"

In [2]:
from datetime import datetime

def clean_format(date):
    """Rewrite a dash-separated ``year-month-day`` string as ``month/day/year 00:00:00``.

    Args:
        date: String made of exactly three parts separated by "-".

    Returns:
        The reordered date with a midnight time component appended.

    Raises:
        ValueError: If ``date`` does not split into exactly three parts on "-".
    """
    year, month, day = date.split("-")
    reordered = "/".join([month, day, year])
    return reordered + " 00:00:00"

def convert2datetime(df, col):
    """Parse every timestamp string in ``df[col]`` into a ``datetime``.

    Each value is first parsed as ``%m/%d/%Y %H:%M:%S``. When that fails with
    a ``ValueError`` the value is passed through ``clean_format`` (which
    rewrites a dash-separated date and appends a midnight time) and parsed
    again with the same format.

    Args:
        df: Mapping/DataFrame whose ``col`` entry is an iterable of strings.
        col: Key of the column holding the timestamp strings.

    Returns:
        A list of ``datetime`` objects in the same order as ``df[col]``.

    Raises:
        ValueError: If a value matches neither accepted layout.
        TypeError: If a value is not a string; it is no longer masked by the
            fallback path.
    """
    primary_format = '%m/%d/%Y %H:%M:%S'
    date_times = []
    for time_stamp in df[col]:
        try:
            parsed = datetime.strptime(time_stamp, primary_format)
        except ValueError:
            # Only a format mismatch should trigger the fallback; a bare
            # ``except`` would also swallow KeyboardInterrupt and type errors.
            parsed = datetime.strptime(clean_format(time_stamp), primary_format)
        date_times.append(parsed)
    return date_times

def calc_account_age(df, post_time, account_create_time):
    """Replace the two timestamp columns with an ``accountAge`` column.

    ``accountAge`` is the ``.days`` component of
    ``post timestamp - account creation timestamp`` for each row, with both
    timestamps parsed by ``convert2datetime``.

    The input frame is left untouched: the new column is attached to the
    frame returned by ``drop`` rather than assigned onto ``df``. Assigning
    onto ``df`` would change the caller's frame as a side effect and can
    raise ``SettingWithCopyWarning`` when ``df`` is itself a filtered copy.

    Args:
        df: DataFrame containing both timestamp columns.
        post_time: Name of the column with the post timestamp strings.
        account_create_time: Name of the column with the account creation
            timestamp strings.

    Returns:
        A new DataFrame without ``post_time`` and ``account_create_time`` and
        with ``accountAge`` added.
    """
    account_creation_times = convert2datetime(df, account_create_time)
    post_creation_times = convert2datetime(df, post_time)

    account_ages = [
        (post_create - acc_create).days
        for acc_create, post_create in zip(account_creation_times, post_creation_times)
    ]

    return (
        df.drop([account_create_time, post_time], axis=1)
        .assign(accountAge=account_ages)
    )

def make_col_positive(df, col):
    """Shift ``df[col]`` so that its minimum is not negative.

    When the column minimum is below zero, every value is raised by the
    magnitude of that minimum (the smallest value becomes 0). A column whose
    minimum is already zero or positive is left unchanged; adding
    ``abs(min)`` unconditionally would shift such a column upward for no
    reason.

    The offset is derived from this frame alone, so two frames processed
    separately may be shifted by different amounts.

    Args:
        df: DataFrame to modify. The column is updated in place.
        col: Name of the numeric column to shift.

    Returns:
        The same ``df`` object that was passed in.
    """
    col_min = df[col].min()
    # A NaN minimum (empty or all-null column) compares False and is skipped.
    if col_min < 0:
        df[col] = df[col] - col_min
    return df

def combine_cols(df, col1, col2, col3, col4, col5, col6, new_col):
    """Join six string columns into one space-separated column.

    For each row the values of ``col1`` .. ``col6`` are concatenated in that
    order with a single space between them. The six source columns are
    removed from the result.

    The input frame is not modified: the combined column is attached after
    the source columns are dropped. This also keeps the result when
    ``new_col`` has the same name as one of the source columns.

    Args:
        df: DataFrame containing the six source columns (string values).
        col1, col2, col3, col4, col5, col6: Names of the columns to combine.
        new_col: Name of the combined column.

    Returns:
        A new DataFrame without the six source columns and with ``new_col``.

    Raises:
        TypeError: If any combined value is not a string.
    """
    source_cols = [col1, col2, col3, col4, col5, col6]
    combined = [" ".join(parts) for parts in zip(*(df[name] for name in source_cols))]
    return df.drop(source_cols, axis=1).assign(**{new_col: combined})

def replace_nulls(df, replace_by):
    """Return a copy of ``df`` with every missing value replaced.

    Args:
        df: DataFrame to fill; it is not modified.
        replace_by: Value passed to ``fillna`` as the replacement, applied to
            every column.

    Returns:
        The frame produced by ``df.fillna``.
    """
    filled = df.fillna(value=replace_by)
    return filled

In [3]:
def preprocess_data(df):
    """Run the preprocessing steps on a raw frame and return the result.

    Steps, in order:
      1. Replace 'PostDateTime' and 'WhenAccountMade' with 'accountAge'.
      2. Shift the 'Karma' column via ``make_col_positive``.
      3. Fill every missing value in the frame (all columns) with "~".
      4. Merge the heading, main text and four subject columns into a single
         'PostText' column.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        The preprocessed DataFrame.
    """
    text_columns = (
        'Heading', 'MainText', 'PrimarySubject',
        'SecondarySubject', 'TertiarySubject', 'OtherSubject',
    )
    return (
        df.pipe(calc_account_age, 'PostDateTime', 'WhenAccountMade')
        .pipe(make_col_positive, 'Karma')
        .pipe(replace_nulls, "~")
        .pipe(combine_cols, *text_columns, 'PostText')
    )

In [4]:
# Load the training CSV, remove exact duplicate rows, preprocess it and
# discard the 'Qid' column before saving.
train_df = (
    pd.read_csv(TRAIN_FILE)
    .drop_duplicates()
    .pipe(preprocess_data)
    .drop(columns='Qid')
)

# Persist the prepared training frame to the working directory.
train_df.to_pickle('train_pickle.pkl')

In [5]:
# Load the test CSV and apply the same preprocessing as the training data.
test_df = preprocess_data(pd.read_csv(TEST_FILE))

# Keep the 'Qid' values aside before the column is removed from the features.
qids = test_df['Qid']

# DataFrame.to_pickle() returns None, so chaining it onto the assignment would
# rebind test_df to None. Drop the column first, then write the pickle.
test_df = test_df.drop('Qid', axis=1)
test_df.to_pickle('test_pickle.pkl')

# Save the 'Qid' values on their own, in a single-column frame.
soln_df = pd.DataFrame({'Qid': qids})
soln_df.to_pickle('soln_pickle.pkl')