# Pre-process the click-fraud data into libFFM format.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Compact integer dtypes handed to pd.read_csv for the click CSV columns.
dtypes = dict(
    ip="uint32",
    app="uint16",
    device="uint16",
    os="uint16",
    channel="uint16",
    is_attributed="uint8",
)
# Positional indices of the training columns to load: 0 through 7, skipping 6.
good_cols = [position for position in range(8) if position != 6]

In [None]:
raw_test = pd.read_csv("data/test.csv.zip",
                        index_col=0,
                        dtype=dtypes,
                        infer_datetime_format=True,
                        parse_dates=["click_time"])


# Date stuff.
raw_test["year"] = raw_test.click_time.dt.year.astype("uint8")
raw_test["month"] = raw_test.click_time.dt.month.astype("uint8")
raw_test["day"] = raw_test.click_time.dt.day.astype("uint8")
raw_test["dayofweek"] = raw_test.click_time.dt.dayofweek.astype("uint8")
raw_test["hour"] = raw_test.click_time.dt.hour.astype("uint8")
raw_test["minute"] = raw_test.click_time.dt.minute.astype("uint8")
raw_test["second"] = raw_test.click_time.dt.second.astype("uint8")

raw_test.drop(columns="click_time", inplace=True)

raw_test.head()

In [None]:
raw_train = pd.read_csv("data/train.csv.zip",
                        nrows=25000000,
                        usecols=good_cols,
                        dtype=dtypes,
                        infer_datetime_format=True,
                        parse_dates=["click_time"])

# Date stuff.
raw_train["year"] = raw_train.click_time.dt.year.astype("uint8")
raw_train["month"] = raw_train.click_time.dt.month.astype("uint8")
raw_train["day"] = raw_train.click_time.dt.day.astype("uint8")
raw_train["dayofweek"] = raw_train.click_time.dt.dayofweek.astype("uint8")
raw_train["hour"] = raw_train.click_time.dt.hour.astype("uint8")
raw_train["minute"] = raw_train.click_time.dt.minute.astype("uint8")
raw_train["second"] = raw_train.click_time.dt.second.astype("uint8")

raw_train.drop(columns="click_time", inplace=True)

print(raw_train.groupby("is_attributed")["is_attributed"].sum())

# Re-order the training set columns
new_cols = list(raw_test.columns)
new_cols.append("is_attributed")
raw_train = raw_train[new_cols]

raw_train.head()

In [None]:
# Print the deep memory footprint of both frames, training frame first.
for frame in (raw_train, raw_test):
    frame.info(memory_usage="deep")

In [None]:
# Build the lookup needed for the libFFM text format.
# based on: https://gist.github.com/NhuanTDBK/14989f19f450c8ad675d52e8452517ad

# Every distinct (column, value) pair seen in either frame gets its own
# sequential number. The dictionary key is hash(column name + str(value)).
hash_dict = {}

offset = 0
for field in raw_test.columns:
    # Union of the values this column takes in the test and training frames.
    levels = set(np.concatenate((raw_test[field].unique(),
                                 raw_train[field].unique())))

    # Continue numbering where the previous column stopped.
    for number, level in enumerate(levels, start=offset):
        hash_dict[hash(field + str(level))] = number
    offset += len(levels)

# The next cell uses this mapping to rewrite every train/test row.

In [None]:
# With hash_dict built, write both frames out as libFFM text files.
trainfile = "train_libffm.txt"
testfile = "test_libffm.txt"


def _ffm_features(record):
    """Return the space-joined ``field:feature:1`` triples for one row.

    The field number is the column's position in ``raw_test.columns``; the
    feature number is looked up in ``hash_dict`` under
    ``hash(column name + str(value))``.
    """
    return " ".join(
        "{}:{}:1".format(slot, hash_dict[hash(column + str(getattr(record, column)))])
        for slot, column in enumerate(raw_test.columns)
    )


# Row-by-row iteration over both frames: this is going to be slow.
with open(trainfile, "w") as training_output_file:
    for record in raw_train.itertuples():
        # Each training line starts with the row's is_attributed label.
        label = str(record.is_attributed)
        training_output_file.write(label + " " + _ffm_features(record) + "\n")


with open(testfile, "w") as testing_output_file:
    for record in raw_test.itertuples():
        # Test lines are written with the constant label "1".
        testing_output_file.write("1 " + _ffm_features(record) + "\n")