# Model to attempt to detect click fraud.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

In [None]:
# Explicit unsigned-integer dtypes for the columns read from the CSV files.
dtypes = {"ip": "uint32",
          "app": "uint16",
          "device": "uint16",
          "os": "uint16",
          "channel": "uint16",
          "is_attributed": "uint8"}
# Positional columns to load from the training file; index 6 is skipped.
good_cols = [0, 1, 2, 3, 4, 5, 7]

# Point this at "data/train_sample.csv.zip" for a quick run on the sample file.
train_path = "data/train.csv.zip"

raw_train = pd.read_csv(train_path,
                        nrows=60000000,
                        usecols=good_cols,
                        dtype=dtypes,
                        infer_datetime_format=True,
                        parse_dates=["click_time"])

# Replace click_time with whole seconds since the Unix epoch. Subtracting the
# epoch and floor-dividing by one second gives the same int64 result whatever
# resolution the parsed datetimes carry, unlike casting to int64 and dividing
# by 10 ** 9, which is only correct for nanosecond data.
train_click_time = raw_train.pop("click_time")
raw_train["timestamp"] = (
    (train_click_time - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
)
del train_click_time

# Class balance: number of rows per label value. (Summing the label within
# each label group always reports 0 for the negative class.)
print(raw_train["is_attributed"].value_counts().sort_index())
raw_train.head()

In [None]:
# Load the test set; the first CSV column becomes the frame's index and is
# reused later as the index of the scored output.
raw_test = pd.read_csv("data/test.csv.zip",
                       index_col=0,
                       dtype=dtypes,
                       infer_datetime_format=True,
                       parse_dates=["click_time"])

# Replace click_time with whole seconds since the Unix epoch. Subtracting the
# epoch and floor-dividing by one second is independent of the datetime
# resolution, unlike casting to int64 and dividing by 10 ** 9 (which is only
# correct for nanosecond data).
test_click_time = raw_test.pop("click_time")
raw_test["timestamp"] = (
    (test_click_time - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
)
del test_click_time
raw_test.head()

In [None]:
# Print column dtypes and the "deep" memory footprint of each frame,
# training data first, then test data.
for frame in (raw_train, raw_test):
    frame.info(memory_usage="deep")

# Have a go at some XGBoost stuff!

In [None]:
# Model data
# Pull out the label vector and build the feature frame WITHOUT mutating
# raw_train, so this cell can be re-run without a KeyError on "is_attributed".
labels = raw_train["is_attributed"].values
train_features = raw_train.drop(columns="is_attributed")

# Every feature column is already integer-typed, so pd.get_dummies would only
# copy the frames unchanged (it encodes object/string/category columns only).
# The frames are passed to XGBoost directly; instead we check explicitly that
# train and test expose the same features in the same order.
assert list(train_features.columns) == list(raw_test.columns), (
    "train/test feature columns differ: "
    f"{list(train_features.columns)} vs {list(raw_test.columns)}"
)

dtrain = xgb.DMatrix(train_features, label=labels)
dtest = xgb.DMatrix(raw_test)

# The temporary feature copy is no longer needed once the DMatrix exists.
del train_features

In [None]:
# Drop the training frame from the namespace (presumably to release memory
# now that it is no longer referenced by later cells).
del raw_train

In [None]:
# Parameters
# Booster settings shared by the cross-validation run and the final fit.
# NOTE(review): newer XGBoost releases appear to replace `silent` with
# `verbosity` — confirm against the installed version.
params = dict(
    booster="gbtree",
    silent=0,
    eta=0.2,
    eval_metric="auc",
    objective="binary:logistic",
)

# Number of boosting rounds used for both CV and training.
num_rounds = 100

In [None]:
# Cross-validation test
# 3-fold CV on the training matrix with the shared booster parameters;
# evaluation progress is reported every 20 rounds.
cv_settings = dict(num_boost_round=num_rounds, nfold=3, verbose_eval=20)
cv_results = xgb.cv(params, dtrain, **cv_settings)

In [None]:
# Fit the final booster on the full training matrix, using the same
# parameters and round count as the cross-validation run.
fraud_model = xgb.train(params=params,
                        dtrain=dtrain,
                        num_boost_round=num_rounds)

In [None]:
# Score every row of the test matrix with the trained booster.
result = fraud_model.predict(data=dtest)

In [None]:
# One-column frame of scores, indexed like the test frame so each score
# stays attached to its originating test row.
scored_output = pd.Series(result,
                          index=raw_test.index,
                          name="is_attributed").to_frame()
scored_output.head()

In [None]:
# Write out for submission
# The scores are saved as a gzip-compressed CSV, index included.
submission_file = "talking_data.gz"
scored_output.to_csv(submission_file, compression="gzip")