# AI Community @ Семинар №3
## Домашнее задание 2
### Базовое решение конкурса

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
# Load the training and test tables; the Timestamp column is parsed into datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Timestamp'])
    for csv_path in ('data/data-train.csv', 'data/data-test.csv')
)

In [3]:
# Preview the first three rows of the training table.
train.head(3)

Unnamed: 0,ID,Timestamp,Location,Category
0,152187,2016-03-30 12:18:52.537,8,0
1,322276,2016-09-07 17:43:37.960,6,1
2,306252,2016-08-28 09:41:10.850,1,1


In [4]:
# Preview the first three rows of the test table.
test.head(3)

Unnamed: 0,ID,Location,Timestamp
0,662032,6,2017-08-25 12:02:07.603
1,527522,6,2017-03-11 08:39:41.213
2,443572,6,2016-12-11 13:30:50.660


In [5]:
# Extract the hour and minute components from each split's timestamps.
train_hour, train_minute = train['Timestamp'].dt.hour, train['Timestamp'].dt.minute
test_hour, test_minute = test['Timestamp'].dt.hour, test['Timestamp'].dt.minute

In [6]:
# Baseline design matrices: one column per extracted time component.
train_features = pd.DataFrame(dict(hour=train_hour, minute=train_minute))
test_features = pd.DataFrame(dict(hour=test_hour, minute=test_minute))

In [7]:
# Fit a linear regression on the baseline features and predict the test set.
clf = LinearRegression().fit(train_features, train['Category'])
regression_result = clf.predict(test_features)

# Turn the continuous outputs into 0/1 labels: 1 where the prediction is
# strictly above the median of all test predictions, 0 otherwise.
predicted_categories = np.greater(regression_result, np.median(regression_result)).astype(int)

In [8]:
# Save the predictions: one row per test ID with its predicted category.
pd.DataFrame(
    dict(id=test['ID'], category=predicted_categories)
).to_csv('submission.csv', index=False)

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDRegressor

In [10]:
def my_metric(y_true, y_pred):
    """Accuracy of ``y_pred`` after binarising it around its own median.

    Predictions strictly greater than the median of ``y_pred`` become 1,
    all others become 0; the result is then compared to ``y_true`` with
    ``accuracy_score``.
    """
    cutoff = np.median(y_pred)
    binarised = (y_pred > cutoff).astype(int)
    return accuracy_score(y_true, binarised)

## Kaggle 0.68 score solution

In [11]:
# Hold out roughly 20% of the labelled rows for local validation.
# The mask is drawn from a locally seeded RandomState so that the split (and
# every score computed from it) is identical on each run, without touching
# NumPy's global random state.
msk = np.random.RandomState(42).rand(len(train)) < 0.8
trn = train[msk]
tst = train[~msk]

# Features for this experiment: calendar month and the raw Location value.
trn_feat = pd.DataFrame({
    'month': trn.Timestamp.dt.month,
    'location': trn.Location
})
tst_feat = pd.DataFrame({
    'month': tst.Timestamp.dt.month,
    'location': tst.Location
})

In [12]:
# Fit on the training part and measure accuracy on the held-out part.
# Scores noted for the estimators tried in this cell:
#   LinearRegression()                                           -> ~0.69
#   LogisticRegression()                                         -> ~0.67
#   LogisticRegression(class_weight={0: 0.605649, 1: 0.394351})  -> ~0.63
#   SGDRegressor()                                               -> ~0.63-0.69
reg = LinearRegression().fit(trn_feat, trn['Category'])
res = reg.predict(tst_feat)

# Binarise around the median prediction
# (commented-out alternative in the original cell: pred = res).
pred = np.greater(res, np.median(res)).astype(int)

accuracy_score(tst['Category'], pred)

0.69710424129315796

In [13]:
# Cross-validate the same estimator on the training part, scoring each fold
# with my_metric (median-thresholded accuracy).
# NOTE(review): no `cv` is passed, so the number of folds is the library
# default; the recorded output of this cell contains three scores.
cross_val_score(reg, trn_feat, trn.Category, scoring=make_scorer(my_metric))

array([ 0.69846359,  0.69695875,  0.69844889])

## Kaggle 0.62 score solution

In [14]:
import itertools as it
def powerset(iterable):
    """Return an iterator over all subsets of ``iterable``, as tuples.

    Subsets are produced in order of increasing size, starting with the
    empty tuple:

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    pool = tuple(iterable)
    subsets_by_size = (it.combinations(pool, size) for size in range(len(pool) + 1))
    return it.chain.from_iterable(subsets_by_size)

# Every non-empty combination of the candidate feature names
# (the only empty subset is the first one powerset yields, and it is filtered out).
features = [
    combo
    for combo in powerset([
        'time', 'day', 'hour', 'id', 'location', 'minute',
        'minutes', 'month', 'months', 'sec', 'year',
    ])
    if combo
]

In [15]:
def _all_candidate_features(frame):
    """Build the full candidate-feature table for one part of the data.

    Reads the ``Timestamp``, ``Location`` and ``ID`` columns of ``frame`` and
    returns a new DataFrame with one column per candidate feature name,
    indexed like ``frame``.
    """
    stamp = frame.Timestamp
    return pd.DataFrame({
        # Integer form of the timestamp, integer-divided by 10**6
        # (milliseconds if the underlying datetime unit is nanoseconds).
        'time': stamp.astype(np.int64) // 10 ** 6,
        'hour': stamp.dt.hour,
        'minute': stamp.dt.minute,
        # Minutes elapsed since midnight.
        'minutes': stamp.dt.hour * 60 + stamp.dt.minute,
        'day': stamp.dt.day,
        'month': stamp.dt.month,
        # Running month count, so that months of different years differ.
        'months': stamp.dt.year * 12 + stamp.dt.month,
        'sec': stamp.dt.second,
        'year': stamp.dt.year,
        'location': frame.Location,
        'id': frame.ID
    })


# Hold out roughly 20% of the labelled rows for local validation.
# The mask comes from a locally seeded RandomState so the split is identical
# on every run and NumPy's global random state is left untouched.
msk = np.random.RandomState(42).rand(len(train)) < 0.8
trn = train[msk]
tst = train[~msk]

# Same feature construction for both parts, via the shared helper.
trn_feat = _all_candidate_features(trn)
tst_feat = _all_candidate_features(tst)

In [16]:
# Score every feature combination with the same fit / median-threshold recipe;
# comb_feat[i] is the hold-out accuracy obtained with features[i].
# Best results noted for the estimators tried in this loop:
#   LinearRegression(n_jobs=4)
#       -> ~0.72 with ('time', 'day', 'location', 'minute', 'month', 'months')
#          or ('time', 'day', 'location', 'month', 'months')
#   LogisticRegression(n_jobs=4)
#       -> ~0.67 with ('day', 'location', 'minute', 'month', 'months')
#   LogisticRegression(n_jobs=4, class_weight={0: 0.605649, 1: 0.394351})
#       -> ~0.64 with ('location', 'minute', 'minutes', 'month', 'year')
#   SGDRegressor()
#       -> ~0.69 with ('location', 'month')
comb_feat = []
for fs in features:
    rg = LinearRegression(n_jobs=4).fit(trn_feat.loc[:, fs], trn['Category'])
    rs = rg.predict(tst_feat.loc[:, fs])
    # Binarise around the median prediction
    # (commented-out alternative in the original cell: prd = rs).
    prd = np.greater(rs, np.median(rs)).astype(int)
    comb_feat.append(accuracy_score(tst['Category'], prd))

In [17]:
# Report the best-scoring combination: its position, its accuracy, its features.
amax = np.argmax(comb_feat)
print(amax, comb_feat[amax])
print(features[amax])

1118 0.728304375853
('time', 'day', 'location', 'minute', 'month', 'months')


In [18]:
# Re-fit on a five-feature subset and cross-validate it on the training part.
f = ['time', 'day', 'location', 'month', 'months']

reg = LinearRegression().fit(trn_feat.loc[:, f], trn['Category'])
res = reg.predict(tst_feat.loc[:, f])
# Hold-out predictions binarised around their median.
pred = np.greater(res, np.median(res)).astype(int)

cross_val_score(reg, trn_feat.loc[:, f], trn['Category'], scoring=make_scorer(my_metric))

array([ 0.72650123,  0.7247836 ,  0.72582322])

### "Просто заванхотил все фичи" © Николай Прокопцев

In [19]:
# One-hot encode location and month for the full training set; the integer
# timestamp stays as a single numeric column.
trn_feat = pd.get_dummies(pd.DataFrame({
    'time': train.Timestamp.astype(np.int64) // 10 ** 6,
    'month': train.Timestamp.dt.month,
    'location': train.Location,
}), columns=['location', 'month'])

# Encode the test set the same way, then align it to the training columns.
# Reindexing replaces the former hard-coded `.drop(['location_9', 'location_11'], 1)`:
#   * dummy columns that occur only in the test data are discarded;
#   * dummy columns that occur only in the training data are added as zeros;
#   * column order is guaranteed to match the matrix the model is fitted on.
# It also avoids passing `axis` positionally to `drop`, which pandas >= 2.0 rejects.
tst_feat = pd.get_dummies(pd.DataFrame({
    'time': test.Timestamp.astype(np.int64) // 10 ** 6,
    'month': test.Timestamp.dt.month,
    'location': test.Location
}), columns=['location', 'month']).reindex(columns=trn_feat.columns, fill_value=0)

In [20]:
# Train on the full labelled set with the one-hot features and predict the test set.
# Scores noted for the estimators tried in this cell:
#   LinearRegression()   -> ~0.80 with ['time' 'month' 'location'], (month, location) one-hotted
#   LogisticRegression() -> ~0.81 with ['day', 'month', 'year', 'location'], all one-hotted
reg = LinearRegression().fit(trn_feat, train['Category'])
res = reg.predict(tst_feat)

# Binarise around the median prediction
# (commented-out alternative in the original cell: pred = res).
pred = np.greater(res, np.median(res)).astype(int)

In [21]:
# Five-fold cross-validation of the estimator on the full one-hot training
# matrix, scoring each fold with my_metric (median-thresholded accuracy).
cross_val_score(reg, trn_feat, train.Category, cv=5, scoring=make_scorer(my_metric))
# Alternative kept for reference: the same call without the custom scorer.
# cross_val_score(reg, trn_feat, train.Category, cv=5)

array([ 0.80568349,  0.81040971,  0.81310395,  0.80345585,  0.81335701])