In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import mean_squared_error, make_scorer
from preprocessing.data_preparation import read_data
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold


def merge_data(input_path, items_path, output_path):
    """Return the train table left-joined with the items table on ``pid``.

    The merged frame is cached as a pickle at ``output_path``. When that file
    already exists it is loaded and returned straight away, so the two input
    files are only read when the cache actually has to be built.

    Args:
        input_path: path passed to ``read_data`` for the train table.
        items_path: path passed to ``read_data`` for the items table.
        output_path: location of the pickle cache (read if present, written
            otherwise).

    Returns:
        The merged DataFrame.
    """
    output = Path(output_path)
    # Check the cache first: re-reading both inputs just to discard them is wasted work.
    # NOTE: unpickling executes arbitrary code; only load caches written by this project.
    if output.is_file():
        return pd.read_pickle(output_path)

    tdf = read_data(input_path)
    idf = read_data(items_path)
    print('data read successfully!')
    # A single left join is enough; joining the result back onto a copy of the
    # train table only repeated the work and multiplied any duplicated rows.
    train_merged = tdf.merge(idf, how='left', on='pid', sort=False)
    pd.to_pickle(train_merged, output_path)
    return train_merged


def extract_numbers_from_content(input):
    """Split a content string on its first two ``'X'`` characters into a 3-tuple.

    * ``'AXBXC'`` -> ``('A', 'B', 'C')`` (anything after the second ``'X'``
      stays in the last element, further ``'X'`` characters included)
    * ``'AXB'``   -> ``(1, 'A', 'B')``
    * ``'A'``     -> ``(1, 1, 'A')``

    Two literal values are special-cased: ``'PAK'`` gives ``(1, 1, 1)`` and
    ``'L   125'`` gives ``(1, 1, 125)``.

    The pieces cut out of the string are returned as strings; only the filler
    values are ints.
    """
    # Hard-coded special case for one literal value.
    if input == 'L   125':
        return 1, 1, 125

    first, separator, remainder = input.partition('X')
    if not separator:
        # No 'X' at all: a single amount, or the literal 'PAK'.
        return (1, 1, 1) if input == 'PAK' else (1, 1, input)

    middle, separator, last = remainder.partition('X')
    if not separator:
        # Exactly one 'X': two parts.
        return 1, first, remainder
    return first, middle, last


# Multiplier applied to ``content_3`` for each ``unit`` code.
# NOTE(review): 'ST' (6350) and 'P' (454) look like stone/pound-to-gram
# factors -- confirm that is what these unit codes mean in the data.
unit_map = dict(
    KG=1000,
    ST=6350,
    P=454,
    M=100,
    L=1000,
    G=1,
    CM=1,
    ML=1,
)


def unit_converter(row):
    """Scale the row's ``content_3`` by the factor registered for its ``unit``.

    ``row`` is anything indexable by ``'content_3'`` and ``'unit'``.
    A unit code that is not a key of ``unit_map`` raises ``KeyError``.
    """
    amount = row['content_3']
    factor = unit_map[row['unit']]
    return amount * factor


def prepare_dataset():
    """Build the feature-engineered dataset, or load it from its pickle cache.

    When ``data/unit_fixed.pkl`` does not exist, the merged train/items frame
    is obtained from ``merge_data`` and then:

    * a ``count`` column is added as ``revenue / price``;
    * ``pharmForm`` is upper-cased and replaced by one dummy column per value;
    * ``content`` is replaced by numeric ``content_1``/``content_2``/``content_3``
      columns produced by ``extract_numbers_from_content``;
    * ``content_3`` is scaled by ``unit_converter`` and the ``unit`` labels are
      renamed accordingly (``KG``/``ST``/``P`` -> ``G``, ``L`` -> ``ML``,
      ``M`` -> ``CM``).

    The result is written to the same cache file that is checked and read.

    Returns:
        The prepared DataFrame.
    """
    output = Path('data/unit_fixed.pkl')
    if not output.is_file():
        # example of using merge_data function for train dataset
        mrg = merge_data('data/train.csv', 'data/items.csv', 'data/train_merged.pkl')

        # add count feature (revenue/price)
        mrg['count'] = mrg.revenue / mrg.price

        # uppercase all pharmForm values
        mrg['pharmForm'] = mrg['pharmForm'].str.upper()
        # extract pharmForm values as binary feature and adding them to dataset
        mrg = pd.concat([mrg, pd.get_dummies(mrg['pharmForm'])], axis=1)
        # axis is passed by keyword: the positional form is rejected by newer pandas
        mrg = mrg.drop('pharmForm', axis=1)

        # split count of packs and amount of each to separate columns
        content_columns = ['content_1', 'content_2', 'content_3']
        extracted_numbers = mrg['content'].apply(extract_numbers_from_content)
        extracted_numbers = pd.DataFrame(extracted_numbers.tolist(), columns=content_columns,
                                         index=extracted_numbers.index)
        for column in content_columns:
            extracted_numbers[column] = pd.to_numeric(extracted_numbers[column])
        mrg = pd.concat([mrg, extracted_numbers], axis=1)
        mrg = mrg.drop('content', axis=1)

        mrg['content_3'] = mrg.apply(unit_converter, axis=1)
        mapping = {'KG': 'G', 'ST': 'G', 'P': 'G', 'L': 'ML', 'M': 'CM'}
        mrg = mrg.replace({'unit': mapping})
        # Write to the path that is checked above; the cache used to be written
        # to '../data/unit_fixed.pkl' and was therefore never found again.
        pd.to_pickle(mrg, str(output))
        print('units converted')
    else:
        mrg = pd.read_pickle(str(output))

    # fill campaignIndex with D and then get dummy binary values of each category index
    # mrg['campaignIndex'].fillna('D', inplace=True)
    # mrg = pd.concat([mrg, pd.get_dummies(mrg['campaignIndex'])], axis=1)

    # mrg = pd.concat([mrg, pd.get_dummies(mrg['group'])], axis=1)
    return mrg


def predict_competitor(all_data):
    """Cross-validate an ``XGBRegressor`` that predicts ``competitorPrice``.

    Only rows with a known ``competitorPrice`` are used. Every other column of
    ``all_data`` is passed to the model as a feature, and the model is scored
    with 10-fold cross-validation using mean squared error.

    NOTE(review): all remaining columns are passed through unchanged --
    confirm they are numeric before calling this.

    Args:
        all_data: DataFrame containing a ``competitorPrice`` column.

    Returns:
        The array of per-fold mean squared errors (also printed).
    """
    train = all_data[pd.notnull(all_data['competitorPrice'])]
    kf = KFold(n_splits=10)
    estimator = XGBRegressor()
    x = train.drop('competitorPrice', axis=1)
    # The target column name was misspelled ('competitorPri1ce'), which raised KeyError.
    y = train['competitorPrice']
    scores = cross_val_score(estimator,
                             x,
                             y,
                             cv=kf,
                             scoring=make_scorer(mean_squared_error))
    print(scores)
    return scores


# Build (or load from cache) the prepared dataset; the cells below work on `data`.
data = prepare_dataset()
# Disabled experiment: Pearson correlation between `category` and `count`.
# from scipy.stats import pearsonr
#
# print(data['category'].fillna(0))
# print(pearsonr(data['category'].fillna(0), data['count']))


# TODO handle features: category, group, competitor
# TODO Random Forest on server

In [26]:
# lineIDs of the rows without a campaignIndex, and of the rows with adFlag == 0
campaign_missing = data.loc[data['campaignIndex'].isnull(), 'lineID']
adFlag_missing = data.loc[data['adFlag'].eq(0), 'lineID']

In [26]:
# Sizes of the two lineID selections, one per line
print(len(campaign_missing), len(adFlag_missing), sep='\n')

2287968


1880176


In [29]:
# lineIDs that appear in both selections (missing campaignIndex and adFlag == 0)
intersections = pd.Series(list(set(campaign_missing) & set(adFlag_missing)))
print(len(intersections))

1651555


In [37]:
# To be filled with similar product campaign Index
from sklearn.linear_model import LogisticRegression

# Rows whose campaignIndex is known are the training set for the classifier.
train_data = data[data['campaignIndex'].notnull()]
lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
lr.fit(train_data[['pid', 'manufacturer']], train_data['campaignIndex'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [38]:
# Predicting the 600K part
pred = lr.predict(data.loc[data['campaignIndex'].isnull(), ['pid', 'manufacturer']])

In [41]:
# Sanity check: predict on the rows whose campaignIndex is already known
pred = lr.predict(data.loc[data['campaignIndex'].notnull(), ['pid', 'manufacturer']])
print(pd.Series(pred).unique())
# Not good enough, trying naive bayes

['B']


In [31]:
# These are lineIDs with missing campaignIndex and adFlag=0
print(len(intersections))
# Boolean row mask over `data` selecting those lineIDs
ind = data['lineID'].isin(intersections.tolist())
print(len(data.loc[ind, 'campaignIndex']))
print(data['campaignIndex'])

1651555


1651555
0          NaN
1            C
2          NaN
3          NaN
4          NaN
5          NaN
6          NaN
7          NaN
8          NaN
9          NaN
10           C
11         NaN
12         NaN
13         NaN
14         NaN
15           A
16           B
17         NaN
18         NaN
19         NaN
20         NaN
21         NaN
22         NaN
23         NaN
24           B
25         NaN
26         NaN
27         NaN
28         NaN
29         NaN
          ... 
2755973    NaN
2755974    NaN
2755975    NaN
2755976    NaN
2755977    NaN
2755978    NaN
2755979    NaN
2755980    NaN
2755981    NaN
2755982    NaN
2755983    NaN
2755984    NaN
2755985    NaN
2755986    NaN
2755987      B
2755988    NaN
2755989    NaN
2755990    NaN
2755991      B
2755992    NaN
2755993    NaN
2755994    NaN
2755995    NaN
2755996    NaN
2755997    NaN
2755998    NaN
2755999    NaN
2756000    NaN
2756001      A
2756002      A
Name: campaignIndex, dtype: object


In [32]:
# To be filled with D
# Assign through .loc on the frame itself. The previous chained
# `data['campaignIndex'].fillna(..., inplace=True)` acted on an intermediate
# object, which newer pandas warns about and may not write back to `data`.
# Only rows selected by `ind` that are still missing a campaignIndex get 'D'.
data.loc[ind & data['campaignIndex'].isnull(), 'campaignIndex'] = 'D'
print(data['campaignIndex'])

0            D
1            C
2            D
3          NaN
4            D
5            D
6          NaN
7            D
8            D
9          NaN
10           C
11         NaN
12           D
13           D
14           D
15           A
16           B
17           D
18           D
19           D
20           D
21           D
22           D
23           D
24           B
25           D
26           D
27           D
28         NaN
29           D
          ... 
2755973      D
2755974      D
2755975      D
2755976      D
2755977      D
2755978    NaN
2755979      D
2755980      D
2755981      D
2755982      D
2755983      D
2755984      D
2755985    NaN
2755986      D
2755987      B
2755988      D
2755989      D
2755990      D
2755991      B
2755992      D
2755993      D
2755994      D
2755995      D
2755996      D
2755997    NaN
2755998      D
2755999      D
2756000      D
2756001      A
2756002      A
Name: campaignIndex, dtype: object


In [34]:
# Filling the rest with naive bayes prediction
# Rows still missing a campaignIndex are to be predicted; the others are used for training.
test_data = data[data['campaignIndex'].isnull()]
train_data = data[data['campaignIndex'].notnull()]

In [39]:
print(len(train_data), len(test_data), sep='\n')
from sklearn.naive_bayes import GaussianNB

# Fit on the rows with a known campaignIndex, then predict it for the rest.
naive_bayes_clf = GaussianNB()
naive_bayes_clf.fit(X=train_data[['pid', 'manufacturer', 'rrp']],
                    y=train_data['campaignIndex'])
predictions = naive_bayes_clf.predict(X=test_data[['pid', 'manufacturer', 'rrp']])

2119590
636413


In [57]:
# Write the predictions back onto the rows they were made for.
# `.ix` no longer exists in current pandas; `.loc` accepts the same
# boolean-mask / column-label selection.
# NOTE(review): assumes lineID is unique, so the mask selects exactly
# len(predictions) rows in the same order as test_data -- confirm.
data.loc[data['lineID'].isin(test_data['lineID']), 'campaignIndex'] = predictions
print(len(data[data['campaignIndex'].isnull()]))
# campaignIndex filled completely

0
