# AI Community Innopolis #1, Fall 2018
### Contest 1: Predict which items the customer will buy

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools as it

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

np.random.seed(42)
sns.set()
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Let's read the data.

In [3]:
# The three receipt tables are read with `rec_date` parsed as datetimes,
# so the option is declared once and reused for each of them.
date_options = {'parse_dates': ['rec_date']}

train_data = pd.read_csv('data/train_data.csv', **date_options)
test_data = pd.read_csv('data/test_data.csv', **date_options)
recipts_history = pd.read_csv('data/receipts_history.csv', **date_options)

# categories.csv is read with default parsing (no date column is requested).
categories = pd.read_csv('data/categories.csv')

In [4]:
# Show the first two rows of the training table to check the loaded columns.
train_data.head(2)

Unnamed: 0,user_id,shop_geo_lat,shop_geo_lon,rec_date,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,...,cat_15,cat_16,cat_17,cat_18,cat_19,cat_20,cat_21,cat_22,cat_23,cat_24
0,21303,50.576368,36.582109,2018-09-06 11:49:00,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,18137,59.414255,56.85488,2018-07-31 13:13:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Show the first three rows of the test table to check the loaded columns.
test_data.head(3)

Unnamed: 0,user_id,shop_geo_lat,shop_geo_lon,rec_date
0,21303,50.62755,36.568998,2018-09-06 17:01:00
1,22638,59.724899,30.406318,2018-09-05 21:07:00
2,21309,58.977102,43.1341,2018-08-16 12:59:00


In [6]:
# Show the first three rows of the receipts history table.
recipts_history.head(3)

Unnamed: 0,user_id,rec_id,shop_geo_lat,shop_geo_lon,rec_date,rec_total,cat_0,cat_1,cat_2,cat_3,...,cat_15,cat_16,cat_17,cat_18,cat_19,cat_20,cat_21,cat_22,cat_23,cat_24
0,21303,32606,50.560819,36.569266,2018-08-21 11:24:00,2399000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21303,32605,50.62755,36.568998,2018-08-22 17:11:00,15700,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,21303,32603,50.555208,36.561561,2018-08-23 09:02:00,100000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Targets are the columns present in the train table but absent from the test
# table. They are collected by walking train_data.columns (not via a set
# difference) so the target order equals the train file's column order on
# every run; iteration order of a set of strings can differ between
# interpreter sessions, which would reshuffle y_train and the submission.
test_columns = set(test_data.columns)
target_column_names = [column for column in train_data.columns
                       if column not in test_columns]

X_train = train_data.drop(columns=target_column_names)
X_test = test_data

y_train = train_data[target_column_names]

# Hold out 30% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      test_size=0.3,
                                                      random_state=42)

In [None]:
# Show the first three rows of the training features produced by the split.
X_train.head(3)

---
#### EDA
Let's explore the `rec_date`, `shop_geo_lat` and `shop_geo_lon` columns.

In [None]:
# Stack the train, validation and test feature frames row-wise into a single
# frame used by the exploratory plots.
X = pd.concat((X_train, X_valid, X_test), axis=0)

In [None]:
# Scatter plot of shop coordinates. The axis limits crop the view to
# longitude 25 to 70 and latitude 42 to 70.
fig, ax = plt.subplots(figsize=(11, 6))
ax.scatter(X.shop_geo_lon, X.shop_geo_lat, s=3, alpha=0.5)
ax.set_xlim([25, 70])
ax.set_ylim([42, 70])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude');  # trailing ';' suppresses the text repr in the notebook

In [None]:
# Keep only receipts dated after the "2017-09" cutoff.
rec_dates = X.query('rec_date > "2017-09"')['rec_date']

# One histogram bin per distinct (month, day) pair among the kept dates.
month_day_pairs = set(zip(rec_dates.dt.month, rec_dates.dt.day))

rec_dates.hist(bins=len(month_day_pairs), figsize=(13, 5));

---
#### Preprocessing

In [None]:
def generate_features(df):
    """Return a copy of ``df`` with ``rec_date`` replaced by numeric time features.

    The input frame is left untouched: the new columns are built on the
    frame returned by ``drop`` instead of being written into ``df``. This
    avoids silently modifying a frame the caller still holds (or a slice of
    another frame, which can trigger ``SettingWithCopyWarning``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``rec_date`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``rec_date`` and with these columns appended, in order:
        ``rec_hours``, ``rec_minutes``, ``rec_minutes_from_midnight``,
        ``rec_month``, ``rec_dayofweek`` and ``timestamp`` (whole seconds
        elapsed since 1970-01-01).
    """
    rec_date = df['rec_date']
    hours = rec_date.dt.hour
    minutes = rec_date.dt.minute

    return df.drop(columns='rec_date').assign(
        rec_hours=hours,
        rec_minutes=minutes,
        rec_minutes_from_midnight=hours * 60 + minutes,
        rec_month=rec_date.dt.month,
        rec_dayofweek=rec_date.dt.dayofweek,
        # Floor-divide the offset from 1970-01-01 by one second to get an
        # integer number of seconds.
        timestamp=(rec_date - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s'),
    )

In [None]:
# Run the same feature generation over each of the three splits.
X_train, X_valid, X_test = map(generate_features,
                               (X_train, X_valid, X_test))

In [None]:
# Show the first three rows of the training features after feature generation.
X_train.head(3)

---
#### Prediction

In [None]:
# Random forest with 300 trees, fitted on all available cores (n_jobs=-1)
# with a fixed seed.
regressor = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the forest on the training features against the training target columns.
regressor.fit(X_train, y_train)

In [None]:
# Validation MSE. scikit-learn's signature is (y_true, y_pred), so the
# held-out targets go first and the model's predictions second.
mean_squared_error(y_valid, regressor.predict(X_valid))

#### Create submission

In [None]:
# Build the prediction frame on X_test's own index so that the user_id
# assignment below lines up row-for-row regardless of how X_test is indexed
# (a frame with a default 0..n-1 index would otherwise be aligned by label).
prediction = pd.DataFrame(regressor.predict(X_test),
                          columns=target_column_names,
                          index=X_test.index)
prediction['user_id'] = X_test['user_id']

# index=False keeps the row index out of the CSV file.
prediction.to_csv('sample_submission.csv', index=False)