In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.preprocessing
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Sales history and test pairs are read from the competition dataset folder;
# the item, category and shop lookup tables are read from the "translated"
# dataset folder (files with the `_en` suffix).
sales_train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)

# The test pairs are indexed by their `ID` column.
test = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
)
test = test.set_index('ID')

items = pd.read_csv(
    '/kaggle/input/predict-future-sales-translated-dataset/items_en.csv'
)
item_categories = pd.read_csv(
    '/kaggle/input/predict-future-sales-translated-dataset/item_categories_en.csv'
)
shops = pd.read_csv(
    '/kaggle/input/predict-future-sales-translated-dataset/shops_en.csv'
)

In [None]:
# Preview the first rows of the sales records as loaded.
sales_train.head()

In [None]:
# Preview the first rows of the test set (indexed by ID).
test.head()

In [None]:
# Summary statistics of the training columns.
sales_train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the test set.
test.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the training data
# (baseline before the dtype downcast).
sales_train.info()

In [None]:
# Shrink the memory footprint of the numeric columns: int64 columns are cast to
# int32, every other listed column is cast to float32.
numeric_columns = ['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day']
target_dtypes = {
    column: 'int32' if sales_train[column].dtype == 'int64' else 'float32'
    for column in numeric_columns
}
sales_train = sales_train.astype(target_dtypes)

# Show the dtypes and memory usage after the cast.
sales_train.info()

In [None]:
# Author's observation on the downcast in the previous cell: it saved roughly
# 100 MB of RAM.
# Number of sales records per shop.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(x='shop_id', data=sales_train, ax=ax)

In [None]:
# Number of sales records per month block.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(x='date_block_num', data=sales_train, ax=ax)

In [None]:
# Author's observation on the previous plot: the number of records increases
# at the end of each year (in December).
# Distribution of item prices.
fig, ax = plt.subplots(figsize=(15, 8))
sales_train['item_price'].hist(bins=100, ax=ax)

In [None]:
# Distribution of the daily item counts.
fig, ax = plt.subplots(figsize=(15, 8))
sales_train['item_cnt_day'].hist(bins=50, ax=ax)

In [None]:
# Box plot of item prices to make extreme values visible.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x=sales_train['item_price'], ax=ax)

In [None]:
# A negative number of items sold is unexpected. These rows may be returns,
# but the author judged that unlikely and removes them.
is_negative_count = sales_train['item_cnt_day'] < 0
sales_train = sales_train.loc[~is_negative_count]
sales_train

In [None]:
# Keep only rows with a daily count below 1000 and a price below 100000.
count_in_range = sales_train['item_cnt_day'] < 1000
price_in_range = sales_train['item_price'] < 100000
sales_train = sales_train[count_in_range & price_in_range]

In [None]:
# Display the training data after the count and price filters.
sales_train


In [None]:
# Inspect the other tables, starting with the first 40 item categories.
item_categories[:40]

In [None]:
# Number of rows in the item category table.
len(item_categories)

In [None]:

# The category names fall into a few broader groups, so derive a coarse
# `upper_category` label for each category.
# Ordered (keyword, label) pairs: the first keyword found in the lower-cased
# category name decides the label; names matching no keyword get 'else'.
keyword_labels = [
    ('accessorie', 'accessories'),
    ('game', 'games'),
    ('card', 'card'),
    ('console', 'consoles'),
    ('movie', 'movies'),
    ('book', 'books'),
    ('music', 'music'),
    ('gift', 'gifts'),
    ('program', 'program'),
]

upper_category = []
for category_name in item_categories['item_category_name']:
    lowered_name = category_name.lower()
    matched_label = next(
        (label for keyword, label in keyword_labels if keyword in lowered_name),
        'else',
    )
    upper_category.append(matched_label)

item_categories['upper_category'] = upper_category

# Hand-picked corrections for individual rows, addressed by index label.
manual_labels = {
    24: 'games',
    25: 'games',
    32: 'card',
    50: 'books',
    64: 'gifts',
    65: 'gifts',
    66: 'gifts',
}
for row_label, corrected_label in manual_labels.items():
    item_categories.loc[row_label, 'upper_category'] = corrected_label

# Rows 10 through 17 (label slice, both ends included) are all consoles.
item_categories.loc[10:17, 'upper_category'] = 'consoles'

In [None]:
# Preview the categories together with the new upper_category column.
item_categories.head()

In [None]:
# Single LabelEncoder instance shared by later cells; each fit_transform call
# refits it, so it only remembers the classes of the most recent fit.
encoder =sklearn.preprocessing.LabelEncoder()


In [None]:
# Display the items table.
items

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Strip punctuation (every character that is neither a word character nor
# whitespace) from the item names.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would leave the names unchanged. The raw
# string avoids the invalid-escape warning for `\w` and `\s`.
items['item_name'] = items['item_name'].str.replace(r'[^\w\s]', '', regex=True)
items

In [None]:
# Cluster the cleaned item names: TF-IDF vectors (English stop words removed)
# grouped by k-means, with one cluster per distinct item category id.
vect = TfidfVectorizer(stop_words='english')
x = vect.fit_transform(items['item_name'])
k = items['item_category_id'].nunique()
# random_state pins the centroid initialisation. With n_init=1 there is only a
# single initialisation, so without a seed the clusters change on every run.
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=1)
model.fit(x)

In [None]:
print("Top terms per cluster:")
# For every centroid, the term (column) indices sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vect, 'get_feature_names_out'):
    terms = vect.get_feature_names_out()
else:
    terms = vect.get_feature_names()
for i in range(k):
    print("Cluster %d:" % i)
    # The ten highest-weighted terms of this cluster, one per line.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

In [None]:
# Shape of the sorted-index array built from the cluster centres.
order_centroids.shape

In [None]:
# Attach the category columns (including upper_category) to every item.
items = pd.merge(items, item_categories, on='item_category_id', how='inner')

items

In [None]:


# Replace the upper-category labels with integer codes.
upper_category_codes = encoder.fit_transform(items['upper_category'])
items['upper_category'] = upper_category_codes


In [None]:
# Drop the name and category columns now that the encoded upper category
# has been attached to each item.
unused_item_columns = ['item_name', 'item_category_name', 'item_category_id']
items = items.drop(unused_item_columns, axis='columns')
items

In [None]:
# Display the shops table.
shops

In [None]:
import string
# Every shop name starts with the city name, so the city can be turned into a
# categorical feature.
# Strip punctuation first. regex=True is required: since pandas 2.0
# `str.replace` treats the pattern as a literal string by default, which would
# leave the names unchanged.
shops['city'] = shops['shop_name'].str.replace(r'[^\w\s]', '', regex=True)
# Split each cleaned name into its space-separated tokens.
city = shops['city'].str.split(' ')
city

In [None]:
# Take the first token of each shop name as its city; when that token is
# shorter than three characters, take the second token instead.
city_names = [
    tokens[1] if len(tokens[0]) < 3 else tokens[0]
    for tokens in city
]


In [None]:
# Turn the city names into integer codes, held in a Series named
# 'encoded_cities'.
encoded_city_codes = encoder.fit_transform(city_names)
city_names = pd.Series(encoded_city_codes, name='encoded_cities')

In [None]:
# Append the encoded city column to the shops table (aligned on the index).
shops = pd.concat((shops, city_names), axis='columns')
shops

In [None]:
# Drop the text columns now that the city has been encoded.
shop_text_columns = ['shop_name', 'city']
shops = shops.drop(shop_text_columns, axis='columns')
shops

In [None]:
# Shop ids that occur in the training data but not in the test set.
train_unique = sales_train['shop_id'].unique()
test_unique = test['shop_id'].unique()
diff = set(train_unique).difference(test_unique)
diff

In [None]:
# Display the items table in its current state.
items

In [None]:
# Remove every training row whose shop does not occur in the test set.
# One vectorised membership mask replaces the per-shop drop loop, which rebuilt
# the whole frame once for each shop id.
sales_train = sales_train[~sales_train['shop_id'].isin(list(diff))]
sales_train

In [None]:
# Rewrite selected shop ids to another id, in both the training data and the
# test set.
# NOTE(review): this runs after the cell that drops training rows for shops
# absent from the test set, so any id already dropped there can no longer be
# remapped here — confirm whether the remap was meant to run first.
shop_id_replacements = {
    0: 57,
    1: 58,   # Yakutsk, "Tsentralny" shopping centre
    10: 11,  # Zhukovsky, Chkalova St. 39 m²
}
for frame in (sales_train, test):
    for old_id, new_id in shop_id_replacements.items():
        frame.loc[frame['shop_id'] == old_id, 'shop_id'] = new_id

In [None]:
# Display the training data after the shop id remapping.
sales_train

In [None]:
# Display the training data again (same frame as the previous cell).
sales_train

In [None]:
# One row per (shop, item) pair and one column per month block, holding the
# summed item counts; pairs with no sales in a month get 0.
pivot = (
    sales_train
    .pivot_table(
        index=['shop_id', 'item_id'],
        columns='date_block_num',
        values='item_cnt_day',
        aggfunc='sum',
    )
    .fillna(0.0)
    .reset_index()
)
pivot

In [None]:
# Attach the monthly sales history to every test pair; pairs with no history
# get 0 for every month.
# `merge` returns a fresh positional index, which would discard the ID index
# that the submission is later built from. The ID is therefore carried through
# the merge as a column and restored as the index afterwards.
test = (
    test.rename_axis('ID')
    .reset_index()
    .merge(pivot, how='left', on=['shop_id', 'item_id'])
    .fillna(0)
    .set_index('ID')
)
test

In [None]:
# Add the shop columns first and the item columns second to the training
# table; inner joins keep only rows with a match in both lookup tables.
pivot = (
    pivot
    .merge(shops, on='shop_id', how='inner')
    .merge(items, on='item_id', how='inner')
)
pivot

In [None]:
# Add the shop and item feature columns to the test pairs.
# - Merge order is shops first, then items, matching how the training table
#   `pivot` is built. The model is fed positional `.values`, so a different
#   column order would swap the last two features between train and test.
# - Left joins keep every test row, because the submission needs one
#   prediction per ID; an inner join would silently drop unmatched rows.
# - The current index is carried through the merges as the `ID` column and
#   restored afterwards, because `merge` would otherwise replace it with a new
#   positional index.
test = (
    test.rename_axis('ID')
    .reset_index()
    .merge(shops, how='left', on='shop_id')
    .merge(items, how='left', on='item_id')
    .set_index('ID')
)
test

In [None]:
# Pairwise correlations between the columns of the training table.
corr = pivot.corr()
# Mask the upper triangle (diagonal included) so only the lower triangle is
# drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Heatmap drawn with the mask applied and square cells.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

In [None]:
from sklearn.metrics import mean_squared_error

# Booster hyper-parameters. Only keys that `xgb.train` actually reads are kept:
# the removed `silent` flag is replaced by `verbosity`, and the round count is
# passed separately below.
param = {'max_depth': 15,
         'subsample': 1,
         'min_child_weight': 0.5,
         'eta': 0.50,
         'seed': 1,
         'verbosity': 1,
         'eval_metric': 'rmse'}

# The original dict carried 'num_round': 2000, but `xgb.train` ignores that key
# and uses its `num_boost_round` argument (default 10). The value below keeps
# the 10 rounds the original call really trained. Raise it deliberately, and
# ideally add a held-out set for early stopping, if more rounds are wanted.
NUM_BOOST_ROUND = 10

# Features are every column except month block 33; month block 33 is the target.
is_feature = pivot.columns != 33
train_features = pivot.loc[:, is_feature].values
train_target = pivot.loc[:, ~is_feature].values

xgbtrain = xgb.DMatrix(train_features, train_target)
# The watchlist is passed to `evals` so the training RMSE is reported per round.
watchlist = [(xgbtrain, 'train')]

bst = xgb.train(param, xgbtrain, num_boost_round=NUM_BOOST_ROUND, evals=watchlist)

# In-sample RMSE: the model is scored on the same rows it was trained on.
preds = bst.predict(xgb.DMatrix(train_features))
rmse = np.sqrt(mean_squared_error(preds, train_target))
print(rmse)

In [None]:
# Score the test pairs on exactly the columns the booster was trained on, in
# the same order. The model receives positional `.values`, so the features are
# selected by name in the training table's column order instead of relying on
# `test` happening to have the same layout.
feature_columns = pivot.columns[pivot.columns != 33]
preds = bst.predict(xgb.DMatrix(test[feature_columns].values))

# NOTE(review): the features are the month columns up to 32 and the comparison
# below is against column 33, so `preds` estimates month block 33 for the test
# pairs. If the submission is meant for the month after the last training
# month, the feature window probably needs shifting by one — confirm.
rmse = np.sqrt(mean_squared_error(preds, test.iloc[:, test.columns == 33].values))
print(rmse)



In [None]:
# Clamp every prediction into the range 0 to 20.
preds = [min(20, max(value, 0)) for value in preds]

In [None]:
from xgboost import plot_importance

def plot_features(booster, figsize):
    """Draw the feature-importance chart of `booster` on a new figure.

    Args:
        booster: trained model handed to xgboost's `plot_importance`.
        figsize: (width, height) passed to `plt.subplots`.

    Returns:
        Whatever `plot_importance` returns for the created axes.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axis)


# Feature importances of the trained booster.
plot_features(bst, (10, 14))


In [None]:
# Build the submission table: one row per test row, pairing the index of
# `test` with its prediction.
# NOTE(review): assumes `test.index` still holds the submission IDs and that
# `preds` is in the same row order as `test` — confirm after the merges.
sub = pd.DataFrame({'ID':test.index,'item_cnt_month':preds})

In [None]:
# Write the submission to sub.csv in the working directory, without the
# DataFrame index column.
sub.to_csv('sub.csv',index=False)