# Predict Future Sales - First Look
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 04/08/2020

## Load the data

In [1]:
from pathlib import Path

import pandas as pd

# Directory holding the competition CSV files. The default (".") resolves to
# the kernel's working directory, i.e. the same files the bare filenames
# pointed at; change it here to run the notebook against another data folder.
DATA_DIR = Path(".")

# Raw competition tables, loaded without any parsing options so the frames
# keep pandas' default dtypes.
sales_train_df = pd.read_csv(DATA_DIR / "sales_train.csv")
items_df = pd.read_csv(DATA_DIR / "items.csv")
item_categories_df = pd.read_csv(DATA_DIR / "item_categories.csv")
shops_df = pd.read_csv(DATA_DIR / "shops.csv")

### Sales
- date - date in format dd.mm.yyyy
- date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_price - current price of an item
- item_cnt_day - number of products sold. You are predicting a monthly amount of this measure


In [2]:
# Print the (rows, columns) shape of the raw sales table, then display its
# first rows as the cell's output.
print(sales_train_df.shape)
sales_train_df.head()

(2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


### Items
- item_name - name of item
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category

In [3]:
# Print the (rows, columns) shape of the items table, then display its
# first rows as the cell's output.
print(items_df.shape)
items_df.head()

(22170, 3)


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


### Item Categories
- item_category_name - name of item category
- item_category_id - unique identifier of item category

In [4]:
# Print the (rows, columns) shape of the item-categories table, then display
# its first rows as the cell's output.
print(item_categories_df.shape)
item_categories_df.head()

(84, 2)


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


### Shops
- shop_name - name of shop
- shop_id - unique identifier of a shop

In [5]:
# Print the (rows, columns) shape of the shops table, then display its
# first rows as the cell's output.
print(shops_df.shape)
shops_df.head()

(60, 2)


Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


## Merging the data

In [6]:
# Attach each sale's category by looking its item up in the items table.
# validate="many_to_one" makes pandas raise a MergeError if item_id is not
# unique in items_df, which would otherwise silently duplicate sales rows.
train_df = pd.merge(
    sales_train_df,
    items_df[["item_id", "item_category_id"]],
    how="inner",
    on="item_id",
    validate="many_to_one",
)

print("rows before: ", sales_train_df.shape[0])
print("rows after: ", train_df.shape[0])

# An inner join drops sales whose item_id is missing from items_df. Fail
# loudly instead of relying on a visual comparison of the two counts above.
assert train_df.shape[0] == sales_train_df.shape[0], (
    "merge changed the number of sales rows"
)

train_df.head()

rows before:  2935849
rows after:  2935849


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,02.01.2013,0,59,22154,999.0,1.0,37
1,23.01.2013,0,24,22154,999.0,1.0,37
2,20.01.2013,0,27,22154,999.0,1.0,37
3,02.01.2013,0,25,22154,999.0,1.0,37
4,03.01.2013,0,25,22154,999.0,1.0,37


## First Look

In [7]:
# Summary statistics for every column (include="all" also covers the
# non-numeric date column), transposed so each column becomes one row.
train_df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
date,2935849.0,1034.0,28.12.2013,9434.0,,,,,,,
date_block_num,2935850.0,,,,14.5699,9.42299,0.0,7.0,14.0,23.0,33.0
shop_id,2935850.0,,,,33.0017,16.227,0.0,22.0,31.0,47.0,59.0
item_id,2935850.0,,,,10197.2,6324.3,0.0,4476.0,9343.0,15684.0,22169.0
item_price,2935850.0,,,,890.853,1729.8,-1.0,249.0,399.0,999.0,307980.0
item_cnt_day,2935850.0,,,,1.24264,2.61883,-22.0,1.0,1.0,1.0,2169.0
item_category_id,2935850.0,,,,40.0014,17.1008,0.0,28.0,40.0,55.0,83.0


In [8]:
# Count missing values per column of the merged frame.
train_df.isna().sum()

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [9]:
# Print the merged frame's index range, column dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935849 entries, 0 to 2935848
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   date              object 
 1   date_block_num    int64  
 2   shop_id           int64  
 3   item_id           int64  
 4   item_price        float64
 5   item_cnt_day      float64
 6   item_category_id  int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 179.2+ MB


In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible up front.
import pandas_profiling as pf

# Build a profiling report over the merged training frame.
# NOTE(review): this profiles every row of train_df (about 2.9M per the merge
# cell's output) and may be slow — consider timing the cell or profiling a
# sample while iterating.
profile = pf.ProfileReport(train_df, explorative=True)

# Export the report as an HTML file (path is relative to the working directory).
profile.to_file("first_look_profile.html")

# Render the same report inline as notebook widgets.
profile.to_widgets()

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=21.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…