In [4]:
import numpy as np
import pandas as pd
import scipy.stats as stat
import scipy.io as scipio
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime

In [5]:
## read train & test data
data_train = pd.DataFrame(pd.read_csv("data_train.csv"))
data_test = pd.DataFrame(pd.read_csv("data_test.csv"))
data_train = data_train.dropna(0)
data_test = data_test.dropna(0)

## index by visit_date column 
## the dates are not unique yet
data_train = data_train.set_index("visit_date", drop=True)
data_test = data_test.set_index("visit_date", drop=True)

## convert the dates from string to datetime
arima_dates_train = pd.Series(data_train.index.values).\
apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
arima_dates_test = pd.Series(data_test.index.values).\
apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))


Finding the total number of visitors in any area on any given day that is listed in the data. 

In [39]:
unique_dates = sorted(list(set(arima_dates_train))) + sorted(list(set(arima_dates_test)))


In [37]:
## list of area names that correspond to indicator cols
## in the data

area_names = data_train.columns.values[30:]

In [13]:
def area_indicator(data):
    for area in area_names:
        data[area] = data[area].apply(lambda x: int(x != 0))
    return data

In [11]:
date_area_matrix = data_train[area_names]
date_area_matrix["visitors"] = pd.Series(data_train["visitors"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
date_area_bydaygroups = date_area_matrix.groupby\
(date_area_matrix.index).sum()

date_area_bydaygroups = area_indicator(date_area_bydaygroups)

area_date_matrix = np.array(date_area_bydaygroups[area_names].T)

daily_visitors_col = np.array(date_area_bydaygroups["visitors"])
daily_visitors_col = np.array(len(daily_visitors_col)*[daily_visitors_col])

## dim(area x day) matrix where each row entry corresponds
## to a series of the TOTAL number of visitors for each date
## in the index. 
## For example, entry_row(3)_col(4) shows the total number of
## visitors in 3rd area on the 4th day. 

total_area_day = np.dot(area_date_matrix, daily_visitors_col)
total_area_day

array([[  358275.,   609450.,   838100., ...,  5012875.,  3759975.,
         4484175.],
       [       0.,        0.,        0., ...,        0.,        0.,
               0.],
       [  197262.,   335556.,   461448., ...,  2760030.,  2070198.,
         2468934.],
       ..., 
       [  204849.,   348462.,   479196., ...,  2866185.,  2149821.,
         2563893.],
       [  357432.,   608016.,   836128., ...,  5001080.,  3751128.,
         4473624.],
       [  358275.,   609450.,   838100., ...,  5012875.,  3759975.,
         4484175.]])

In [17]:
date_area_matrix_test = data_test[area_names]
date_area_matrix_test["visitors"] = pd.Series(data_test["visitors"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
date_area_bydaygroups_test = date_area_matrix_test.groupby\
(date_area_matrix_test.index).sum()

date_area_bydaygroups_test = area_indicator(date_area_bydaygroups_test)

area_date_matrix_test = np.array(date_area_bydaygroups_test[area_names].T)

daily_visitors_col_test = np.array(date_area_bydaygroups_test["visitors"])
daily_visitors_col_test = np.array(len(daily_visitors_col_test)*[daily_visitors_col_test])

total_area_day_test = np.dot(area_date_matrix_test, daily_visitors_col_test)
total_area_day_test

array([[ 564662.,  553108.,  757794., ...,  612309.,  845721.,  908102.],
       [      0.,       0.,       0., ...,       0.,       0.,       0.],
       [ 564662.,  553108.,  757794., ...,  612309.,  845721.,  908102.],
       ..., 
       [ 564662.,  553108.,  757794., ...,  612309.,  845721.,  908102.],
       [ 564662.,  553108.,  757794., ...,  612309.,  845721.,  908102.],
       [ 564662.,  553108.,  757794., ...,  612309.,  845721.,  908102.]])

In [23]:
data_test.drop("Unnamed: 0.1", 1).shape

(30725, 132)

In [24]:
data_train.shape

(173703, 132)

## Trying with All of the Data

In [26]:
data = pd.concat([data_train, data_test.drop("Unnamed: 0.1", 1)], 0)

In [28]:
data.shape

(204428, 132)

In [29]:
date_area = data[area_names]
date_area["visitors"] = pd.Series(data["visitors"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
date_area_bydays = date_area.groupby\
(date_area.index).sum()

date_area_bydays = area_indicator(date_area_bydays)

area_date = np.array(date_area_bydays[area_names].T)

daily_visitors = np.array(date_area_bydays["visitors"])
daily_visitors = np.array(len(daily_visitors)*[daily_visitors])

## dim(area x day) matrix where each row entry corresponds
## to a series of the TOTAL number of visitors for each date
## in the index. 
## For example, entry_row(3)_col(4) shows the total number of
## visitors in 3rd area on the 4th day. 

area_day_visit = np.dot(area_date, daily_visitors)
area_day_visit

array([[  402954.,   685452.,   942616., ...,  5522334.,  7627446.,
         8190052.],
       [       0.,        0.,        0., ...,        0.,        0.,
               0.],
       [  241941.,   411558.,   565964., ...,  3315711.,  4579659.,
         4917458.],
       ..., 
       [  249528.,   424464.,   583712., ...,  3419688.,  4723272.,
         5071664.],
       [  402111.,   684018.,   940644., ...,  5510781.,  7611489.,
         8172918.],
       [  402954.,   685452.,   942616., ...,  5522334.,  7627446.,
         8190052.]])

In [40]:
area_day_visit_df = pd.DataFrame(area_day_visit, index=area_names,\
                                 columns=unique_dates)

In [43]:
# area_day_visit_df.to_csv("TOTAL_VISITORS_AREA_DATE_FULLDATA.csv")

## Doing the Same for Cuisine Processing 

In [46]:
genre_names = data.columns.values[16:30]

In [47]:
date_genre = data[genre_names]
date_genre["visitors"] = pd.Series(data["visitors"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [48]:
def genre_indicator(data):
    for genre in genre_names:
        data[genre] = data[genre].apply(lambda x: int(x != 0))
    return data

In [49]:
date_genre_bydays = date_genre.groupby\
(date_genre.index).sum()

date_genre_bydays = genre_indicator(date_genre_bydays)

genre_date = np.array(date_genre_bydays[genre_names].T)

daily_visitors = np.array(date_genre_bydays["visitors"])
daily_visitors = np.array(len(daily_visitors)*[daily_visitors])

## dim(genre x day) matrix where each row entry corresponds
## to a series of the TOTAL number of visitors for each date
## in the index. 
## For example, entry_row(3)_col(4) shows the total number of
## visitors in 3rd area on the 4th day. 

genre_day_visit = np.dot(genre_date, daily_visitors)
genre_day_visit

array([[  284934.,   484692.,   666536., ...,  3904914.,  5393466.,
         5791292.],
       [  402954.,   685452.,   942616., ...,  5522334.,  7627446.,
         8190052.],
       [  402954.,   685452.,   942616., ...,  5522334.,  7627446.,
         8190052.],
       ..., 
       [  402954.,   685452.,   942616., ...,  5522334.,  7627446.,
         8190052.],
       [  402954.,   685452.,   942616., ...,  5522334.,  7627446.,
         8190052.],
       [  402111.,   684018.,   940644., ...,  5510781.,  7611489.,
         8172918.]])

In [50]:
genre_day_visit.shape ## great

(14, 478)