## Implementation of Feature Transform Suggested in EDA.ipynb

## Imports and Reading in Relevant Data

In [None]:
import pickle

import numpy as np
import pandas as pd

In [2]:
# Load the two labelled review splits; both files share the same columns.
train, dev = map(pd.read_csv, ("data/train.csv", "data/dev.csv"))

In [3]:
# Row count of the training split, followed by a preview of its first rows.
print(train.shape[0])
train.head(5)

250874


Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...


In [4]:
# Row count of the dev split, followed by a preview of its last rows.
print(dev.shape[0])
dev.tail(5)

35918


Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
35913,358855,161115,349,3.0,0,2014-12-15,"Okay, so I gotta repeat the chorus here and st..."
35914,358859,161116,349,2.0,0,2014-12-10,"The pizza is delicious, but it's SO loud here,..."
35915,358884,161125,349,5.0,0,2014-09-05,Emily has hands down the best pizza I've had i...
35916,358894,1423,349,5.0,0,2014-07-13,I'm not entirely sure who came up with the nam...
35917,358935,161140,349,5.0,0,2014-03-04,My date and I had an amazing time here. The s...


## Combining the Data

Because some users appear in both train and dev, the two datasets must be combined before computing the <i> reviewsToDate </i> column. If a user shows up in both sets, their review count has to be computed over all of their reviews; otherwise the count would start over for the instances in the dev set. Once the transformation is complete, the combined dataset will be split back into train and dev using the <i> ex_id </i> column, which is unique across the concatenation of the datasets.

In [17]:
# Remember which example ids belong to each split so the combined frame
# can be separated again after the transform.
train_ex = list(train["ex_id"])
dev_ex = list(dev["ex_id"])

In [18]:
# Stack dev underneath train. Row labels are carried over from each split,
# so the combined frame contains repeated index labels.
combined = pd.concat((train, dev))

In [19]:
# First rows of the combined frame (these come from train).
combined.head(5)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...


In [20]:
# Last rows of the combined frame (these come from dev).
combined.tail(5)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
35913,358855,161115,349,3.0,0,2014-12-15,"Okay, so I gotta repeat the chorus here and st..."
35914,358859,161116,349,2.0,0,2014-12-10,"The pizza is delicious, but it's SO loud here,..."
35915,358884,161125,349,5.0,0,2014-09-05,Emily has hands down the best pizza I've had i...
35916,358894,1423,349,5.0,0,2014-07-13,I'm not entirely sure who came up with the nam...
35917,358935,161140,349,5.0,0,2014-03-04,My date and I had an amazing time here. The s...


In [21]:
# The concatenation must contain every row of both splits.
combined.shape[0] == train.shape[0] + dev.shape[0]

True

## Implementation

In [22]:
# Keep an untouched snapshot: engineered_df writes its new column into the
# frame it is given, and the sanity checks below compare against this copy.
copy = combined.copy(deep=True)
copy.head(5)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...


In [23]:
# Last rows of the untouched snapshot.
copy.tail(5)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
35913,358855,161115,349,3.0,0,2014-12-15,"Okay, so I gotta repeat the chorus here and st..."
35914,358859,161116,349,2.0,0,2014-12-10,"The pizza is delicious, but it's SO loud here,..."
35915,358884,161125,349,5.0,0,2014-09-05,Emily has hands down the best pizza I've had i...
35916,358894,1423,349,5.0,0,2014-07-13,I'm not entirely sure who came up with the nam...
35917,358935,161140,349,5.0,0,2014-03-04,My date and I had an amazing time here. The s...


In [29]:
def engineered_df(df):
    """Add a ``reviewsToDate`` column counting each user's earlier reviews.

    For every row, ``reviewsToDate`` is the number of rows with the same
    ``user_id`` whose ``date`` sorts strictly before this row's ``date``.
    Reviews a user wrote on the same date therefore all receive the same
    count. Dates are compared with their natural ordering, so string dates
    are compared as text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``user_id`` and ``date`` columns. The row labels
        do not need to be unique.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the ``reviewsToDate`` column is
        written into it in place.

    Raises
    ------
    KeyError
        If ``user_id`` or ``date`` is not a column of ``df``.
    ValueError
        If ``user_id`` or ``date`` contains missing values.
    """
    if len(df) == 0:
        # Nothing to rank; still create the column so callers can rely on it.
        df['reviewsToDate'] = []
        return df

    # Work on a positionally indexed view of the two key columns so the result
    # can be put back by position even when df has repeated row labels.
    keys = df[['user_id', 'date']].reset_index(drop=True)
    if keys.isna().any().any():
        raise ValueError(
            "user_id and date must not contain missing values to compute reviewsToDate"
        )

    # One sort + groupby replaces re-scanning the whole frame for every user.
    keys = keys.sort_values(['user_id', 'date'])
    position_in_user = keys.groupby('user_id').cumcount()

    # Rows sharing a (user, date) pair take the smallest position among them,
    # i.e. the number of that user's reviews with a strictly earlier date.
    earlier_reviews = position_in_user.groupby(
        [keys['user_id'], keys['date']]
    ).transform('min')

    # sort_index() restores the original row order before positional assignment.
    df['reviewsToDate'] = earlier_reviews.sort_index().to_numpy()
    return df
        
        

Now the combined dataset is passed to the engineered_df function, which adds the reviewsToDate column and returns the dataframe. We then use the ex_id column to split back into train and dev.

In [30]:
# Compute reviewsToDate over train and dev together.
combined = engineered_df(df=combined)

### Sanity Check

We will pull a user with multiple reviews from both the first 10 and last 10 rows of the transformed dataframe, and check the original combined dataframe to ensure that both the number of reviews matches and the ordering is correct.

In [31]:
# First ten transformed rows, used to pick a user for the sanity check.
combined.iloc[:10]

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review,reviewsToDate
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...,29
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...,0
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,0
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...,0
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...,1
5,5,928,0,4.0,1,2009-09-02,A solid 4 stars for this greek food spot. If ...,0
6,7,930,0,4.0,1,2007-05-20,Love this place! Try the Chicken sandwich or ...,0
7,8,931,0,4.0,1,2005-12-27,My friend and I were intrigued by the nightly ...,15
8,10,933,0,5.0,1,2014-01-21,pretty cool place...good food...good people,0
9,12,935,0,5.0,1,2011-01-31,Fabulous Authentic Greek Food!!! This little s...,0


In [32]:
# All reviews by user 923 in the untouched snapshot, oldest first, preceded
# by how many there are.
print((copy['user_id'] == 923).sum())
copy.loc[copy['user_id'] == 923].sort_values(by='date')

31


Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
202384,289603,923,759,5.0,1,2013-11-04,"The falafel were superb, stuffed grape leaved ..."
122365,174979,923,131,5.0,1,2013-11-11,The food is simply excellent. Everything is as...
166153,237627,923,622,5.0,1,2013-11-19,This place is amazing.We really love good lati...
172282,246460,923,906,5.0,1,2013-11-19,I had Nasi Lemak and Nyonya Seafood Fried Rice...
9044,90566,923,256,5.0,1,2013-12-08,"This is the place to go if u love tofu, kimchi..."
129686,185412,923,505,5.0,1,2014-01-04,I was in the neighborhood with out-of-town gue...
81951,117298,923,675,5.0,1,2014-01-04,I recently ate at Olea again and continue to b...
4180,5992,923,19,5.0,1,2014-01-14,The restaurant is on the ground floor of a typ...
34295,342904,923,919,5.0,1,2014-02-02,Bistango is a casual trattoria that hits all t...
29808,297788,923,778,5.0,1,2014-02-12,"The place is hard to find and out of the way, ..."


In [33]:
# Last ten transformed rows, used to pick a second user for the sanity check.
combined.iloc[-10:]

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review,reviewsToDate
35908,358813,18011,921,4.0,0,2012-09-30,This review is only for their Happy Hour. 2 f...,12
35909,358827,2230,921,5.0,0,2012-06-08,The cat's out of the bag! Bijan's is no longer...,2
35910,358840,3504,921,5.0,0,2012-01-25,"Ever since Bijan's opened up, happy days are h...",73
35911,358841,45596,921,5.0,0,2012-01-18,This is my new favorite neighborhood bar. I w...,0
35912,358850,161112,349,3.0,1,2014-04-09,"The staff was incredibly nice, the lemonade ta...",0
35913,358855,161115,349,3.0,0,2014-12-15,"Okay, so I gotta repeat the chorus here and st...",0
35914,358859,161116,349,2.0,0,2014-12-10,"The pizza is delicious, but it's SO loud here,...",0
35915,358884,161125,349,5.0,0,2014-09-05,Emily has hands down the best pizza I've had i...,0
35916,358894,1423,349,5.0,0,2014-07-13,I'm not entirely sure who came up with the nam...,17
35917,358935,161140,349,5.0,0,2014-03-04,My date and I had an amazing time here. The s...,0


In [34]:
# All reviews by user 1423 in the untouched snapshot, oldest first.
copy.loc[copy['user_id'] == 1423].sort_values(by='date')

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
183324,262316,1423,676,5.0,0,2012-07-30,Lovely location near Battery Park and the near...
124704,178341,1423,484,5.0,0,2012-07-31,&%@#! There's not a damn thing about this plac...
94940,135856,1423,396,5.0,0,2012-10-18,Unless you're crazy allergic to seafood (or ju...
83236,119109,1423,345,5.0,0,2012-12-21,I have to admit some shame in passing some und...
12577,17873,1423,336,4.0,0,2013-07-19,"Stumbled upon this restaurant after a long, he..."
106664,152607,1423,440,5.0,0,2013-09-03,With the constant debates about the best ramen...
51302,73348,1423,203,4.0,0,2013-10-15,I'm still dreaming of my visit at Little Colli...
188887,270270,1423,702,4.0,0,2013-10-29,"Excellent Thai food in the Cobble Hill area, w..."
92393,132165,1423,628,5.0,0,2013-10-29,I professed my love for a well-known hipster B...
215812,308767,1423,803,3.0,0,2013-11-08,Anella definitely caught my attention with its...


### Splitting Data Back

In [38]:
# Recover the training split by example id; the row count should match the
# size printed when train was first loaded.
train = combined.loc[combined['ex_id'].isin(train_ex)]
print(train.shape[0])
train.head(5)

250874


Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review,reviewsToDate
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...,29
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...,0
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,0
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...,0
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...,1


In [40]:
# Recover the dev split by example id; the row count should match the size
# printed when dev was first loaded.
dev = combined.loc[combined['ex_id'].isin(dev_ex)]
print(dev.shape[0])
dev.head(5)

35918


Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review,reviewsToDate
0,11,934,0,5.0,1,2014-01-20,"all around good place, cozy, I came in and did...",0
1,17,940,0,4.0,0,2014-09-16,"For lunch, my friend and I had: -Lamb sandwich...",7
2,20,943,0,5.0,0,2014-05-24,Some good Big Greek cooking!! Came to City on ...,0
3,30,953,0,4.0,0,2013-10-17,So... as you may notice from some of my other ...,3
4,43,966,0,3.0,0,2012-12-19,"I don't understand the whole ""You can't order ...",16


### Exporting Transformed Column

In [43]:
# Persist only the engineered column of each split; each file holds one
# pickled pandas Series.
# NOTE(review): the two file names follow different patterns
# ("train_reviews_ToDate" vs "dev_reviewsToDate"). They are left unchanged
# because whatever loads these files is not visible from this notebook.
with open('data/train_reviews_ToDate.pckl', 'wb') as f:
    pickle.dump(train['reviewsToDate'], f)

with open('data/dev_reviewsToDate.pckl', 'wb') as f:
    pickle.dump(dev['reviewsToDate'], f)