### Summary:

In this notebook, we generate features using the review text as well as attributes durived from user and product data. For tokenization of reviews, please see "Review Processing.ipynb".  The exploratory analysis that underlies some of the features in this notebook can be found in "EDA.ipynb".

In [1]:
import os

# Packages for Data Manipulation
import pandas as pd
import numpy as np
from scipy import sparse

# Packages for Text Processing
from sklearn.feature_extraction import text as tx
import spacy
import enchant


In [2]:
# Use Enchant English Diction
d = enchant.Dict("en_US")


### Import and review fields

In [28]:
df_trn = pd.read_csv(os.path.join('data', 'train.csv'), parse_dates=['date'], index_col=['ex_id'])

In [29]:
df_val = pd.read_csv(os.path.join('data', 'dev.csv'), parse_dates=['date'], index_col=['ex_id'])

In [30]:
df_trn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250874 entries, 0 to 358956
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   user_id  250874 non-null  int64         
 1   prod_id  250874 non-null  int64         
 2   rating   250874 non-null  float64       
 3   label    250874 non-null  int64         
 4   date     250874 non-null  datetime64[ns]
 5   review   250874 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 13.4+ MB


In [31]:
df_trn.describe()

Unnamed: 0,user_id,prod_id,rating,label
count,250874.0,250874.0,250874.0,250874.0
mean,53970.730446,459.778211,4.023717,0.102916
std,45803.665418,259.854178,1.056995,0.30385
min,923.0,0.0,1.0,0.0
25%,13820.0,247.0,4.0,0.0
50%,40485.5,468.0,4.0,0.0
75%,87298.0,672.0,5.0,0.0
max,161147.0,922.0,5.0,1.0


### Construct features based-on user and product characteristics

In [48]:
def user_prod_features(df):
    
    
    # setup blank dataframe for storing meta data
    user_meta = pd.DataFrame(index=pd.Index(df['user_id'].unique(), name='user_id'))
    prod_meta = pd.DataFrame(index=pd.Index(df['prod_id'].unique(), name='prod_id'))         

    # append features related to user and/or producer
    user_meta['user_n_reviews'] = df['user_id'].value_counts()
    prod_meta['prod_n_reviews'] = df['prod_id'].value_counts()
    user_meta['user_avg_rating'] = df.set_index('user_id').groupby(level=0)['rating'].mean()
    prod_meta['prod_avg_rating'] = df.set_index('prod_id').groupby(level=0)['rating'].mean()
    user_meta['user_first_date'] = df.set_index('user_id').groupby(level=0)['date'].min()
    prod_meta['prod_first_date'] = df.set_index('prod_id').groupby(level=0)['date'].min()
    user_meta['user_last_date'] = df.set_index('user_id').groupby(level=0)['date'].max()
    prod_meta['prod_last_date'] = df.set_index('prod_id').groupby(level=0)['date'].max()  
      
    user_meta['user_review_intensity'] = user_meta['user_n_reviews'] / (1+(user_meta['user_last_date'] - user_meta['user_first_date'])/pd.Timedelta('1d'))
    prod_meta['prod_review_intensity'] = prod_meta['prod_n_reviews'] / (1+(prod_meta['prod_last_date'] - prod_meta['prod_first_date'])/pd.Timedelta('1d'))
    
    # join user and prod-based features back to examples
    features = pd.DataFrame(df.loc[:, ['user_id', 'prod_id']]).reset_index()
    features = features.merge(user_meta.loc[:, ['user_n_reviews',
                                                'user_avg_rating', 
                                                'user_review_intensity']].reset_index(), on='user_id', how='left')
    features = features.merge(prod_meta.loc[:, ['prod_n_reviews',
                                                'prod_avg_rating', 
                                                'prod_review_intensity']].reset_index(), on='prod_id', how='left')
    features.set_index('ex_id', inplace=True)
    return features, prod_meta, user_meta
    

In [49]:
up_feats_trn, prod_meta, user_meta = user_prod_features(df_trn)

In [50]:
# confirm perfect coverage of all metrics
up_feats_trn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250874 entries, 0 to 358956
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                250874 non-null  int64  
 1   prod_id                250874 non-null  int64  
 2   user_n_reviews         250874 non-null  int64  
 3   user_avg_rating        250874 non-null  float64
 4   user_review_intensity  250874 non-null  float64
 5   prod_n_reviews         250874 non-null  int64  
 6   prod_avg_rating        250874 non-null  float64
 7   prod_review_intensity  250874 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 17.2 MB


### Generate Additional Features on Training Set

#### Summary Features Derived from Text

In [None]:
# Train count vectorizer on training data
count_trans = tx.CountVectorizer(strip_accents='unicode').fit(df_trn['review'])
count_vect = count_trans.transform(df_trn['review'])

In [34]:
# Total number of words in a review
df_trn['word_count'] = count_vect.sum(axis=1)


In [35]:
# pct of english words in review
en_words = [z for z in count_trans.vocabulary_.keys() if d.check(z)]
en_idx = [count_trans.vocabulary_[z] for z in en_words]


In [36]:
df_trn['pct_en'] = 100*count_vect[:, en_idx].sum(axis=1)/count_vect.sum(axis=1)
df_trn['pct_en'] = df_trn['pct_en'].fillna(0)


invalid value encountered in true_divide



In [14]:
df_trn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250874 entries, 0 to 358956
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   user_id     250874 non-null  int64         
 1   prod_id     250874 non-null  int64         
 2   rating      250874 non-null  float64       
 3   label       250874 non-null  int64         
 4   date        250874 non-null  datetime64[ns]
 5   review      250874 non-null  object        
 6   word_count  250874 non-null  int64         
 7   pct_en      250874 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(4), object(1)
memory usage: 17.2+ MB


#### Combine all features

In [None]:
df_train = up_feats_trn.drop(columns=['user_id', 'prod_id']).join(df_trn.loc[:, ['word_count', 'pct_en']])

In [None]:
df_train.info()

In [None]:
# Combine and output features
df_train.join(df_trn['label']).to_csv('df_train.csv')

### Generate Additional Features on Validation Set

#### Summary Features Derived from Text

In [55]:
count_trans = tx.CountVectorizer(strip_accents='unicode').fit(df_val['review'])
count_vect = count_trans.transform(df_val['review'])

In [58]:
df_val['word_count'] = count_vect.sum(axis=1)

In [59]:
# pct of english words in review
en_words = [z for z in count_trans.vocabulary_.keys() if d.check(z)]
en_idx = [count_trans.vocabulary_[z] for z in en_words]

In [60]:
df_val['pct_en'] = 100*count_vect[:, en_idx].sum(axis=1)/count_vect.sum(axis=1)
df_val['pct_en'] = df_val['pct_en'].fillna(0)


invalid value encountered in true_divide



#### Combine All Features

In [None]:
up_feats_val, prod_meta, user_meta = user_prod_features(df_val)

In [61]:
df_valid = up_feats_val.drop(columns=['user_id', 'prod_id']).join(df_val.loc[:, ['word_count', 'pct_en']])

In [62]:
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35918 entries, 11 to 358935
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_n_reviews         35918 non-null  int64  
 1   user_avg_rating        35918 non-null  float64
 2   user_review_intensity  35918 non-null  float64
 3   prod_n_reviews         35918 non-null  int64  
 4   prod_avg_rating        35918 non-null  float64
 5   prod_review_intensity  35918 non-null  float64
 6   word_count             35918 non-null  int64  
 7   pct_en                 35918 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 3.7 MB


In [64]:
df_valid.join(df_val['label']).to_csv('df_valid.csv')