# Feature Engineering

### Importing Data

In [19]:
import pandas as pd

data = pd.read_csv('./../data/cleaned.csv', parse_dates=['Timestamp'])

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167452 entries, 0 to 167451
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   UserId     167452 non-null  object        
 1   ProductId  167452 non-null  object        
 2   Rating     167452 non-null  float64       
 3   Timestamp  167452 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 5.1+ MB


### Add Interaction Counts

In [21]:
# Calculate user and product interaction counts
user_interactions = data['UserId'].value_counts()
data['user_interactions'] = data['UserId'].map(user_interactions)
product_interactions = data['ProductId'].value_counts()
data['product_interactions']  = data['ProductId'].map(product_interactions)
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user_interactions,product_interactions
0,A274NIJWOQWE30,1304351475,5.0,2013-11-24,37,1
1,A13IIHXY0QX4Y4,535795531X,3.0,2012-12-27,10,1
2,A39GFZUNMWJ44J,5357955948,5.0,2011-04-20,11,1
3,A60XNB876KYML,7806397051,3.0,2014-04-18,7,4
4,A3G6XNM240RMWA,7806397051,4.0,2013-09-06,10,4


### Create Recency Feature

In [22]:
latest_timestamp = data['Timestamp'].max()
#measure how resent each interaction is by calculating days since the last interaction
data['Recency'] = (latest_timestamp - data['Timestamp']).dt.days
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user_interactions,product_interactions,Recency
0,A274NIJWOQWE30,1304351475,5.0,2013-11-24,37,1,241
1,A13IIHXY0QX4Y4,535795531X,3.0,2012-12-27,10,1,573
2,A39GFZUNMWJ44J,5357955948,5.0,2011-04-20,11,1,1190
3,A60XNB876KYML,7806397051,3.0,2014-04-18,7,4,96
4,A3G6XNM240RMWA,7806397051,4.0,2013-09-06,10,4,320


### Aggregate User Preferences

In [23]:
#by aggregating user ratings, we can better understand user preferences
user_avg_rating = data.groupby('UserId')['Rating'].mean().rename('User_Avg_Rating')
data = data.merge(user_avg_rating, on='UserId')
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user_interactions,product_interactions,Recency,User_Avg_Rating
0,A274NIJWOQWE30,1304351475,5.0,2013-11-24,37,1,241,4.297297
1,A13IIHXY0QX4Y4,535795531X,3.0,2012-12-27,10,1,573,4.6
2,A39GFZUNMWJ44J,5357955948,5.0,2011-04-20,11,1,1190,4.636364
3,A60XNB876KYML,7806397051,3.0,2014-04-18,7,4,96,4.428571
4,A3G6XNM240RMWA,7806397051,4.0,2013-09-06,10,4,320,4.4


### Normalize Interaction Counts

In [24]:
#scale interaction counts using MinMaxScaler for better model input
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
data[['user_interactions', 'product_interactions']] = scaler.fit_transform(data[['user_interactions', 'product_interactions']])
# how scale is calculated -> (x - min) / (max - min)
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user_interactions,product_interactions,Recency,User_Avg_Rating
0,A274NIJWOQWE30,1304351475,5.0,2013-11-24,0.152542,0.0,241,4.297297
1,A13IIHXY0QX4Y4,535795531X,3.0,2012-12-27,0.038136,0.0,573,4.6
2,A39GFZUNMWJ44J,5357955948,5.0,2011-04-20,0.042373,0.0,1190,4.636364
3,A60XNB876KYML,7806397051,3.0,2014-04-18,0.025424,0.0125,96,4.428571
4,A3G6XNM240RMWA,7806397051,4.0,2013-09-06,0.038136,0.0125,320,4.4


### Encode UserId and ProductId

In [25]:
#encode categorical ID's into numerical values for model training
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

data['UserId'] = user_encoder.fit_transform(data['UserId'])
data['ProductId'] = product_encoder.fit_transform(data['ProductId'])

data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user_interactions,product_interactions,Recency,User_Avg_Rating
0,4017,0,5.0,2013-11-24,0.152542,0.0,241,4.297297
1,337,1,3.0,2012-12-27,0.038136,0.0,573,4.6
2,7597,2,5.0,2011-04-20,0.042373,0.0,1190,4.636364
3,9824,3,3.0,2014-04-18,0.025424,0.0125,96,4.428571
4,8203,3,4.0,2013-09-06,0.038136,0.0125,320,4.4


### Spliting Dataset

In [26]:
# splitting into training and testing subsets
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.2, train_size=0.5, random_state=42)
print(f'Training data size-> {len(train_data)}, Testing data size-> {len(test_data)}')

Training data size-> 83726, Testing data size-> 33491


In [27]:
# inspecting data splits
print(train_data.head())
print(test_data.head())

       UserId  ProductId  Rating  Timestamp  user_interactions  \
89466    1880      17471     5.0 2014-05-27           0.029661   
68974    8943      13578     1.0 2012-10-14           0.029661   
60364    6579      12061     5.0 2012-07-07           0.029661   
50673    4547      10219     5.0 2014-01-10           0.165254   
23443    1720       4707     5.0 2013-04-16           0.046610   

       product_interactions  Recency  User_Avg_Rating  
89466              0.195833       57            4.625  
68974              0.045833      647            3.750  
60364              0.087500      746            4.250  
50673              0.016667      194            4.425  
23443              0.100000      463            3.000  
        UserId  ProductId  Rating  Timestamp  user_interactions  \
150989   11653      27207     5.0 2013-09-05           0.156780   
41874     8987       8419     5.0 2010-01-22           0.063559   
84900     6151      16552     1.0 2014-07-06           0.033898   

### Exporting Data

In [28]:
data.to_csv('./../data/feature_engineered.csv', index=False)
train_data.to_csv('./../data/train_data.csv', index=False)
test_data.to_csv('./../data/test_data.csv', index=False)
