In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the raw clothing reviews; the path is relative to the notebook's working directory.
reviews = pd.read_csv('reviews.csv')

# Inspect the schema before transforming anything.
print(reviews.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
reviews.info()

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB
None


Transform the recommended column into binary

In [53]:
# Class balance of the boolean flag before it is converted.
print(reviews['recommended'].value_counts())

# Lookup table used to turn the boolean flag into 0/1.
binary_dict = {False: 0, True: 1}

# Build the converted column first, then store it back under the same name.
recommended_as_int = reviews['recommended'].map(binary_dict)
reviews['recommended'] = recommended_as_int

# Same counts as above, now keyed by 1 and 0.
print(reviews['recommended'].value_counts())


recommended
True     4166
False     834
Name: count, dtype: int64
recommended
1    4166
0     834
Name: count, dtype: int64


Transform the rating feature into numerical scores

In [54]:
# Distribution of the text labels before conversion.
print(reviews.rating.value_counts())

# Ordinal encoding: each label becomes a score from 1 (worst) to 5 (best).
rating_dict = {'Loved it':5, 'Liked it':4, 'Was okay':3, 'Not great':2, 'Hated it':1}

rating_scores = reviews['rating'].map(rating_dict)

# Series.map() yields NaN for every value that is not a key of rating_dict.
# Fail loudly instead of silently writing NaN scores; this also catches an
# accidental re-run of the cell after the column has already been converted.
unmapped_labels = reviews.loc[rating_scores.isna() & reviews['rating'].notna(), 'rating'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"rating contains values missing from rating_dict: {sorted(map(str, unmapped_labels))}"
    )

reviews['rating'] = rating_scores

# Same counts as above, now keyed by the numeric score.
print(reviews.rating.value_counts())


rating
Loved it     2798
Liked it     1141
Was okay      564
Not great     304
Hated it      193
Name: count, dtype: int64
rating
5    2798
4    1141
3     564
2     304
1     193
Name: count, dtype: int64


Transform the department_name feature

One-hot encode the values of department_name into individual boolean columns, one per department

Join these to the reviews table

In [55]:
# Distribution of departments before encoding.
print(reviews.department_name.value_counts())

# One indicator column per distinct department_name value.
one_hot = pd.get_dummies(reviews['department_name'])

print(one_hot)

# Columns before the indicator columns are attached.
print(reviews.columns)

# Drop indicator columns left over from an earlier run of this cell before joining:
# DataFrame.join raises ValueError when both frames share a column name, so without
# the drop this cell could only be executed once per kernel session.
reviews = reviews.drop(columns=one_hot.columns, errors='ignore').join(one_hot)

# Columns after the join; the department indicators are appended at the end.
print(reviews.columns)

department_name
Tops        2196
Dresses     1322
Bottoms      848
Intimate     378
Jackets      224
Trend         28
Name: count, dtype: int64
      Bottoms  Dresses  Intimate  Jackets   Tops  Trend
0       False     True     False    False  False  False
1       False     True     False    False  False  False
2       False    False      True    False  False  False
3       False     True     False    False  False  False
4       False     True     False    False  False  False
...       ...      ...       ...      ...    ...    ...
4995    False    False     False    False   True  False
4996    False    False     False    False   True  False
4997    False     True     False    False  False  False
4998     True    False     False    False  False  False
4999    False    False     False    False   True  False

[5000 rows x 6 columns]
Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='ob

Transform the review_date feature

In [56]:
# Report the dtype of the date column before parsing.
date_column = 'review_date'
print(reviews[date_column].dtype)

# Parse the column into pandas datetimes and store it back under the same name.
reviews[date_column] = pd.to_datetime(reviews[date_column])

# Report the dtype again to confirm the conversion.
print(reviews[date_column].dtype)

object
datetime64[ns]


In [57]:
# Column that identifies the garment; it becomes the row index rather than a feature.
index_column = 'clothing_id'

# Numeric / indicator columns that are passed on to the scaler.
feature_columns = [
    'recommended',
    'rating',
    'Bottoms',
    'Dresses',
    'Intimate',
    'Jackets',
    'Tops',
    'Trend',
]

# Select the id plus the features, then move the id into the index in one chain.
scaled_reviews_dataframe = (
    reviews[[index_column, *feature_columns]]
    .set_index(index_column)
)

print(scaled_reviews_dataframe)

             recommended  rating  Bottoms  Dresses  Intimate  Jackets   Tops  \
clothing_id                                                                    
1095                   1       4    False     True     False    False  False   
1095                   1       5    False     True     False    False  False   
699                    1       5    False    False      True    False  False   
1072                   1       5    False     True     False    False  False   
1094                   1       5    False     True     False    False  False   
...                  ...     ...      ...      ...       ...      ...    ...   
918                    1       5    False    False     False    False   True   
950                    0       1    False    False     False    False   True   
1086                   1       5    False     True     False    False  False   
1033                   1       5     True    False     False    False  False   
850                    1       5    Fals

Fit the data

The fit_transform() method does two things in a single step: it fits the scaler by learning each column's mean and standard deviation from the data, and then transforms the data using those learned values

Data standardization is the process of rescaling each attribute so that it has a mean of 0 and a variance of 1.

In [61]:
# Fit the scaler on the feature frame and rescale it in one step; the result is
# stored in scaled_values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(scaled_reviews_dataframe)

# Sanity check over all scaled values at once: standardized data should have a
# mean of 0 (up to floating-point error) and a standard deviation of 1.
# The first label previously read "Mean is 1", which contradicted the expected value.
print("Mean is 0: ", np.mean(scaled_values))
print("Std dev is 1: ", np.std(scaled_values))

Mean is 1:  3.739231146937527e-17
Std dev is 1:  1.0
