# Cleaning Data

### Importing Data

In [115]:
import pandas as pd

data = pd.read_csv('./../data/ratings_beauty.csv')

In [116]:
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 61.7+ MB
None


### Checking and Converting Data Types

In [117]:
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [118]:
print(data.dtypes)

UserId        object
ProductId     object
Rating       float64
Timestamp      int64
dtype: object


In [119]:
data['UserId'] = data['UserId'].astype(str)
data['ProductId'] = data['ProductId'].astype(str)

data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
data['Rating'] = data['Rating'].astype(float)

### Handling Duplicate Rows

In [120]:
data = data.drop_duplicates()

In [121]:
print(f'Number of duplicates: {data.duplicated().sum()}')

Number of duplicates: 0


### Range Validation for Ratings

In [122]:
data = data[(data['Rating'] >= 1) & (data['Rating'] <= 5)]

In [123]:
invalid_ratings = data[(data['Rating'] < 1) | (data['Rating'] > 5)]
print(f'Number of invalid ratings: {len(invalid_ratings)}')

Number of invalid ratings: 0


### Interaction Frequency Analysis

In [124]:
user_interactions = data['UserId'].value_counts()
product_interactions = data['ProductId'].value_counts()
print(user_interactions)
print(product_interactions)

UserId
A3KEZLJ59C1JVH    389
A281NPSIMI1C2R    336
A3M174IC0VXOS2    326
A2V5R832QCSOMX    278
A3LJLRIZL38GG3    276
                 ... 
AFLE9ZCCERY6L       1
A2U2AW7L2BU1S       1
A2KNQZY2DU4H8I      1
A6J6SJ1063P79       1
A3MQDRRGC9070R      1
Name: count, Length: 1210271, dtype: int64
ProductId
B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
              ... 
B004U81OBC       1
B004U7R0EI       1
B004U7Q2O2       1
B004U7NKRE       1
B00LU0LTOU       1
Name: count, Length: 249274, dtype: int64


In [125]:
# Filter users/products with low interactions
threshold = 10
data = data[data['UserId'].isin(user_interactions[user_interactions >= threshold].index)]
data = data[data['ProductId'].isin(product_interactions[product_interactions >= threshold].index)]

### Handle Missing Data

In [126]:
data = data.dropna()

In [127]:
data['UserId'] = data['UserId'].ffill()
data['ProductId'] = data['ProductId'].bfill()
# Filling missing values with median
data['Rating'] = data['Rating'].fillna(data['Rating'].median())

In [128]:
print(data.isna().count())

UserId       167452
ProductId    167452
Rating       167452
Timestamp    167452
dtype: int64


### Exporting Cleaned Data

In [129]:
data.to_csv('./../data/cleaned.csv', index=False)