In [42]:
import pandas as pd
import numpy as np

# <b> Preparing Data for Sentiment Analysis </b>

### <b> Importing cleaned csv file </b>

In [43]:
# Load the cleaned reviews produced by the earlier cleaning step.
# NOTE(review): the df.info() output at the end of this notebook lists an
# 'Unnamed: 0' column, which suggests the CSV was written with its index;
# consider index_col=0 here - confirm against the cleaning notebook.
df = pd.read_csv('reviews_cleaned.csv')

### <b> Handle NaN values in ratings (if not handled in cleaning) </b>

In [44]:
# Impute missing ratings with the mean of the observed (non-NaN) ratings.
# np.nanmean is required: np.mean returns NaN as soon as one NaN is present,
# which would "fill" the gaps with NaN again and leave the data unchanged.
ratings = df['rating'].to_numpy(dtype=np.float64)
mean_rating = np.nanmean(ratings)
ratings = np.where(np.isnan(ratings), mean_rating, ratings)
df['rating'] = ratings

### <b> Handle outlying rating values: </b>

In [45]:
# Keep only rows whose rating lies in the valid [0, 5] range (bounds inclusive).
# A boolean mask on the column states the intent directly instead of building
# a list of valid values and matching against it with isin().
in_range = df['rating'].between(0, 5)
# .copy() gives an independent frame so later column assignments do not
# operate on a slice of the unfiltered data.
df = df[in_range].copy()

# Re-sync the arrays with the filtered frame so that any later cell reading
# `ratings` sees exactly one value per remaining row.
ratings = df['rating'].to_numpy()
valid_ratings = ratings

### <b> Calculate the lengths of each review </b>

In [46]:
# Word count of every review (whitespace-separated tokens).
# The list brackets must enclose the whole comprehension; otherwise np.array
# receives a generator object and wraps it in a 0-d object array.
review_lengths = np.array(
    [len(review.split()) for review in df['content']],
    dtype=np.int64,  # keeps an integer dtype even when the frame is empty
)

### <b> Normalizing the Data </b>

In [47]:
# Min-max scale the ratings into [0, 1].
# Read the values from the frame itself so the array always has one entry per
# current row of df (rows may have been dropped since `ratings` was built).
ratings = df['rating'].to_numpy(dtype=np.float64)
min_rating = np.min(ratings)
max_rating = np.max(ratings)
rating_range = max_rating - min_rating
if rating_range == 0:
    # All ratings are identical: avoid 0/0 and map everything to 0.
    normalized_ratings = np.zeros_like(ratings)
else:
    normalized_ratings = (ratings - min_rating) / rating_range
df['normalized_ratings'] = normalized_ratings

### <b> Ensure Consistent Data Formatting in Ratings </b>

In [48]:
# Store the rating column as 64-bit floats so downstream code sees one dtype.
rating_as_float = df['rating'].astype('float64')
df['rating'] = rating_as_float

### <b> Save the prepared data </b>

In [49]:
# Persist the prepared frame for the sentiment-analysis step.
# index=False keeps the row index out of the file; writing it would add
# another unnamed column every time the CSV is saved and reloaded.
df.to_csv('reviews_prepped.csv', index=False)

In [50]:
# Sanity check: print column names, non-null counts, and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          625 non-null    int64  
 1   review_id           625 non-null    object 
 2   product_id          625 non-null    object 
 3   title               625 non-null    object 
 4   author              625 non-null    object 
 5   rating              625 non-null    float64
 6   content             625 non-null    object 
 7   timestamp           625 non-null    object 
 8   profile_id          625 non-null    object 
 9   is_verified         625 non-null    int64  
 10  helpful_count       625 non-null    int64  
 11  product_attributes  625 non-null    object 
 12  product_name        625 non-null    object 
 13  normalized_ratings  625 non-null    float64
dtypes: float64(2), int64(3), object(9)
memory usage: 68.5+ KB
