# Preprocessing all 3 original datasets

## Preprocessing Book Dataset

In [8]:
# Import libraries
from pathlib import Path

import numpy as np
import pandas as pd

In [9]:
# Read book_original.csv (raw book reviews) and preview the first rows.
# The file contains an 'Unnamed: 0' column next to rating/reviewText/summary.

df_book = pd.read_csv("original_datasets/book_original.csv")
df_book.head()

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary
0,0,5,This book was the very first bookmobile book I...,50 + years ago...
1,1,1,"When I read the description for this book, I c...",Boring! Boring! Boring!
2,2,5,I just had to edit this review. This book is a...,Wiggleliscious/new toy ready/!!
3,3,5,I don't normally buy 'mystery' novels because ...,Very good read.
4,4,5,"This isn't the kind of book I normally read, a...",Great Story!


In [10]:
df_book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  12000 non-null  int64 
 1   rating      12000 non-null  int64 
 2   reviewText  12000 non-null  object
 3   summary     11998 non-null  object
dtypes: int64(2), object(2)
memory usage: 375.1+ KB


In [11]:
df_book["rating"].value_counts()

rating
5    3000
4    3000
1    2000
3    2000
2    2000
Name: count, dtype: int64

In [14]:
def map_rating(class_index):
    """Translate a star rating into a coarse sentiment label.

    Ratings 1-2 give 'negative', 3 gives 'neutral' and 4-5 give 'positive'.
    Any other value falls through to 'unknown'.
    """
    buckets = (
        ((1, 2), 'negative'),
        ((3,), 'neutral'),
        ((4, 5), 'positive'),
    )
    # Walk the buckets in order and stop at the first one containing the rating
    for ratings, label in buckets:
        if class_index in ratings:
            return label
    return 'unknown'

In [20]:
# Derive a sentiment label for every review from its star rating
# (map_rating: 1-2 negative, 3 neutral, 4-5 positive)
df_book['sentiment'] = df_book['rating'].apply(map_rating)

In [22]:
# Drop the columns the combined dataset does not keep: 'Unnamed: 0',
# the raw rating (already mapped to 'sentiment') and the review summary
df_book = df_book.drop(columns=['Unnamed: 0', 'rating', 'summary'])

In [24]:
# Rename reviewText to 'sentence', the text column name used by all three datasets
df_book = df_book.rename(columns={'reviewText': 'sentence'})

In [26]:
# Add a new column named 'topic' with value 'book' so these rows can still be
# told apart once the three datasets are concatenated
df_book['topic'] = 'book'

In [27]:
df_book

Unnamed: 0,sentence,sentiment,topic
0,This book was the very first bookmobile book I...,positive,book
1,"When I read the description for this book, I c...",negative,book
2,I just had to edit this review. This book is a...,positive,book
3,I don't normally buy 'mystery' novels because ...,positive,book
4,"This isn't the kind of book I normally read, a...",positive,book
...,...,...,...
11995,Had to read certain passages twice--typos. Wi...,negative,book
11996,Not what i expected. yet a very interesting bo...,neutral,book
11997,Dragon Knights is a world where Knights ride d...,positive,book
11998,"Since this story is very short, it's hard to s...",positive,book


## Preprocessing Sports Dataset

In [28]:
# Read sports_original.csv (tweets that already carry a Sentiment label)
# and preview the first rows
df_sports = pd.read_csv("original_datasets/sports_original.csv")
df_sports.head()

Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweet,Sentiment
0,0,2022-11-20 23:59:21+00:00,4,Twitter Web App,What are we drinking today @TucanTribe \n@MadB...,neutral
1,1,2022-11-20 23:59:01+00:00,3,Twitter for iPhone,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive
2,2,2022-11-20 23:58:41+00:00,1,Twitter for iPhone,Worth reading while watching #WorldCup2022 htt...,positive
3,3,2022-11-20 23:58:33+00:00,1,Twitter Web App,Golden Maknae shinning bright\n\nhttps://t.co/...,positive
4,4,2022-11-20 23:58:28+00:00,0,Twitter for Android,"If the BBC cares so much about human rights, h...",negative


In [29]:
df_sports["Sentiment"].value_counts()   

Sentiment
positive    8489
neutral     8251
negative    5784
Name: count, dtype: int64

In [31]:
# Keep only the tweet text and its label: drop the 'Unnamed: 0', 'Date Created',
# 'Number of Likes' and 'Source of Tweet' columns
df_sports = df_sports.drop(columns=['Unnamed: 0', 'Date Created', 'Number of Likes', 'Source of Tweet'])

In [32]:
# Rename Sentiment to sentiment and Tweet to sentence to match the book dataset.
# The label values are already positive / neutral / negative, so no mapping is needed.
df_sports = df_sports.rename(columns={'Sentiment': 'sentiment', 'Tweet': 'sentence'})

In [35]:
# Add a new column named 'topic' with value 'sports' so these rows can still be
# told apart once the three datasets are concatenated
df_sports['topic'] = 'sports'

In [36]:
df_sports

Unnamed: 0,sentence,sentiment,topic
0,What are we drinking today @TucanTribe \n@MadB...,neutral,sports
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,sports
2,Worth reading while watching #WorldCup2022 htt...,positive,sports
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,sports
4,"If the BBC cares so much about human rights, h...",negative,sports
...,...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive,sports
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral,sports
22521,Great thread to read before the start of #Worl...,positive,sports
22522,Raphinha wants Brazil to be united at the #Wor...,positive,sports


## Preprocessing Movies Dataset

In [43]:
# Read movies_original.csv (reviews with a numeric Rating and a Sentiment column)
# and preview the first rows
df_movies = pd.read_csv("original_datasets/movies_original.csv")
df_movies.head()

Unnamed: 0,Review,Rating,Sentiment
0,"Kurt Russell's chameleon-like performance, cou...",10,1
1,It was extremely low budget(it some scenes it ...,8,1
2,James Cagney is best known for his tough chara...,8,1
3,"Following the brilliant ""Goyôkiba"" (aka. ""Hanz...",8,1
4,One of the last classics of the French New Wav...,10,1


In [44]:
df_movies["Rating"].value_counts()

Rating
1     10122
10     9731
8      5859
4      5331
3      4961
7      4803
9      4607
2      4586
Name: count, dtype: int64

In [45]:
# Collapse a numeric movie rating into a three-way sentiment label
def map_rating_movies(class_index):
    """Return the sentiment label for a movie rating.

    Ratings 1-3 give 'negative', 4 gives 'neutral' and 7-10 give 'positive'.
    Every other value (including 5 and 6) yields 'unknown'.
    """
    if class_index in (1, 2, 3):
        return 'negative'
    if class_index == 4:
        return 'neutral'
    if class_index in (7, 8, 9, 10):
        return 'positive'
    return 'unknown'

In [46]:
# Drop the dataset's own Sentiment column and replace it with a three-class
# label derived from Rating via map_rating_movies
df_movies = df_movies.drop(columns=['Sentiment'])
df_movies['sentiment'] = df_movies['Rating'].apply(map_rating_movies)

# map_rating_movies returns 'unknown' for any rating it has no bucket for
# (e.g. 5 or 6). Fail loudly here instead of letting that label reach the
# down-sampling step and the exported training set.
unmapped_ratings = df_movies.loc[df_movies['sentiment'] == 'unknown', 'Rating'].unique().tolist()
assert not unmapped_ratings, f"Ratings without a sentiment mapping: {unmapped_ratings}"

In [47]:
# Keep only the review text (renamed to 'sentence') next to the derived label;
# the numeric Rating is no longer needed
df_movies = (
    df_movies
    .drop(columns=['Rating'])
    .rename(columns={'Review': 'sentence'})
)

In [48]:
df_movies

Unnamed: 0,sentence,sentiment
0,"Kurt Russell's chameleon-like performance, cou...",positive
1,It was extremely low budget(it some scenes it ...,positive
2,James Cagney is best known for his tough chara...,positive
3,"Following the brilliant ""Goyôkiba"" (aka. ""Hanz...",positive
4,One of the last classics of the French New Wav...,positive
...,...,...
49995,(spoiler) it could be the one the worst movie ...,neutral
49996,"So, you've seen the Romero movies, yes? And yo...",negative
49997,Just listen to the Broadway cast album and to ...,negative
49998,I have been a fan of the Carpenters for a long...,negative


In [51]:
df_movies["sentiment"].value_counts()

sentiment
positive    25000
negative    19669
neutral      5331
Name: count, dtype: int64

Since `df_movies` is highly imbalanced and much larger than `df_book` and `df_sports`, we downsample it to 15k rows: 5k rows for each sentiment (`positive`, `neutral`, `negative`).

In [52]:
TARGET = 5_000        # rows to keep per sentiment class
RNG_SEED = 42         # so results are reproducible

# Down-sample every sentiment class to TARGET rows.
# Each group is sampled on its own and the pieces are concatenated, rather
# than going through groupby(...).apply(...): calling apply on a frame that
# still contains the grouping column is deprecated in pandas (it emits a
# DeprecationWarning) and is slated to stop passing that column to the
# function, which would drop 'sentiment' from the result.
# Groups are visited in sorted label order and each one is sampled with the
# same seed, so the selected rows and their order are unchanged.
balanced_movies = pd.concat(
    [
        group.sample(n=TARGET, random_state=RNG_SEED)
        for _, group in df_movies.groupby("sentiment")
    ],
    ignore_index=True,  # fresh 0..n-1 index for the balanced frame
)

# sanity-check
print(balanced_movies["sentiment"].value_counts())


sentiment
negative    5000
neutral     5000
positive    5000
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=TARGET,          # draw 5 000 rows


In [54]:
balanced_movies

Unnamed: 0,sentence,sentiment
0,"Richard Dreyfuss is, indeed, in this flick, bu...",negative
1,By no means is this movie as bad as 'Perfect S...,negative
2,I watched this movie for the first time the ot...,negative
3,Recipe for one of the worst movies of all time...,negative
4,"Despite the excellent cast, this is an unremar...",negative
...,...,...
14995,This is another fantasy favorite from Ralph Ba...,positive
14996,"Although I love this movie, I can barely watch...",positive
14997,"Anna (Charlotte Burke), who is just on the ver...",positive
14998,"Ordinarily, Anthony Mann made westerns with 't...",positive


In [55]:
# Add a new column named 'topic' with value 'movie' so these rows can still be
# told apart once the three datasets are concatenated
balanced_movies['topic'] = 'movie'

In [56]:
balanced_movies

Unnamed: 0,sentence,sentiment,topic
0,"Richard Dreyfuss is, indeed, in this flick, bu...",negative,movie
1,By no means is this movie as bad as 'Perfect S...,negative,movie
2,I watched this movie for the first time the ot...,negative,movie
3,Recipe for one of the worst movies of all time...,negative,movie
4,"Despite the excellent cast, this is an unremar...",negative,movie
...,...,...,...
14995,This is another fantasy favorite from Ralph Ba...,positive,movie
14996,"Although I love this movie, I can barely watch...",positive,movie
14997,"Anna (Charlotte Burke), who is just on the ver...",positive,movie
14998,"Ordinarily, Anthony Mann made westerns with 't...",positive,movie


In [57]:
# Concatenate all three DataFrames (they share the sentence / sentiment / topic
# columns). ignore_index=True gives the result one continuous index instead of
# repeating each frame's own row numbers.
df_combined = pd.concat([df_book, df_sports, balanced_movies], ignore_index=True)

In [58]:
df_combined

Unnamed: 0,sentence,sentiment,topic
0,This book was the very first bookmobile book I...,positive,book
1,"When I read the description for this book, I c...",negative,book
2,I just had to edit this review. This book is a...,positive,book
3,I don't normally buy 'mystery' novels because ...,positive,book
4,"This isn't the kind of book I normally read, a...",positive,book
...,...,...,...
49519,This is another fantasy favorite from Ralph Ba...,positive,movie
49520,"Although I love this movie, I can barely watch...",positive,movie
49521,"Anna (Charlotte Burke), who is just on the ver...",positive,movie
49522,"Ordinarily, Anthony Mann made westerns with 't...",positive,movie


In [59]:
df_combined["topic"].value_counts()

topic
sports    22524
movie     15000
book      12000
Name: count, dtype: int64

In [60]:
df_combined["sentiment"].value_counts()

sentiment
positive    19489
neutral     15251
negative    14784
Name: count, dtype: int64

In [61]:
# Save the combined DataFrame to a new CSV file.
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails on this final cell.
train_csv_path = Path("train_sets/train.csv")
train_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_combined.to_csv(train_csv_path, index=False)