## Extract, Transform, and Load the Dataset

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

reviews_df = pd.read_csv("Reviews.csv")

# Only keep reviews with a non-zero helpfulness rating.
# Rows with a missing Text are dropped as well: len() cannot be taken of NaN.
# .copy() makes the result an independent frame, so adding a column below does
# not write into a slice of the raw data (avoids SettingWithCopyWarning).
reviews_df = (
    reviews_df[reviews_df["HelpfulnessNumerator"] > 0]
    .dropna(subset=["Text"])
    .copy()
)

# Compute the length of each review and drop all reviews that are longer
# than 512 characters.
reviews_df["ReviewLength"] = reviews_df["Text"].str.len()
reviews_df = reviews_df[reviews_df["ReviewLength"] <= 512]

# Drop all unnecessary columns (returns a new frame instead of mutating in place).
reviews_df = reviews_df.drop(
    columns=["ProductId", "ProfileName", "HelpfulnessDenominator", "Time", "Summary"]
)

# Determine how many reviews correspond to each UserId.
# The Series is named explicitly before to_frame(): the default column name
# produced by value_counts() differs between pandas versions ("count" vs. the
# source column name), so renaming a "count" column afterwards is not portable.
review_counts = reviews_df["UserId"].value_counts()
review_count_df = (
    review_counts.rename("ReviewCount").rename_axis("UserId").to_frame()
)

# Only keep reviews by users with at least 20 reviews.
# Every UserId in review_count_df comes from reviews_df itself, so the outer
# join attaches the per-user count to each review without adding extra users.
reviews_df = reviews_df.set_index("UserId")
reviews_df = reviews_df.join(review_count_df, how="outer")
reviews_df = reviews_df[reviews_df["ReviewCount"] >= 20].copy()

print(reviews_df.shape)
reviews_df.head()

## Sentiment Analysis

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax

# Sentiment analysis pipeline.
# No model name is passed, so transformers loads its default sentiment model,
# not a specifically chosen RoBERTa checkpoint.
# NOTE(review): the classifier cell compares labels against 'POSITIVE', so the
# default model is assumed to emit POSITIVE/NEGATIVE labels - confirm, and
# consider pinning the model name for reproducibility.
sentiment_analysis_pipeline = pipeline('sentiment-analysis')

# Determine the sentiment label and score for each review.
# The pipeline is called once with the full list of texts and returns one
# {'label': ..., 'score': ...} dict per text, in input order.
# truncation=True guards against a review whose token count exceeds the
# model's maximum input length (the 512 limit applied earlier is in characters).
review_texts = reviews_df['Text'].tolist()
sentiment_objects = sentiment_analysis_pipeline(review_texts, truncation=True)
sentiment_labels = [obj['label'] for obj in sentiment_objects]
sentiment_confidence = [obj['score'] for obj in sentiment_objects]

# Append the sentiment scores and labels to the dataframe.
# assign() builds a new frame, so this is safe even if reviews_df is a
# filtered slice of another frame.
reviews_df = reviews_df.assign(
    SentimentLabels=sentiment_labels,
    SentimentConfidence=sentiment_confidence,
)

# Save the new dataframe as a csv file (the UserId index is written as the
# first column).
reviews_df.to_csv("transformed_reviews.csv", header=True, index=True)

reviews_df.head()

## Data Visualization

In [None]:
# Reload the transformed reviews from disk so this cell does not depend on the
# (slow) sentiment-analysis cell having been run in the current kernel.
data_df = pd.read_csv("transformed_reviews.csv")

# SentimentLabels is categorical, so plot one bar per label with its count
# rather than a numeric histogram over the label positions.
label_counts = data_df["SentimentLabels"].value_counts()

fig, ax = plt.subplots()
ax.bar(label_counts.index, label_counts.values)
ax.set(
    title="Reviews per sentiment label",
    xlabel="Sentiment label",
    ylabel="Number of reviews",
);

**Random Sample of 50 Positive and 50 Negative Reviews for T-test**


In [None]:
# pos_reviews = reviews_df["Score"] >= 4 # 4 stars or more is a positive review
# reviews_pos = reviews_df[pos_reviews]
# neg_reviews = reviews_df["Score"] <= 2 # 2 stars and below is a negative review
# reviews_neg = reviews_df[neg_reviews]

# reviews_pos.drop(columns=["Text", "ReviewLength"], inplace=True)
# reviews_neg.drop(columns=["Text", "ReviewLength"], inplace=True)

# print(reviews_pos.shape)
# reviews_pos.sample(50)

In [None]:
# print(reviews_neg.shape)
# reviews_neg.sample(50)

## Analysis 2: Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Fixed seed so the synthetic data, the train/validation split and the forest
# are identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Feature columns, in the order the model is trained on. The same list is used
# for the prediction frame so both arrays are guaranteed to line up.
FEATURE_COLUMNS = ['Score', 'ReviewLength', 'SentimentLabelsBoolean']

# Create random lists to use for prediction dataframe.
# The upper bound of integers() is exclusive: scores are 1..5, review lengths
# are 71..511 and the sentiment flag is 0 or 1.
random_score = rng.integers(1, 6, size=50)
random_review_length = rng.integers(71, 512, size=50)
random_pos_neg = rng.integers(2, size=50)

# Create dataframe to use for prediction
unlabelled_data = {'Score': random_score, 'ReviewLength': random_review_length, 'SentimentLabelsBoolean': random_pos_neg}
unlabelled_df = pd.DataFrame(data=unlabelled_data)

# Create new column to store boolean for POSITIVE (1) or NEGATIVE (0) SentimentLabels
data_df['SentimentLabelsBoolean'] = np.where(data_df['SentimentLabels'] == 'POSITIVE', 1, 0)

X = data_df[FEATURE_COLUMNS].values
X_ul = unlabelled_df[FEATURE_COLUMNS].values
# Target: the helpfulness count, treated by the classifier as a class label.
y = data_df['HelpfulnessNumerator'].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=SEED)

# Use RandomForestClassifier
randomf_model = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=SEED))
randomf_model.fit(X_train, y_train)
# Mean accuracy on the held-out validation split.
print(randomf_model.score(X_valid, y_valid))

# Predict for the synthetic rows and attach the result as a new column;
# predictions are in the same row order as unlabelled_df.
predictions = randomf_model.predict(X_ul)
unlabelled_df = unlabelled_df.assign(HelpfulnessPrediction=predictions)
print(unlabelled_df)