In [1]:
import pandas as pd
from pandas import concat
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from nltk.tokenize.toktok import ToktokTokenizer
import cleantext
from contractions import CONTRACTION_MAP
from stopwords import STOPWORD_MAP

In [2]:
# Load the raw reviews from CSV; this frame is kept as the untouched source
# that later cells derive processed_df from.
unprocessed_df = pd.read_csv("data/reviews.csv")

In [3]:
# Preview the first five rows of the raw data.
unprocessed_df.head(5)

Unnamed: 0,Id,Score,Text
0,1,5,I have bought several of the Vitality canned d...
1,2,1,Product arrived labeled as Jumbo Salted Peanut...
2,3,4,This is a confection that has been around a fe...
3,4,2,If you are looking for the secret ingredient i...
4,5,5,Great taffy at a great price. There was a wid...


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
unprocessed_df.describe()

Unnamed: 0,Id,Score
count,568454.0,568454.0
mean,284227.5,4.183199
std,164098.679298,1.310436
min,1.0,1.0
25%,142114.25,4.0
50%,284227.5,5.0
75%,426340.75,5.0
max,568454.0,5.0


In [5]:
# Count the non-null entries of each column per Score value to inspect the
# class balance of the raw data.
unprocessed_df.groupby("Score").count()

Unnamed: 0_level_0,Id,Text
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
1,52268,52268
2,29769,29769
3,42640,42640
4,80655,80655
5,363122,363122


## Convert scores into positive or negative

In [6]:
# Drop the Id column. drop() returns a new frame, so unprocessed_df itself
# is left unchanged.
processed_df = unprocessed_df.drop(columns = "Id")

def convert_score(df):
    '''
    Turns the "Score" column into a binary "Sentiment" column.

    Rows whose Score is 3 are discarded. Scores 1 and 2 become sentiment 0,
    scores 4 and 5 become sentiment 1. The "Score" column is removed from
    the result and "Sentiment" is appended as the last column.

    The input frame is not modified; a new frame is returned.
    '''
    score_to_sentiment = {1: 0, 2: 0, 4: 1, 5: 1}

    # Index labels of the score-3 rows that have to go.
    neutral_labels = df.loc[df["Score"] == 3].index
    kept = df.drop(neutral_labels)

    labelled = kept.assign(Sentiment = kept["Score"].map(score_to_sentiment))
    return labelled.drop(columns = "Score")

In [7]:
# Build the labelled frame straight from the raw data instead of from
# processed_df itself. The previous self-referential form
# (processed_df = convert_score(processed_df)) raised a KeyError on the
# already-removed "Score" column when the cell was run a second time.
processed_df = convert_score(unprocessed_df.drop(columns = "Id"))

# Number of reviews per sentiment class (0 = negative, 1 = positive).
processed_df.groupby("Sentiment").count()

Unnamed: 0_level_0,Text
Sentiment,Unnamed: 1_level_1
0,82037
1,443777


## Create dataframe with negative and positive reviews

In [8]:
# Build a balanced data set with the same number of reviews per class.
SAMPLES_PER_CLASS = 50000
# Fixed seed so the shuffled order (and every artefact derived from it) is
# reproducible across kernel restarts; same value as the train/test split.
SHUFFLE_SEED = 10

# neg reviews
# head() takes the first SAMPLES_PER_CLASS rows by position. The earlier
# truncate(before = 1, after = 50000) skipped the very first row of each
# class and returned one row too few when the cell was re-run on an
# already balanced frame.
neg_df = processed_df[processed_df.Sentiment == 0].head(SAMPLES_PER_CLASS)

# pos reviews
pos_df = processed_df[processed_df.Sentiment == 1].head(SAMPLES_PER_CLASS)

# combine into one df and randomly shuffle reviews
processed_df = concat([pos_df, neg_df], ignore_index = True)
processed_df = processed_df.sample(frac = 1, random_state = SHUFFLE_SEED).reset_index(drop = True)

## Clean text for analysis

In [9]:
def remove_stopwords(text, map = STOPWORD_MAP):
    '''
    Removes stopwords from text.

    The text is split on whitespace; every token that is a key of ``map``
    is replaced by its mapped value, all other tokens are kept as they are.
    The tokens are then joined back together with single spaces.
    (Presumably ``map`` sends each stopword to an empty string - verify
    against the stopwords module.)
    '''
    kept_tokens = []
    for token in text.split():
        if token in map:
            kept_tokens.append(map[token])
        else:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def expand_contractions(text, map = CONTRACTION_MAP):
    '''
    Expands contractions.

    The text is split on whitespace; every token that is a key of ``map``
    is replaced by its mapped value, all other tokens are kept as they are.
    Matching is exact, so a token only expands when it equals a key
    character for character. The tokens are re-joined with single spaces.
    '''
    expanded_tokens = []
    for token in text.split():
        replacement = map[token] if token in map else token
        expanded_tokens.append(replacement)
    return ' '.join(expanded_tokens)


def cleaner(df, text):
    '''
    Applies text cleaning to all items in dataframe.

    Every entry of df["Text"] is passed through cleantext.clean
    (lowercasing, extra-space / number / punctuation removal, stemming),
    then expand_contractions, then remove_stopwords. The cleaned strings
    are appended to ``text`` in row order.

    df   -- DataFrame with a "Text" column
    text -- list that is extended in place with one cleaned string per row

    Returns nothing; prints the length of ``text`` when done.
    '''
    # Iterate over the column values instead of df.at[i, "Text"] with
    # i in range(len(df)): the label-based lookup only worked when the
    # index was exactly 0..n-1 and raised KeyError (or read the wrong rows)
    # for any filtered or otherwise re-indexed frame.
    for raw_text in df["Text"]:
        # NOTE(review): punctuation is stripped and words are stemmed before
        # the contraction and stopword lookups run, so keys such as "don't"
        # or un-stemmed stopwords may never match - confirm the intended order.
        clean_text = cleantext.clean(raw_text, lowercase = True, extra_spaces = True, numbers = True, punct = True, stemming = True)
        clean_text = expand_contractions(clean_text)
        clean_text = remove_stopwords(clean_text)
        text.append(clean_text)
    print(len(text))
                             

In [10]:
# Accumulator that cleaner() extends in place with one cleaned string per
# row of processed_df. It is re-created here so that re-running the cell
# starts from an empty list.
processed_text = []

cleaner(processed_df, processed_text)

100000


In [11]:
# Attach the cleaned text and remove the raw column.
# assign() returns a new frame instead of mutating the existing one, and
# errors = "ignore" lets this cell be re-run after "Text" has already been
# dropped (previously a second run raised KeyError).
processed_df = (
    processed_df
    .assign(**{"Edited Text": processed_text})
    .drop(columns = ["Text"], errors = "ignore")
)
processed_df.head(5)

Unnamed: 0,Sentiment,Edited Text
0,0,realli want good but veri bland tast no...
1,0,alway fan rao marinara sauc shell pr...
2,0,order thi cereal high hope head especi give...
3,0,idea basic good one howev larg papillon al...
4,0,guess either love dont recent read cafe...


## Train and test datasets

In [12]:
# Split the cleaned reviews and their labels into train and test sets:
# 25% of the rows are held out for testing and the seed is fixed.
review_texts = processed_df["Edited Text"]
review_labels = processed_df["Sentiment"]

X_train, X_test, Y_train, Y_test = train_test_split(
    review_texts,
    review_labels,
    test_size = 0.25,
    random_state = 10,
)

In [13]:
# TF-IDF vectorizer configuration, collected in one place for readability.
tfidf_settings = dict(
    encoding = 'utf-8',
    ngram_range = (1, 2),   # unigrams and bigrams
    max_df = 1.0,           # no upper document-frequency cut-off
    min_df = 10,            # ignore terms found in fewer than 10 documents
    max_features = 500,     # cap the vocabulary size
    norm = 'l2',
    sublinear_tf = True,    # use 1 + log(tf) instead of raw term counts
)

tfidf = TfidfVectorizer(**tfidf_settings)

In [14]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
# .toarray() converts the sparse TF-IDF matrices into dense NumPy arrays.
train_features = tfidf.fit_transform(X_train).toarray()
test_features = tfidf.transform(X_test).toarray()

In [15]:
# Aliases only: train_labels / test_labels refer to the same objects as
# Y_train / Y_test, no copy is made.
train_labels = Y_train
test_labels = Y_test

## Save files for analysis

In [16]:
# Objects to persist, in the order they are written. Each one is pickled
# to data/<name>.pickle; the file names match the variable names.
artefacts_to_save = [
    ("processed_df", processed_df),
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("train_features", train_features),
    ("train_labels", train_labels),
    ("X_test", X_test),
    ("Y_test", Y_test),
    ("test_features", test_features),
    ("test_labels", test_labels),
    ("tfidf", tfidf),
]

# One loop instead of ten copy-pasted with/dump stanzas, so a file name can
# no longer drift from the object it is supposed to hold.
for artefact_name, artefact in artefacts_to_save:
    with open("data/{}.pickle".format(artefact_name), "wb") as data:
        pickle.dump(artefact, data)