In [None]:
import sqlite3 as sq
import string
from functools import lru_cache

import numpy as np
import pandas as pd
import surprise
from numpy import interp
from textblob import TextBlob

In [None]:
def import_data(db_path):
    """Load the ``trunc_books`` review table from a SQLite database.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file to read.

    Returns
    -------
    pandas.DataFrame
        Every row and column of ``trunc_books``, with ``star_rating`` and
        ``helpful_votes`` cast to int and ``review_body`` /
        ``review_headline`` cast to str.

    Raises
    ------
    Whatever sqlite3 / pandas raise when the file or table cannot be read,
    or when a rating / vote value cannot be converted to a number.
    """
    conn = sq.connect(db_path)
    try:
        df = pd.read_sql_query('SELECT * FROM trunc_books;', conn)
    finally:
        # Always release the database handle, even if the query fails.
        conn.close()

    # Ratings go through float first so values such as '5.0' still convert.
    df['star_rating'] = df['star_rating'].astype(float).astype(int)
    df['helpful_votes'] = df['helpful_votes'].astype(int)

    # Text columns are forced to str so downstream string operations work.
    df['review_body'] = df['review_body'].astype(str)
    df['review_headline'] = df['review_headline'].astype(str)
    return df

In [None]:
def sentiment(row):
    """Score the raw review text of one row on a 1-5 scale.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose the review text under the ``'review_body'`` key.

    Returns
    -------
    float or None
        TextBlob's polarity, linearly rescaled from [-1, 1] to [1, 5];
        ``None`` when the row cannot be scored.
    """
    try:
        text = row['review_body']
        polarity = TextBlob(text).sentiment.polarity
        # Map polarity -1 -> 1 star, 0 -> 3 stars, +1 -> 5 stars.
        return interp(polarity, [-1, 1], [1, 5])
    except Exception:
        # Best-effort scoring: a row that cannot be processed yields None.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop a long-running df.apply.
        return None

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded once and cached."""
    from nltk.corpus import stopwords

    return frozenset(stopwords.words('english'))


def clean_sentiment(row):
    """Score one row's review text on a 1-5 scale after basic text cleaning.

    The text is split on whitespace, punctuation is stripped from each token,
    non-alphabetic tokens and English stop words are dropped, and the
    remaining words are re-joined and scored with TextBlob.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose the review text under the ``'review_body'`` key.

    Returns
    -------
    float or None
        TextBlob's polarity of the cleaned text, linearly rescaled from
        [-1, 1] to [1, 5]; ``None`` when the row cannot be scored.
    """
    try:
        text = row['review_body']
        tokens = [token.translate(_PUNCTUATION_TABLE) for token in text.split()]

        # NOTE(review): the stop-word comparison is case-sensitive, so a
        # capitalised token such as 'The' is kept -- confirm this is intended.
        stop_words = _english_stop_words()
        kept = [
            token for token in tokens
            if token.isalpha() and token not in stop_words
        ]

        polarity = TextBlob(' '.join(kept)).sentiment.polarity
        # Map polarity -1 -> 1 star, 0 -> 3 stars, +1 -> 5 stars.
        return interp(polarity, [-1, 1], [1, 5])
    except Exception:
        # Best-effort scoring: a row that cannot be processed yields None.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop a long-running df.apply.
        return None

In [None]:
# Location of the SQLite file holding the wrangled review data.
path = '/db/wrangled_reviews.db'

# Read the review table into the working DataFrame.
df = import_data(db_path=path)

In [None]:
# Row-wise sentiment scores: one from the raw text, one from the cleaned text.
df['sentiment_star_rating'] = df.apply(sentiment, axis='columns')
df['cleaned_sentiment_star_rating'] = df.apply(clean_sentiment, axis='columns')

# Absolute gap between the given rating and the cleaned-text sentiment score.
df['star_rating'] = df['star_rating'].astype('float64')
df['difference'] = (df['star_rating'] - df['cleaned_sentiment_star_rating']).abs()

# Length features: number of spaces in the text plus one.
df['review_length'] = df['review_body'].str.count(' ').add(1)
df['headline_length'] = df['review_headline'].str.count(' ').add(1)