In [1]:
import pandas as pd
import sqlite3 as sq
import surprise

In [2]:
# Set up data
# NOTE(review): this is a machine-specific absolute path; the notebook only runs
# where the SQLite file exists at exactly this location.
path = '/Users/Dustin/GT_project/wrangled_reviews.db'
def import_data(db_path):
    """Load the `trunc_books` review table from a SQLite file into a DataFrame.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        Every column of `trunc_books`, with `star_rating` and `helpful_votes`
        cast to int and `review_body` / `review_headline` cast to str.
    """
    conn = sq.connect(db_path)  # SQLite DB path goes in parentheses
    try:
        df = pd.read_sql_query('''
                SELECT *
                FROM trunc_books
                ;
                ''', conn)
    finally:
        # Always release the database handle, even when the query fails.
        conn.close()

    # Ratings go through float before int; presumably they can be stored as
    # decimal text such as '5.0' -- TODO confirm against the table schema.
    df['star_rating'] = df['star_rating'].astype(float).astype(int)
    df['helpful_votes'] = df['helpful_votes'].astype(int)  # vote counts as integers

    # Text columns are forced to str so later string operations never see
    # non-string values.
    df['review_body'] = df['review_body'].astype(str)
    df['review_headline'] = df['review_headline'].astype(str)

    return df

# Load the review table once; the cells below read from and add columns to this frame.
df = import_data(path)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(5)
#len(df)
#df.dtypes

Unnamed: 0,customer_id,product_id,product_parent,star_rating,helpful_votes,review_headline,review_body,product_category,marketplace,review_date,verified_purchase
0,40676812,1938067126,402004849,5,0,Five Stars,Excellent--,Books,US,2015-08-31,Y
1,2784618,014017737X,779170984,5,0,Five Stars,"Arrived before estimated delivery date, just a...",Books,US,2015-08-31,Y
2,2876528,0982207743,225126623,3,1,Three Stars,"Recipes are not complicated, but ingredients a...",Books,US,2015-08-31,Y
3,33678379,080072433X,42136245,3,0,Hard to Get Into,This book I actually had a hard time with. I p...,Books,US,2015-08-31,N
4,32159651,0615815650,625464646,3,1,Run of the mill,I've read quite a few books about persuasion p...,Books,US,2015-08-31,Y


In [4]:
def count_words(data):
    """Return the number of whitespace-separated words in `data`.

    Splitting on any run of whitespace (rather than on a single space) means
    an empty or blank string counts as 0 words, and repeated spaces, tabs or
    newlines between words do not inflate the count.

    Parameters
    ----------
    data : str
        Text to count words in.

    Returns
    -------
    int
        Number of words.
    """
    return len(data.split())

# Word counts for the review text and for its headline. Each column is created
# directly by the assignment, so no zero-filled placeholder is needed first.
df['review_word_count'] = df['review_body'].map(count_words)
df['review_hl_count'] = df['review_headline'].map(count_words)


Add in Sentiment Analysis

In [5]:
from textblob import TextBlob
from numpy import interp

In [6]:
def sentiment(row):
    """Score a review's raw text with TextBlob and rescale it to a 1-5 range.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the raw review text under the key 'review_body'.

    Returns
    -------
    float or None
        The TextBlob polarity score linearly mapped from [-1, 1] onto [1, 5],
        or None when the row could not be scored.
    """
    try:
        # raw text of review
        blob = TextBlob(row['review_body'])
        # sentiment polarity score
        polarity = blob.sentiment.polarity
        return interp(polarity, [-1, 1], [1, 5])
    except Exception:
        # Best-effort scoring: an unscorable row yields None instead of aborting
        # the whole DataFrame.apply. Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit still stop a long-running apply.
        return None

In [7]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        from nltk.corpus import stopwords
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_sentiment(row):
    """Score a review after stripping punctuation, non-alphabetic tokens and stop words.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the raw review text under the key 'review_body'.

    Returns
    -------
    float or None
        The TextBlob polarity of the cleaned text linearly mapped from [-1, 1]
        onto [1, 5], or None when the row could not be scored.

    Notes
    -----
    The stop-word list is loaded outside the per-row error handling, so a
    missing NLTK install or corpus raises immediately instead of silently
    producing None for every row.
    """
    import string

    stop_words = _english_stop_words()
    # Translation table that deletes every ASCII punctuation character.
    punctuation_table = str.maketrans('', '', string.punctuation)

    try:
        # split raw text of review into tokens and remove punctuation from each
        tokens = [w.translate(punctuation_table) for w in row['review_body'].split()]

        # keep alphabetic tokens that are not stop words
        # NOTE(review): the comparison is case-sensitive, so capitalised stop
        # words (e.g. "The") are kept -- confirm whether that is intended.
        tokens = [w for w in tokens if w.isalpha() and w not in stop_words]

        # find sentiment of the cleaned text and convert to 1-5 scale
        polarity = TextBlob(' '.join(tokens)).sentiment.polarity
        return interp(polarity, [-1, 1], [1, 5])
    except Exception:
        # Best-effort scoring: an unscorable row yields None instead of aborting
        # the whole DataFrame.apply; KeyboardInterrupt / SystemExit still propagate.
        return None

In [8]:
# Score every review row-wise with sentiment(); axis=1 passes each row to the function.
df['sentiment_star_rating'] = df.apply(sentiment,axis=1)
# Disabled: cleaned-text variant and its absolute difference from the star rating.
#df['cleaned_sentiment_star_rating'] = df.apply(clean_sentiment,axis=1)
#df['star_rating']=df['star_rating'].astype(float)
#df['difference'] = abs(df['star_rating'] - df['cleaned_sentiment_star_rating'])


In [9]:
# Preview the frame again now that the word-count and sentiment columns exist.
df.head(5)

Unnamed: 0,customer_id,product_id,product_parent,star_rating,helpful_votes,review_headline,review_body,product_category,marketplace,review_date,verified_purchase,review_word_count,review_hl_count,sentiment_star_rating
0,40676812,1938067126,402004849,5,0,Five Stars,Excellent--,Books,US,2015-08-31,Y,1,2,5.0
1,2784618,014017737X,779170984,5,0,Five Stars,"Arrived before estimated delivery date, just a...",Books,US,2015-08-31,Y,8,2,3.0
2,2876528,0982207743,225126623,3,1,Three Stars,"Recipes are not complicated, but ingredients a...",Books,US,2015-08-31,Y,10,2,3.5
3,33678379,080072433X,42136245,3,0,Hard to Get Into,This book I actually had a hard time with. I p...,Books,US,2015-08-31,N,277,4,3.076869
4,32159651,0615815650,625464646,3,1,Run of the mill,I've read quite a few books about persuasion p...,Books,US,2015-08-31,Y,22,4,2.624242


In [10]:
# Number of review rows in the frame.
len(df)

268649

In [None]:
# Numeric columns used by the Yellowbrick visualisations below.
X_dat = df[['product_parent','star_rating','helpful_votes','review_word_count','review_hl_count','sentiment_star_rating']]



# YellowBrick Viz


In [None]:
from sklearn import datasets
from yellowbrick.target import FeatureCorrelation

# Build the regression inputs from the review frame: numeric feature columns as X,
# the star rating as the target y.
feature_names = ['product_parent', 'helpful_votes', 'review_word_count',
                 'review_hl_count', 'sentiment_star_rating']
X, y = df[feature_names], df['star_rating']

# Correlation of each feature with the star rating, labelled by column name.
visualizer = FeatureCorrelation(labels=feature_names)
visualizer.fit(X, y)
visualizer.poof()

In [None]:
from yellowbrick.features import Rank2D
%matplotlib inline

# Pairwise Pearson ranking of the numeric columns in X_dat.
visualizer = Rank2D(algorithm="pearson")
visualizer.fit_transform(X_dat)
visualizer.poof()

In [None]:
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the actual star rating against the sentiment-derived rating.
visualizer = JointPlotVisualizer(feature='star_rating', target='sentiment_star_rating')
visualizer.fit(X_dat['star_rating'], X_dat['sentiment_star_rating'])
visualizer.poof()

In [None]:
from sklearn.cluster import MiniBatchKMeans

from yellowbrick.cluster import KElbowVisualizer

# Instantiate the clustering model and visualizer
# k=(4,12) is the cluster-count range handed to the elbow search.
# NOTE(review): MiniBatchKMeans is created without a random_state, so the curve
# may differ between runs.
visualizer = KElbowVisualizer(MiniBatchKMeans(), k=(4,12))

visualizer.fit(X_dat) # Fit the training data to the visualizer
visualizer.poof() # Draw/show/poof the data

In [None]:
from sklearn.cluster import MiniBatchKMeans

from yellowbrick.cluster import SilhouetteVisualizer

# Instantiate the clustering model and visualizer
# The positional 7 is the number of clusters.
# NOTE(review): no random_state is set, so cluster assignments may differ between runs.
model = MiniBatchKMeans(7)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_dat) # Fit the training data to the visualizer
visualizer.poof() # Draw/show/poof the data

# Modeling in scikit-learn

In [26]:
from scipy.sparse import csr_matrix
# Customer x product rating matrix: one row per customer_id, one column per
# product_id, missing pairs filled with 0 (pivot_table averages duplicate pairs
# by default).
df_pivot = df.pivot_table(index='customer_id',columns='product_id',values='star_rating',fill_value=0)
# Transpose so that each row is a product and each column a customer.
X = df_pivot.T

In [29]:
from sklearn.decomposition import TruncatedSVD
# Reduce each product's rating vector to 12 latent components; random_state is
# fixed for reproducibility.
# NOTE(review): the name SVD is rebound later by `from surprise import SVD`.
SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
matrix.shape

(33702, 12)

In [None]:
import numpy as np
# Correlation between every pair of rows (products) of the reduced matrix.
# NOTE(review): with the (33702, 12) shape shown above, the result is
# 33702 x 33702 -- roughly 9 GB as float64; confirm it fits in memory.
corr = np.corrcoef(matrix)
corr.shape

# Modeling in Surprise

In [None]:
from surprise import Reader, Dataset

# To load a dataset from a pandas DataFrame, use the `load_from_df` method of the Surprise library.

# NOTE(review): the assignment below rebinds `df` to a three-column frame
# (itemID, userID, rating). Re-running this cell, or any earlier cell that reads
# the original review columns, fails afterwards until the data is reloaded.
ratings_dict = {'itemID': list(df.product_id),
                'userID': list(df.customer_id),
                'rating': list(df.star_rating)}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
# The Reader class is used to parse a file containing ratings.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)


In [None]:
### Using the Surprise package
# Evaluate with 5-fold cross-validation. The folds are requested through
# `cv=5` on cross_validate; the legacy `data.split(n_folds=5)` call and the
# `evaluate` import were dropped because they belong to the older Surprise API
# (no longer available in later releases) and are not used by cross_validate.

from surprise import SVD
from surprise import NMF, model_selection

# svd
algo = SVD()
model_selection.cross_validate(algo, data, measures=['RMSE'], cv=5)




In [None]:
# nmf
algo = NMF()
# nmf_mod holds the cross-validation results (a dict of per-fold measures and
# timings), not a fitted model.
nmf_mod = model_selection.cross_validate(algo, data, measures=['RMSE'])

# The results dict has no .fit(); to obtain a trained model, fit the algorithm
# itself on a trainset built from the whole dataset.
algo.fit(data.build_full_trainset())

In [None]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return a user's rated items and the top predicted items they have not rated.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings, one row per user (row position = userID - 1) and one
        column per item. The column index must be named 'movie_id' so it can be
        merged against `movies`.
    userID : int
        1-based user id; also used to filter `original_ratings.user_id`.
    movies : pandas.DataFrame
        Item metadata containing a 'movie_id' column.
    original_ratings : pandas.DataFrame
        Observed ratings with 'user_id', 'movie_id' and 'rating' columns.
    num_recommendations : int
        Maximum number of recommendations to return.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `user_full`: the user's ratings joined with item metadata, highest
        rating first. `recommendations`: unrated items ordered by predicted
        rating (the prediction column itself is dropped from the result).
    """
    # Get and sort the user's predictions (read from the `predictions` argument,
    # not from a global).
    user_row_number = userID - 1  # User ID starts at 1, not 0
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False)

    # Name the score column explicitly so the merge result does not depend on
    # the label of the user's row in `predictions`.
    predicted_scores = sorted_user_predictions.rename('Predictions').reset_index()

    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.user_id == (userID)]
    user_full = (user_data.merge(movies, how='left', left_on='movie_id', right_on='movie_id').
                     sort_values(['rating'], ascending=False)
                 )

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unseen_movies = movies[~movies['movie_id'].isin(user_full['movie_id'])]
    recommendations = (unseen_movies.
         merge(predicted_scores, how='left',
               left_on='movie_id',
               right_on='movie_id').
         sort_values('Predictions', ascending=False).
                       iloc[:num_recommendations, :-1]  # :-1 drops the Predictions column
                      )

    return user_full, recommendations