In [101]:
import numpy as np
import pandas as pd
import nltk 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from textblob import TextBlob
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
# %matplotlib inline
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
cf.go_offline()
import plotly.graph_objs as go 
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings('ignore')
warnings.warn('this will not show')

pd.set_option('display.max_columns', None)

In [102]:
# Load the review data from the working directory. Later cells read the
# 'Unnamed: 0', 'wilson_lower_bound' and 'reviewText' columns of this frame.
df = pd.read_csv('amazon.csv')

In [103]:
# Order reviews by their Wilson lower bound (highest first) and discard the
# 'Unnamed: 0' column (presumably a saved row index from the CSV -- confirm).
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column has already been removed.
df = (
    df.sort_values('wilson_lower_bound', ascending=False)
      .drop(columns='Unnamed: 0', errors='ignore')
)

In [104]:
def missing_value_analysis(df):
    """Summarise the columns of ``df`` that contain at least one null.

    Returns a DataFrame indexed by column name with two columns:
    'Missing Values' (null count) and 'Ratio' (null count as a percentage of
    the row count, rounded to 2 decimals). Rows are ordered by ascending
    null count; columns without nulls are left out.
    """
    cols_with_nulls = [name for name in df.columns if df[name].isnull().sum() > 0]
    null_counts = df[cols_with_nulls].isnull().sum()

    counts_sorted = null_counts.sort_values(ascending=True)
    percent_sorted = (null_counts / df.shape[0] * 100).sort_values(ascending=True)

    summary = pd.concat(
        [counts_sorted, np.round(percent_sorted, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )
    return pd.DataFrame(summary)

def check_dataframe(df, head = 5, tail = 5):
    """Print a quick structural overview of ``df``.

    Sections are printed in this order: shape, column dtypes, the
    missing-value table produced by ``missing_value_analysis``, the number of
    duplicated rows, and selected quantiles of the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    head, tail : int
        Accepted for backward compatibility; this function does not use them.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('SHAPE'.center(82, '~'))
    print('Rows: {}'.format(df.shape[0]))
    print('Columns: {}'.format(df.shape[1]))
    print('TYPES:'.center(82, '~'))
    print(df.dtypes)
    print(''.center(82, '~'))
    print(missing_value_analysis(df))
    print('DUPLICATED VALUES'.center(82, '~'))
    print(df.duplicated().sum())
    print('QUANTILES'.center(82, '~'))
    # numeric_only=True is required on pandas >= 2.0, where the default became
    # False and string columns make quantile() raise TypeError.
    print(df.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

# check_dataframe(df)


In [105]:
def check_class(dataframe):
    """Count the distinct values of every column in ``dataframe``.

    Returns a DataFrame with columns 'Variable' (column name) and 'Classes'
    (number of unique values), sorted by 'Classes' in descending order and
    re-indexed from 0.
    """
    distinct_counts = [dataframe[name].nunique() for name in dataframe.columns]
    summary = pd.DataFrame({'Variable': dataframe.columns, 'Classes': distinct_counts})
    return summary.sort_values('Classes', ascending=False).reset_index(drop=True)

# check_class(df)

In [106]:
# Five-colour palette used as the marker colours of both the bar trace and the
# pie trace in categorical_variable_summary.
constraints = ['#B34D22', '#EBE00C', '#1FEB0C', '#0C92EB', '#EB0CD5']


def categorical_variable_summary(df, column_name):
    """Show the value counts of ``df[column_name]`` as a bar chart and a pie chart.

    The figure has one row and two columns: a bar trace titled 'Countplot' on
    the left and a pie trace titled 'Percentages' on the right. Both traces
    take their colours from the module-level ``constraints`` palette. The
    figure is displayed with ``iplot``; nothing is returned.
    """
    counts = df[column_name].value_counts()
    bar_heights = counts.values.tolist()
    bar_labels = [str(category) for category in counts.index]

    bar_trace = go.Bar(
        x=bar_labels,
        y=bar_heights,
        text=list(bar_heights),
        textfont=dict(size=14),
        name=column_name,
        textposition='auto',
        showlegend=False,
        marker=dict(color=constraints, line=dict(color='#DBE6EC', width=1)),
    )
    pie_trace = go.Pie(
        labels=counts.keys(),
        values=counts.values,
        textfont=dict(size=18),
        textposition='auto',
        showlegend=False,
        name=column_name,
        marker=dict(colors=constraints),
    )

    # The pie trace needs a 'domain' cell; the bar trace uses ordinary xy axes.
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Countplot', 'Percentages'),
        specs=[[{'type': 'xy'}, {'type': 'domain'}]],
    )
    fig.add_trace(bar_trace, row=1, col=1)
    fig.add_trace(pie_trace, row=1, col=2)

    title_settings = {'text': column_name, 'y': 0.9, 'x': 0.5,
                      'xanchor': 'center', 'yanchor': 'top'}
    fig.update_layout(title=title_settings, template='plotly_white')

    iplot(fig)


In [107]:
# categorical_variable_summary(df, 'overall')

In [108]:
def rt(x):
    """Return ``str(x)`` with every character outside a-z / A-Z replaced by a space."""
    return re.sub('[^a-zA-Z]', ' ', str(x))


# Missing reviews are filled with '' first; otherwise str(NaN) would turn them
# into the literal word "nan" and that word would be scored as review text.
df['reviewText'] = df['reviewText'].fillna('').map(rt).str.lower()
# df.head()

In [109]:
# NOTE: this import rebinds SentimentIntensityAnalyzer, replacing the
# nltk.sentiment.vader class imported at the top of the notebook.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TextBlob's sentiment is unpacked into one column per component.
df[['polarity', 'subjectivity']] = df['reviewText'].apply(
    lambda Text: pd.Series(TextBlob(Text).sentiment))

# Build the analyzer once instead of once per review.
_vader_analyzer = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Label ``text`` by comparing VADER's 'neg' and 'pos' scores.

    Returns 'Negative' when neg > pos, 'Positive' when pos > neg, and
    'Neutral' when the two scores are equal.
    """
    score = _vader_analyzer.polarity_scores(text)
    if score['neg'] > score['pos']:
        return 'Negative'
    if score['pos'] > score['neg']:
        return 'Positive'
    return 'Neutral'


# Series.iteritems() was removed in pandas 2.0. Mapping the column avoids it,
# and also avoids row-by-row df.loc writes, which mislabel rows when index
# labels repeat.
df['sentiment'] = df['reviewText'].map(_vader_label)

In [110]:
# Reviews labelled 'Positive', ordered by wilson_lower_bound (highest first).
df[df['sentiment']=='Positive'].sort_values('wilson_lower_bound', ascending=False)

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound,polarity,subjectivity,sentiment
2031,"Hyoun Kim ""Faluzure""",5,update so my lovely wife boug...,05-01-2013,702,1952,68,2020,1884,0.966337,0.957544,0.163859,0.562259,Positive
3449,NLee the Engineer,5,i have tested dozens of sdhc and micro sdhc ca...,26-09-2012,803,1428,77,1505,1351,0.948837,0.936519,0.103870,0.516435,Positive
4212,SkincareCEO,1,note please read the last update scroll to ...,08-05-2013,579,1568,126,1694,1442,0.925620,0.912139,0.212251,0.505394,Positive
317,"Amazon Customer ""Kelly""",1,if your card gets hot enough to be painful it...,09-02-2012,1033,422,73,495,349,0.852525,0.818577,0.143519,0.494207,Positive
4672,Twister,5,sandisk announcement of the first gb micro ...,03-07-2014,158,45,4,49,41,0.918367,0.808109,0.172332,0.511282,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4728,Vinay Tannan,5,great micro sd high speed card for digital pho...,15-06-2014,176,0,0,0,0,0.000000,0.000000,0.240000,0.397500,Positive
4726,Vildo H Selgado,5,great for my lg optimus g and super fast trans...,24-01-2014,318,0,0,0,0,0.000000,0.000000,0.458333,0.629167,Positive
4725,"Vikram Rao ""Viks""",5,what you buy this for is in there gb fast ...,27-04-2013,590,0,0,0,0,0.000000,0.000000,0.500000,0.800000,Positive
4723,Victor Santana,5,memory arrived intact delivery was fast and i ...,06-04-2014,246,0,0,0,0,0.000000,0.000000,0.350000,0.612500,Positive


In [111]:
# categorical_variable_summary(df, 'sentiment')

In [112]:
def solve(reviewText):
    """Classify a single review as 'Negative', 'Positive' or 'Neutral'.

    The input is converted with ``str()``, every character outside a-z / A-Z
    is replaced by a space, and the result is lower-cased. The cleaned text
    is then scored with ``SentimentIntensityAnalyzer.polarity_scores``.

    Parameters
    ----------
    reviewText : object
        The review to classify; anything accepted by ``str()``.

    Returns
    -------
    str
        'Negative' when the 'neg' score exceeds 'pos', 'Positive' when 'pos'
        exceeds 'neg', otherwise 'Neutral'.
    """
    cleaned = re.sub('[^a-zA-Z]', ' ', str(reviewText)).lower()

    # Scoring the string directly avoids the one-row DataFrame and the
    # Series.iteritems() call, which was removed in pandas 2.0. TextBlob
    # polarity/subjectivity is not computed because it never affected the
    # returned label.
    score = SentimentIntensityAnalyzer().polarity_scores(cleaned)
    if score['neg'] > score['pos']:
        return 'Negative'
    if score['pos'] > score['neg']:
        return 'Positive'
    return 'Neutral'
