In [2]:
import pandas as pd
import numpy as np
from json import loads
import pandas as pd
import psycopg2 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [3]:
#Extract the features from DB
conn = psycopg2.connect("dbname='SQLBook' user='postgres' host='localhost' password='postgres'")
cur = conn.cursor()
cur.execute("SELECT r.reviewID,r.asin, cast(r.overall as int) as rating, cast(trim(leading '[' from substring(helpful from 0 for position(',' in helpful))) as int) as votesForReview,cast(trim(trailing ']' from substring(helpful from position(',' in helpful)+2)) as int) as outOf,reviewCount.cnt as reviewerCount,bookCount.cnt as bookreviewCount,EXTRACT(day from current_date - r.reviewTime) age,pr.nodeID,reviewtext,summary FROM reviews r LEFT OUTER JOIN (SELECT reviewID, count(*) cnt FROM reviews GROUP BY reviewID) reviewCount ON r.reviewID = reviewCount.reviewID LEFT OUTER JOIN (SELECT asin, count(*) cnt FROM reviews GROUP BY asin) bookCount ON r.asin = bookCount.asin LEFT OUTER JOIN Products pr ON r.asin = pr.asin")
rows = cur.fetchall()

df = pd.DataFrame(rows,columns=['reviewID','asin','rating','helpfulness','outOf','reviewerCount','bokReviewCount','reviewAge','nodeID','reviewtext','summary'])

#categories: dataframe that contains categories information from Asterixdb (nodeId and 5 levels of categories)
#categories.to_csv('categories.csv')

#reviewFeatures: dataframe that contains review features from Postgres
#reviewFeatures.to_csv('reviewFeatures.csv')

In [4]:
#Combining summary and review text
df['txt']=df['reviewtext']+df['summary']
df.drop(['reviewtext','summary'],axis=1,inplace=True)

In [5]:
#Get review length
df['reviewlen']=0

#Function to get review length
def rev_len(x):
    #print (x)
    x['reviewlen']=len(str(x['txt']))
    return x

df=df.apply(lambda x: rev_len(x),axis=1)

In [6]:
#Get number of sentences
def count_sentences(x):
    cnt=0
    num_exc=0
    for i in str(x):
        if i in ['.','?','!']:
            cnt=cnt+1
    for i in str(x):
        if i in ['?','!']:
            num_exc=num_exc+1
    return cnt,num_exc            

In [7]:
#Get average word length
def avg_word_len(x):
    word_list = str(x).split(' ')
    nwords = len(word_list)
    tot_len=0
    for w in word_list:
        tot_len=tot_len+len(w)
    return tot_len/nwords

In [8]:
#Get # of words, word length, # of sentences, ARI index
df['numwords']=0
df['avgwordlen']=0
df['num_sen']=0
df['num_exc']=0
df['ARI']=0
def numwords(x):
    x['numwords']=len(str(x['txt']).split(' '))
    x['avgwordlen'] = avg_word_len(x['txt'])
    x['num_sen'],x['num_exc'] = count_sentences(x['txt'])
    x['ARI'] = 4.71*(len(str(x['txt']))/float(x['numwords']+1)) + 0.5*(x['numwords']/float(x['num_sen']+1)) - 21.43
    return x

df=df.apply(lambda x: numwords(x),axis=1)

In [9]:
#Extract just the feature set from original df
new_df = df[['rating','helpfulness','outOf','reviewerCount','bokReviewCount','reviewAge','reviewlen','numwords','avgwordlen','num_sen','num_exc','ARI']]

In [10]:
#Training data is the reviews with more than 1 outof
train_df=new_df[new_df['outOf']!=0]

In [11]:
#Prediction data is the reviews which dont have any outof
test_df=new_df[new_df['outOf']==0]

In [12]:
#Get the training labels
y_train = train_df['helpfulness']/train_df['outOf']
#Drop the label data from train
train_df.drop(['helpfulness','outOf'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [13]:
#Split the data into train, validate, test
X_train, X_test, y_train, y_test = train_test_split(train_df, y_train, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=23)

In [14]:
#Normalize the features
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
X_train_n = scaler.transform(X_train)
X_test_n = scaler.transform(X_test)
X_val_n = scaler.transform(X_val)

In [15]:
# Fit linear regression
reg = LinearRegression()
reg.fit(X_train_n,y_train.values)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Predict the output for train and validate
y_train_pred = reg.predict(X_train_n)
y_val_pred=reg.predict(X_val_n)

In [17]:
#Check the error for train and validate
mean_squared_error(y_train.values,y_train_pred)
mean_squared_error(y_val.values,y_val_pred)

0.13914123018670685

In [18]:
#Predict for test data
y_test_pred=reg.predict(X_test_n)

In [19]:
#Check the error
mean_squared_error(y_test.values,y_test_pred)

0.13832383542274773

In [20]:
#Actual prediction on unknown labels
test_df.drop(['helpfulness','outOf'],axis=1,inplace=True)

test_df_n = scaler.transform(test_df) 

y_test_df=reg.predict(test_df_n)

#Store the redicted help ratio
test_df['help_ratio']=y_test_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [21]:
#Create help ratio in original df
new_df['help_ratio']=new_df['helpfulness']/new_df['outOf']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
# Join 
new_df=new_df.join(test_df['help_ratio'],how='left',rsuffix='1')

In [23]:
def final_ratio(x):
    if np.isnan(x['help_ratio']):
        x['help_ratio']=x['help_ratio1']
    return x

In [24]:
#Get the final ratio
new_df=new_df.apply(lambda x: final_ratio(x),axis=1)

In [25]:
new_df.drop('help_ratio1',inplace=True,axis=1)

In [26]:
df=df.join(new_df['help_ratio'],how='left')

In [27]:
df['rating*help_ratio']=df['rating']*df['help_ratio']

In [28]:
df_grouped = df.groupby('asin').sum()

In [29]:
def find_nodeid(x):
    try:
        x['nodeID']=df[df['asin']==x['asin']]['nodeID'].values[0]
    except:
        pass
    return x

In [30]:
df_grouped['nodeid']='s'
df_grouped=df_grouped.reset_index()
df_grouped=df_grouped.apply(lambda x: find_nodeid(x),axis=1)

In [31]:
def get_count(x):
    x['count']=len(df[df['asin']==x['asin']])
    return x

In [32]:
df_grouped['count']=0

In [33]:
df_grouped=df_grouped.reset_index().apply(lambda x: get_count(x),axis=1)

In [34]:
df_grouped['overall_rating1'] = (df_grouped['rating*help_ratio']/df_grouped['count']) + np.log(df_grouped['count'])

In [35]:
df_grouped=df_grouped.sort_values(by='overall_rating1',ascending=False)

In [36]:
categories = pd.read_csv('categories.csv')

In [37]:
df_grouped['level1']='S'
df_grouped['level2']='S'
df_grouped['level3']='S'
df_grouped['level4']='S'
df_grouped['level5']='S'

In [38]:
def find_categories(x):
    try:
        x['level1']=categories[categories['nodeID']==int(x['nodeID'])]['level_1'].values[0]
        x['level2']=categories[categories['nodeID']==int(x['nodeID'])]['level_2'].values[0]
        x['level3']=categories[categories['nodeID']==int(x['nodeID'])]['level_3'].values[0]
        x['level4']=categories[categories['nodeID']==int(x['nodeID'])]['level_4'].values[0]
        x['level5']=categories[categories['nodeID']==int(x['nodeID'])]['level_5'].values[0]
    except:
        try:
            x['level1']=categories[categories['nodeID']==int(x['nodeID'])].iloc[0]['level_1']
            x['level2']=categories[categories['nodeID']==int(x['nodeID'])].iloc[0]['level_2']
            x['level3']=categories[categories['nodeID']==int(x['nodeID'])].iloc[0]['level_3']
            x['level4']=categories[categories['nodeID']==int(x['nodeID'])].iloc[0]['level_4']
            x['level5']=categories[categories['nodeID']==int(x['nodeID'])].iloc[0]['level_5']
        except:
            pass
    return x

In [39]:
df_grouped=df_grouped.apply(lambda x: find_categories(x),axis=1)

In [40]:
df_grouped.drop(['nodeid','index'],inplace=True,axis=1)

In [41]:
df_grouped.columns

Index([u'asin', u'rating', u'helpfulness', u'outOf', u'reviewerCount',
       u'bokReviewCount', u'reviewAge', u'reviewlen', u'numwords',
       u'avgwordlen', u'num_sen', u'num_exc', u'ARI', u'help_ratio',
       u'rating*help_ratio', u'nodeID', u'count', u'overall_rating1',
       u'level1', u'level2', u'level3', u'level4', u'level5'],
      dtype='object')

In [42]:
df_grouped.head()

Unnamed: 0,asin,rating,helpfulness,outOf,reviewerCount,bokReviewCount,reviewAge,reviewlen,numwords,avgwordlen,...,help_ratio,rating*help_ratio,nodeID,count,overall_rating1,level1,level2,level3,level4,level5
3969,7386648,43247,12541,17622,9983,82119844,15662677.0,3688653,663472,37459,...,6874.121396,33211.278223,6343230011,9062,12.77674,History,Modern (16th-21st Centuries),19th Century,,
203,2007770,26398,13200,17695,7084,36012001,14773897.0,2777528,508814,24000,...,4489.910652,20421.643939,8622798011,6001,12.102722,Arts & Photography,Music,Songbooks,Brass,
2332,7124015,20021,14380,22809,5109,18671041,15334443.0,3085468,561706,17275,...,2700.61098,12787.523401,377888011,4321,11.330632,Computers & Technology,Web Development & Design,User Generated Content,,
2728,7167040,3753,1605,2260,1277,619369,3017413.0,568306,100467,3257,...,579.403561,2821.912479,3221491,787,10.253886,"Crafts, Hobbies & Home",Crafts & Hobbies,Scrapbooking,,
3564,7281447,5030,1394,2359,1340,1149184,2358335.0,731161,133316,4367,...,720.711784,3430.37891,282869,1072,10.177262,"Health, Fitness & Dieting",Nutrition,Antioxidants & Phytochemicals,,


In [43]:
conn = psycopg2.connect("dbname='SQLBook' user='postgres' host='localhost' password='postgres'")
cur = conn.cursor()
sql = """select asin,
          round(100*sum(case when month >= 3 and month < 6 then numunits else 0 end)/sum(numunits),2) as spring,
          round(100*sum(case when month >= 6 and month < 9 then numunits else 0 end)/sum(numunits),2) as summer,
          round(100*sum(case when month >= 9 and month < 12 then numunits else 0 end)/sum(numunits),2) as fall,
          round(100*sum(case when (month = 12 or month < 3) then numunits else 0 end)/sum(numunits),2) as winter 
          from 
          (select asin, EXTRACT(MONTH FROM orderdate) as month, case when l.numunits = 0 then 0.00001 else l.numunits end as numunits
          from customers c, orders o, orderlines l, products p
          where c.customerid = o.customerid
          and o.orderid = l.orderid
          and l.productid = p.productid
          ) as temp
          group by asin"""
cur.execute(sql)
rows = cur.fetchall()

In [44]:
seasons = pd.DataFrame(rows,columns=['asin','spring','summer','fall','winter'])

In [51]:
seasons.head()

Unnamed: 0,asin,spring,summer,fall,winter
0,116,41.66,48.83,5.66,3.86
1,868,1.92,0.0,21.98,76.1
2,13714,0.0,0.0,44.69,55.31
3,15393,23.53,47.06,29.41,0.0
4,29831,21.03,19.41,21.72,37.85


In [50]:
df_grouped.head()

Unnamed: 0,asin,rating,helpfulness,outOf,reviewerCount,bokReviewCount,reviewAge,reviewlen,numwords,avgwordlen,...,help_ratio,rating*help_ratio,nodeID,count,overall_rating1,level1,level2,level3,level4,level5
3969,7386648,43247,12541,17622,9983,82119844,15662677.0,3688653,663472,37459,...,6874.121396,33211.278223,6343230011,9062,12.77674,History,Modern (16th-21st Centuries),19th Century,,
203,2007770,26398,13200,17695,7084,36012001,14773897.0,2777528,508814,24000,...,4489.910652,20421.643939,8622798011,6001,12.102722,Arts & Photography,Music,Songbooks,Brass,
2332,7124015,20021,14380,22809,5109,18671041,15334443.0,3085468,561706,17275,...,2700.61098,12787.523401,377888011,4321,11.330632,Computers & Technology,Web Development & Design,User Generated Content,,
2728,7167040,3753,1605,2260,1277,619369,3017413.0,568306,100467,3257,...,579.403561,2821.912479,3221491,787,10.253886,"Crafts, Hobbies & Home",Crafts & Hobbies,Scrapbooking,,
3564,7281447,5030,1394,2359,1340,1149184,2358335.0,731161,133316,4367,...,720.711784,3430.37891,282869,1072,10.177262,"Health, Fitness & Dieting",Nutrition,Antioxidants & Phytochemicals,,


In [53]:
df_grouped=pd.merge(df_grouped,seasons,on='asin',how='left')

In [54]:
df_grouped.columns

Index([u'asin', u'rating', u'helpfulness', u'outOf', u'reviewerCount',
       u'bokReviewCount', u'reviewAge', u'reviewlen', u'numwords',
       u'avgwordlen', u'num_sen', u'num_exc', u'ARI', u'help_ratio',
       u'rating*help_ratio', u'nodeID', u'count', u'overall_rating1',
       u'level1', u'level2', u'level3', u'level4', u'level5', u'spring',
       u'summer', u'fall', u'winter'],
      dtype='object')