In [1]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from langdetect import detect
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Imputer, FunctionTransformer
import nltk
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook, show
output_notebook()

In [2]:
# Load the cleaned review sample. index_col=False tells pandas not to use the
# first CSV column as the index.
df = pd.read_csv(
    '../Cleaned_Data/cleaned_data_v1_1000.csv',
    index_col=False,
)

In [3]:
# Candidate regularisation strengths handed to RidgeCV / LassoCV:
# 100 values, evenly spaced in log10, running from 0.5 * 10**10 down to 0.5 * 10**-2.
alpha = 0.5 * np.power(10, np.linspace(10, -2, num=100))

In [4]:
def lr(X_train, X_test, y_train, y_test):
    """Fit a plain LinearRegression on the training split and score it on the test split.

    Returns the value of ``.score(X_test, y_test)`` for the fitted model
    (for scikit-learn regressors this is the coefficient of determination, R^2).
    """
    fitted = LinearRegression().fit(X_train, y_train)
    return fitted.score(X_test, y_test)

In [5]:
def ridge(X_train, X_test, y_train, y_test):
    """Fit RidgeCV over the module-level ``alpha`` grid and score it on the test split.

    The estimator is built with ``normalize=True``. Returns the value of
    ``.score(X_test, y_test)`` for the fitted model.
    """
    fitted = RidgeCV(alphas=alpha, normalize=True).fit(X_train, y_train)
    return fitted.score(X_test, y_test)

In [6]:
def lasso(X_train, X_test, y_train, y_test):
    """Fit LassoCV over the module-level ``alpha`` grid and score it on the test split.

    The estimator is built with ``normalize=True``. Returns the value of
    ``.score(X_test, y_test)`` for the fitted model.
    """
    fitted = LassoCV(alphas=alpha, normalize=True).fit(X_train, y_train)
    return fitted.score(X_test, y_test)

In [7]:
def rf(X_train, X_test, y_train, y_test, n_estimators=10, random_state=42):
    """Fit a RandomForestRegressor on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The four pieces of a train/test split, passed straight to ``fit`` / ``score``.
    n_estimators : int, default 10
        Number of trees; the default keeps the value that used to be hard-coded.
    random_state : int or None, default 42
        Seed for the forest. It is fixed by default so that repeated runs of the
        notebook give the same score; pass ``None`` for an unseeded forest.

    Returns the value of ``.score(X_test, y_test)`` for the fitted model.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators,
                                   random_state=random_state)
    fitted = forest.fit(X_train, y_train)
    return fitted.score(X_test, y_test)


## CountVectorizer: use only text data to predict review usefulness

In [8]:
# Test-split score (R^2) of each model, one entry per min_df value tried below.
lr_rsqure = []
ridge_rsqure = []
lasso_rsqure = []
rf_rsqure = []

# astype('U') coerces every entry of the text column to a unicode string.
texts = df['text'].values.astype('U')

# Split the raw documents BEFORE vectorising. The split does not depend on
# min_df, so it is done once outside the loop.
text_train, text_test, y_train, y_test = train_test_split(
    texts, df['useful'], test_size=0.33, random_state=42)

for mindf in range(1, 10):
    count_vect = CountVectorizer(min_df=mindf)
    # Learn the vocabulary (and apply the min_df filter) on the training
    # documents only, then reuse it for the test documents. Fitting on the
    # whole corpus would let held-out reviews influence the feature set.
    X_train = count_vect.fit_transform(text_train)
    X_test = count_vect.transform(text_test)
    lr_rsqure.append(lr(X_train, X_test, y_train, y_test))
    ridge_rsqure.append(ridge(X_train, X_test, y_train, y_test))
    lasso_rsqure.append(lasso(X_train, X_test, y_train, y_test))
    rf_rsqure.append(rf(X_train, X_test, y_train, y_test))

In [9]:
# Line chart of test-split score against the CountVectorizer min_df setting,
# one line per model.
min_df_values = range(1, 10)

p1 = figure(plot_width=800, plot_height=400)
p1.xaxis.axis_label = 'Min_df'
p1.yaxis.axis_label = 'Coefficient of determination'

for scores, colour, label in (
        (lr_rsqure, 'firebrick', 'Linear Regression'),
        (ridge_rsqure, 'navy', 'Ridge'),
        (lasso_rsqure, 'olive', 'Lasso'),
        (rf_rsqure, 'orange', 'Random Forest'),
):
    p1.line(min_df_values, scores, color=colour, legend=label)

p1.legend.location = "bottom_left"
show(p1)

## TfidfVectorizer: use only text data to predict review usefulness

In [10]:
# Test-split score (R^2) of each model, one entry per min_df value tried below.
lr_rsqure = []
ridge_rsqure = []
lasso_rsqure = []
rf_rsqure = []

# Coerce the text column to unicode strings, as the CountVectorizer sweep does,
# so a non-string entry cannot make the vectoriser fail.
texts = df['text'].values.astype('U')

# Split the raw documents BEFORE vectorising. The split does not depend on
# min_df, so it is done once outside the loop.
text_train, text_test, y_train, y_test = train_test_split(
    texts, df['useful'], test_size=0.33, random_state=42)

for mindf in range(1, 10):
    tfidf_vect = TfidfVectorizer(min_df=mindf, ngram_range=(1, 3))
    # Learn the vocabulary and IDF weights from the training documents only,
    # then apply them to the test documents. Fitting on the whole corpus
    # would leak held-out reviews into the features.
    X_train = tfidf_vect.fit_transform(text_train)
    X_test = tfidf_vect.transform(text_test)
    lr_rsqure.append(lr(X_train, X_test, y_train, y_test))
    ridge_rsqure.append(ridge(X_train, X_test, y_train, y_test))
    lasso_rsqure.append(lasso(X_train, X_test, y_train, y_test))
    rf_rsqure.append(rf(X_train, X_test, y_train, y_test))

In [11]:
# Line chart of test-split score against the TfidfVectorizer min_df setting,
# one line per model.
p1 = figure(plot_width=800, plot_height=400)
p1.xaxis.axis_label = 'Min_df'
# The plotted values come from the regressors' .score(), i.e. R^2, not a
# classification accuracy, so the axis is labelled like the CountVectorizer plot.
p1.yaxis.axis_label = 'Coefficient of determination'
p1.line(range(1,10), lr_rsqure, color='firebrick', legend='Linear Regression')
p1.line(range(1,10), ridge_rsqure, color='navy', legend='Ridge')
p1.line(range(1,10), lasso_rsqure, color='olive', legend='Lasso')
p1.line(range(1,10), rf_rsqure, color='orange', legend='Random Forest')
p1.legend.location = "bottom_left"
show(p1)

## Use only numeric data to predict review usefulness

In [12]:
# Numeric-only feature matrix.
numeric_columns = [
    'stars', 'user_review_count', 'friends', 'user_total_useful',
    'total_funny', 'total_cool', 'user_average_stars', 'business_stars',
    'business_review_count', 'days', 'text_count', 'pol',
    'user_avg_useful',
]
X = df[numeric_columns]
X_train, X_test, y_train, y_test = train_test_split(
    X, df['useful'], test_size=0.3, random_state=42)

# `accuracy` holds each model's test-split score, in the same order as `methods`.
methods = ['Linear Regression', 'Ridge', 'Lasso', 'Random Forest']
accuracy = [
    fit_and_score(X_train, X_test, y_train, y_test)
    for fit_and_score in (lr, ridge, lasso, rf)
]



In [13]:
# Bar chart of the numeric-only test scores, one bar per model.
# The title is kept on a single line: a plain quoted string literal may not
# span physical lines, so the original two-line title was a syntax error.
p = figure(x_range=methods, plot_height=250,
           title="Only use numeric data to predict the review usefulness",
           toolbar_location=None, tools="")
p.vbar(x=methods, top=accuracy, width=0.3)
show(p)

In [14]:
# Fit a larger forest on the numeric-only split and inspect its feature importances.
# The fitted model gets its own name: binding it to `rf` would overwrite the
# rf() helper defined earlier and break any re-run of the cells that call it.
# random_state is fixed so the printed score and importances are reproducible.
rf_model = RandomForestRegressor(n_estimators=30, random_state=42)
rf_model.fit(X_train, y_train)
print(rf_model.score(X_test, y_test))

print(rf_model.feature_importances_)
feature_list = list(X.columns)
# Get numerical feature importances
importances = list(rf_model.feature_importances_)
# List of tuples with variable and importance (rounded to 2 decimals)
feature_importances = [(feature, round(importance, 2))
                       for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
# Print out the feature and importances (plain loop instead of a list
# comprehension evaluated only for its side effects)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

0.746420011922
[ 0.01369231  0.01458869  0.0322582   0.30998182  0.08555404  0.0206155
  0.00521209  0.00733255  0.02065299  0.13243011  0.11137288  0.01215914
  0.23414967]
Variable: user_total_useful    Importance: 0.31
Variable: user_avg_useful      Importance: 0.23
Variable: days                 Importance: 0.13
Variable: text_count           Importance: 0.11
Variable: total_funny          Importance: 0.09
Variable: friends              Importance: 0.03
Variable: total_cool           Importance: 0.02
Variable: business_review_count Importance: 0.02
Variable: stars                Importance: 0.01
Variable: user_review_count    Importance: 0.01
Variable: user_average_stars   Importance: 0.01
Variable: business_stars       Importance: 0.01
Variable: pol                  Importance: 0.01


## Combine text and numeric data

In [15]:
# Column lists are defined once and reused, so the selector, the modelling
# frame and the split cannot drift apart.
NUMERIC_FEATURES = ['stars', 'user_review_count', 'friends', 'user_total_useful',
                    'total_funny', 'total_cool', 'user_average_stars',
                    'business_stars', 'business_review_count', 'days',
                    'text_count', 'pol', 'user_avg_useful']
# Same column order as before: stars, text, useful, then the remaining numeric features.
ML_COLUMNS = ['stars', 'text', 'useful'] + NUMERIC_FEATURES[1:]

df_ml = df[ML_COLUMNS]


def select_text(frame):
    """Return the 'text' column of ``frame``."""
    return frame['text']


def select_numeric(frame):
    """Return the NUMERIC_FEATURES columns of ``frame``."""
    return frame[NUMERIC_FEATURES]


# Named functions are used instead of lambdas so the transformers (and the
# pipelines built from them) stay picklable.
# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(select_text, validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(select_numeric, validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(df_ml)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(df_ml)

# Split using ALL columns of df_ml. The feature frames therefore still carry
# the 'useful' target column, but the two selectors above pick columns by
# name and neither of them selects it.
X_train, X_test, y_train, y_test = train_test_split(df_ml[ML_COLUMNS],
                                                    df_ml['useful'],
                                                    random_state=22)

### CountVectorizer

In [16]:
def make_combined_pipeline(vectorizer, regressor):
    """Build the text + numeric modelling pipeline around ``regressor``.

    The 'union' step concatenates two branches:
      * 'numeric_features': ``get_numeric_data`` selector followed by ``Imputer()``
      * 'text_features':    ``get_text_data`` selector followed by ``vectorizer``
    and the result is fed to ``regressor`` as the final 'clf' step.

    Step names are part of the contract: the parameter grids below address
    them as ``union__text_features__vectorizer__...`` and ``clf__...``.
    """
    return Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer()),
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', vectorizer),
                ])),
            ]
        )),
        ('clf', regressor),
    ])


# One pipeline per model; each gets its own fresh CountVectorizer instance.
lr_pipe = make_combined_pipeline(CountVectorizer(), LinearRegression())
ridge_pipe = make_combined_pipeline(CountVectorizer(),
                                    RidgeCV(alphas=alpha, normalize=True))
lasso_pipe = make_combined_pipeline(CountVectorizer(),
                                    LassoCV(alphas=alpha, normalize=True))
rf_pipe = make_combined_pipeline(CountVectorizer(),
                                 RandomForestRegressor(random_state = 42))

# Search space for the vectorizer (all models) ...
min_df = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ngram_range=[(1, 2), (2, 3)]
param_grid = {'union__text_features__vectorizer__min_df': min_df,
              'union__text_features__vectorizer__ngram_range': ngram_range}
# ... plus the number of trees for the random forest.
n_estimators = range(10, 200, 10)
param_grid_rf = {'union__text_features__vectorizer__min_df': min_df,
                 'union__text_features__vectorizer__ngram_range': ngram_range,
                 'clf__n_estimators': n_estimators}

In [17]:
# Run a 3-fold grid search for every pipeline, in the same order as before,
# and report the best cross-validated score and parameters for each.
# After the loop `grid` refers to the last (random forest) search.
searches = [
    ("Linear Regression Best: %f using %s", lr_pipe, param_grid),
    ("Ridge Best: %f using %s", ridge_pipe, param_grid),
    ("Lasso Best: %f using %s", lasso_pipe, param_grid),
    ("Random Forest Best: %f using %s", rf_pipe, param_grid_rf),
]

for report_fmt, pipe, grid_spec in searches:
    grid = GridSearchCV(pipe, cv=3, param_grid=grid_spec)
    grid.fit(X_train, y_train)
    print(report_fmt % (grid.best_score_, grid.best_params_))

Linear Regression Best: 0.301656 using {'union__text_features__vectorizer__min_df': 1, 'union__text_features__vectorizer__ngram_range': (2, 3)}
Ridge Best: 0.302437 using {'union__text_features__vectorizer__min_df': 7, 'union__text_features__vectorizer__ngram_range': (2, 3)}
Lasso Best: 0.315443 using {'union__text_features__vectorizer__min_df': 10, 'union__text_features__vectorizer__ngram_range': (2, 3)}
Random Forest Best: 0.764272 using {'clf__n_estimators': 40, 'union__text_features__vectorizer__min_df': 10, 'union__text_features__vectorizer__ngram_range': (2, 3)}


### TfidfVectorizer

In [18]:
def tfidf_feature_union():
    """Return a new FeatureUnion of the numeric branch and a TF-IDF text branch.

    A fresh ``Imputer`` and ``TfidfVectorizer`` are created on every call, so
    each pipeline below owns its own instances.
    """
    numeric_branch = Pipeline([
        ('selector', get_numeric_data),
        ('imputer', Imputer()),
    ])
    text_branch = Pipeline([
        ('selector', get_text_data),
        ('vectorizer', TfidfVectorizer()),
    ])
    return FeatureUnion(
        transformer_list=[
            ('numeric_features', numeric_branch),
            ('text_features', text_branch),
        ]
    )


# Same four models as in the CountVectorizer section, now on TF-IDF text features.
lr_pipe = Pipeline([
    ('union', tfidf_feature_union()),
    ('clf', LinearRegression()),
])

ridge_pipe = Pipeline([
    ('union', tfidf_feature_union()),
    ('clf', RidgeCV(alphas=alpha, normalize=True)),
])

lasso_pipe = Pipeline([
    ('union', tfidf_feature_union()),
    ('clf', LassoCV(alphas=alpha, normalize=True)),
])

rf_pipe = Pipeline([
    ('union', tfidf_feature_union()),
    ('clf', RandomForestRegressor(random_state = 42)),
])

In [34]:
# Search space: vectorizer settings for every model, plus the number of trees
# for the random forest.
min_df = list(range(1, 11))
ngram_range = [(1, 2), (2, 3)]
param_grid = {
    'union__text_features__vectorizer__min_df': min_df,
    'union__text_features__vectorizer__ngram_range': ngram_range,
}
n_estimators = range(10, 200, 10)
param_grid_rf = dict(param_grid, clf__n_estimators=n_estimators)


def run_search(pipe, grid_spec, report_fmt):
    """Run a 3-fold GridSearchCV on the training split, print the best result, return the search."""
    search = GridSearchCV(pipe, cv=3, param_grid=grid_spec)
    search.fit(X_train, y_train)
    print(report_fmt % (search.best_score_, search.best_params_))
    return search


# `grid` is rebound after each search, so it ends up holding the random forest search.
grid = run_search(lr_pipe, param_grid, "Linear Regression Best: %f using %s")
grid = run_search(ridge_pipe, param_grid, "Ridge Best: %f using %s")
grid = run_search(lasso_pipe, param_grid, "Lasso Best: %f using %s")
grid = run_search(rf_pipe, param_grid_rf, "Random Forest Best: %f using %s")

Linear Regression Best: 0.322766 using {'union__text_features__vectorizer__min_df': 1, 'union__text_features__vectorizer__ngram_range': (2, 3)}
Ridge Best: 0.282021 using {'union__text_features__vectorizer__min_df': 10, 'union__text_features__vectorizer__ngram_range': (2, 3)}
Lasso Best: 0.313479 using {'union__text_features__vectorizer__min_df': 10, 'union__text_features__vectorizer__ngram_range': (2, 3)}
Random Forest Best: 0.756388 using {'clf__n_estimators': 180, 'union__text_features__vectorizer__min_df': 10, 'union__text_features__vectorizer__ngram_range': (2, 3)}
