In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB

# Read Data
df = shuffle(pd.read_csv('drive/My Drive/Colab Notebooks/dataset/yelp2013train.ss',sep='\t\t', names=["user", "business(product)", "rating", "review"])).reset_index()
df = df[0:60000]

# Pre- processing 
tfidf_vectorizer = TfidfVectorizer(max_df=0.1,smooth_idf=False, max_features=None,
                   stop_words=['english'],sublinear_tf=True, tokenizer=None, use_idf=True)
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(df['review'])
X_train = tfidf

# Target labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['rating'])
y_train = df['label'].to_numpy()

# # Feature reduction
# select_best = SelectPercentile(chi2, percentile=23)
# X_train = select_best.fit_transform(tfidf, y_train)


models = [
       LogisticRegression(C=1, penalty='l1'),
       LinearSVC(C=1),
       MultinomialNB(alpha=1.0)
]

# Cross validation
Cross_validation_fold = 5
cv_df = pd.DataFrame(index=range(Cross_validation_fold * len(models)))
entries = []
for model in models:
  model_name = type(model).__name__
  accuracies = cross_val_score(model, X_train, y_train, scoring='accuracy',n_jobs=-1, cv=Cross_validation_fold)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_index', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC             0.501750
LogisticRegression    0.529817
MultinomialNB         0.423933
Name: accuracy, dtype: float64

**Preprocessing**


In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# Read Data
df = shuffle(pd.read_csv('drive/My Drive/Colab Notebooks/dataset/yelp2013train.ss',sep='\t\t', names=["user", "business(product)", "rating", "review"])).reset_index()
df = df[0:60000]

# Pre- processing 
tfidf_vectorizer = TfidfVectorizer(max_df=0.1,smooth_idf=False, max_features=None,
                   stop_words=['english'],sublinear_tf=True, tokenizer=None, use_idf=True)
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(df['review'])
# X_train = tfidf

# Target labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['rating'])
y = df['label'].to_numpy()

# Feature reduction
select_best = SelectPercentile(chi2, percentile=23)
X = select_best.fit_transform(tfidf, y_train)

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.20)




**Hyperparameter** **tuning**

In [0]:
LR = LogisticRegression(solver='lbfgs', multi_class='multinomial',  max_iter=1000)

# Create regularization hyperparameter space
C = np.linspace(0,5,20)

# Create hyperparameter options
hyperparameters = dict(C=C)
clf = GridSearchCV(LR, hyperparameters, cv=5, verbose=0)
best_model = clf.fit(X_val, y_val)
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [45]:
SVM =  LinearSVC()
# Create regularization hyperparameter space
C = np.linspace(1,1.5,30)

# Create hyperparameter options
hyperparameters = dict(C=C)
clf = GridSearchCV(SVM, hyperparameters, cv=5, verbose=0)
best_model_svm = clf.fit(X_val, y_val)

print('Best C:', best_model_svm.best_estimator_.get_params()['C'])

Best C: 1.0


In [32]:
MNB = MultinomialNB()
# Create regularization hyperparameter space
alpha = np.linspace(1,1.2,20)

# Create hyperparameter options
hyperparameters = dict(alpha=alpha)
clf = GridSearchCV(MNB, hyperparameters, cv=5, verbose=0)
best_model_mnb = clf.fit(X_val, y_val)

print('Best alpha:', best_model_mnb.best_estimator_.get_params()['alpha'])

Best alpha: 1.0
