In [1]:
from nep_loss import multiclass_logloss
from sklearn.metrics import accuracy_score

# Data Encoding and Tokenizing

In [2]:
import pickle

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [4]:
with open('../data.nosync/ebert_test_train_data.pkl', 'rb') as f:
    train_data, test_data = pickle.load(f)

In [5]:
train_data, valid_data = train_test_split(train_data, stratify=train_data['stars'], test_size=0.1, random_state=42)

In [6]:
X_train = train_data['review']
y_train = train_data['stars']
X_valid = valid_data['review']
y_valid = valid_data['stars']
X_test = test_data['review']
y_test = test_data['stars']

len(X_train), len(y_train), len(X_valid), len(y_valid), len(X_test), len(y_test)

(5204, 5204, 579, 579, 1928, 1928)

In [7]:
lbl_enc = preprocessing.LabelEncoder()
y_encoded = lbl_enc.fit_transform(list(y_train) + list(y_valid) + list(y_test))
y_train_enc = y_encoded[0:len(y_train)]
y_valid_enc = y_encoded[len(y_train):len(y_train) + len(y_valid)]
y_test_enc = y_encoded[len(y_train) + len(y_valid):len(y_train) + len(y_valid) + len(y_test)]

# TF-IDF

https://www.kaggle.com/code/abhishek/approaching-almost-any-nlp-problem-on-kaggle/notebook

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(X_train) + list(X_valid))
xtrain_tfv =  tfv.transform(X_train) 
xvalid_tfv = tfv.transform(X_valid)


KeyboardInterrupt



## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0, max_iter=1000)
clf.fit(xtrain_tfv, y_train_enc)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'w') as f:
    f.write("Logistic Regression logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))

## Word Counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(X_train) + list(X_valid))
xtrain_ctv =  ctv.transform(X_train) 
xvalid_ctv = ctv.transform(X_valid)

In [None]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, y_train_enc)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'a') as f:
    f.write("Logistic Regression + Word Counts logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
    f.write("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, y_train_enc)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'a') as f:
    f.write("Naive Bayes + TF-IDF logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
    f.write("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

## Naive Bayes + Word Count

In [None]:
# Fitting a simple Naive Bayes on Counts
clf = MultinomialNB()
clf.fit(xtrain_ctv, y_train_enc)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'a') as f:
    f.write("Naive Bayes + Word Counts logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
    f.write("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

## SVM

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC

In [None]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [None]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, y_train_enc)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'a') as f:
    f.write("SVM + TF-IDF logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
    f.write("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), y_train_enc)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'a') as f:
    f.write("XGBoost + TF-IDF logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
    f.write("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

## XGBoost + SVD

In [None]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, y_train_enc)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

In [None]:
with open('scores.txt', 'a') as f:
    f.write("XGBoost + SVD logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
    f.write("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

# Grid Search

## SVD + Scaling + Logistic Regression

In [None]:
from sklearn import preprocessing, metrics, pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [None]:
svd = TruncatedSVD()
scl = preprocessing.StandardScaler()
lr_model = LogisticRegression()
clf = pipeline.Pipeline(
    [
        ('svd', svd),
        ('scl', scl),
        ('lr', lr_model)
    ]
)

In [None]:
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2'],
              'lr__max_iter': [100, 1000],
             }

In [None]:
# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1,
#                      iid=True,
                     refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, y_train_enc)  # we can use the full data here but im only using xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
#                         subsample=0.8, nthread=10, learning_rate=0.1)

xbg_model = xgb.XGBClassifier(nthread=-1)

# Create the pipeline 
clf = pipeline.Pipeline([('xgb', xgb_model)])

# parameter grid
param_grid = {
    'xgb__max_depth': list(range(10)),
    'xgb__n_estimators': [10, 50, 100, 200, 500],
    'xgb__colsample_bytree': [0.1, 0.3, 0.5, 0.8, 0.9],
    'xgb__subsample': [0.1, 0.3, 0.5, 0.8, 0.9],
    'xgb__learning_rate': [0.001, 0.01, 0.1, 0.5],
}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1,
#                      iid=True,
                     refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, y_train_enc)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1,
#                      iid=True,
                     refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, y_train_enc)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Word Vectors

## Glove Vectors

In [13]:
from tqdm import tqdm
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [14]:
# from http://www-nlp.stanford.edu/data/glove.840B.300d.zip

embeddings_index = {}
with open('../../glove.840B.300d.txt') as f:
    i = 0
    for line in tqdm(f):
        try:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except Exception as e:
            print(i)
            print(values[:10])
        i += 1
        
print("Found %s words vectors" % len(embeddings_index))

54872it [00:02, 19726.84it/s]

52343
['.', '.', '.', '-0.1573', '-0.29517', '0.30453', '-0.54773', '0.098293', '-0.1776', '0.21662']


130611it [00:06, 18150.10it/s]

128261
['at', 'name@domain.com', '0.0061218', '0.39595', '-0.22079', '0.78149', '0.38759', '0.28888', '0.18495', '-0.37328']


153588it [00:08, 18882.07it/s]

151102
['.', '.', '.', '.', '.', '-0.23773', '-0.82788', '0.82326', '-0.91878', '0.35868']


203035it [00:10, 19856.13it/s]

200668
['to', 'name@domain.com', '0.33865', '0.12698', '-0.16885', '0.55476', '0.48296', '0.45018', '0.0094233', '-0.36575']


212827it [00:11, 18876.39it/s]

209833
['.', '.', '0.035974', '-0.024421', '0.71402', '-0.61127', '0.012771', '-0.11201', '0.16847', '-0.14069']


224593it [00:11, 19533.95it/s]

220779
['.', '.', '.', '.', '0.033459', '-0.085658', '0.27155', '-0.56132', '0.60419', '-0.027276']


255480it [00:13, 18831.21it/s]

253461
['email', 'name@domain.com', '0.33529', '0.32949', '0.2646', '0.64219', '0.70701', '-0.074487', '-0.066128', '-0.30804']


369454it [00:19, 18549.11it/s]

365745
['or', 'name@domain.com', '0.48374', '0.49669', '-0.25089', '0.90389', '0.60307', '0.11141', '-0.021157', '0.10037']


535418it [00:28, 19235.04it/s]

532048
['contact', 'name@domain.com', '0.016426', '0.13728', '0.18781', '0.75784', '0.44012', '0.096794', '0.060987', '0.31293']


720716it [00:38, 17539.57it/s]

717302
['Email', 'name@domain.com', '0.37344', '0.024573', '-0.12583', '0.36009', '0.25605', '0.07326', '0.3292', '-0.0037022']


996308it [00:55, 14034.30it/s]

994818
['on', 'name@domain.com', '0.037295', '-0.15381', '-0.045189', '1.0566', '0.42898', '0.24093', '0.34305', '-0.090393']


1125347it [01:02, 18387.97it/s]

1123331
['At', 'Killerseats.com', '-0.13854', '-0.01706', '-0.13651', '0.1237', '0.15633', '-0.16556', '0.29374', '-0.064174']


1150408it [01:03, 19301.27it/s]

1148409
['by', 'name@domain.com', '0.6882', '-0.36436', '0.62079', '1.1482', '-0.055475', '-0.37936', '0.0064471', '-0.33046']


1355175it [01:15, 18711.99it/s]

1352110
['in', 'mylot.com', '-0.18148', '0.47096', '0.32916', '0.044196', '-0.93045', '-0.16299', '0.31996', '0.39017']


1502174it [01:23, 18896.38it/s]

1499727
['emailing', 'name@domain.com', '0.39173', '-0.39132', '-0.4266', '0.82429', '0.42919', '0.17601', '0.16663', '-0.011601']


1535688it [01:25, 18071.19it/s]

1533809
['Contact', 'name@domain.com', '0.14933', '-0.28605', '0.3444', '0.29015', '-0.22999', '0.1271', '0.35722', '0.35118']


1902210it [01:44, 19204.70it/s]

1899841
['at', 'name@domain.com', '0.44321', '-0.40005', '-0.20065', '1.1209', '0.34041', '0.086082', '-0.067128', '0.0022702']


1924886it [01:46, 18594.47it/s]

1921152
['•', 'name@domain.com', '-0.13288', '-0.31383', '-0.032356', '0.52036', '-0.26985', '0.43339', '0.32587', '-0.51581']


2061247it [01:53, 19397.16it/s]

2058966
['at', 'Amazon.com', '-0.5275', '-0.73685', '0.10968', '0.22214', '-0.30063', '-0.63201', '-0.053204', '-0.16241']


2168904it [01:59, 19252.00it/s]

2165246
['is', 'name@domain.com', '-0.1197', '0.10706', '-0.10519', '-0.12412', '0.4096', '-0.0287', '0.34704', '0.3549']


2196017it [02:00, 18235.89it/s]

Found 2195884 words vectors





In [17]:
def sentence2vector(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    else:
        return v / np.sqrt((v ** 2).sum())

In [18]:
xtrain_glove = [sentence2vector(x) for x in X_train]
xvalid_glove = [sentence2vector(x) for x in X_valid]

In [20]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [23]:
import xgboost as xgb

In [26]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, y_train_enc)
predictions = clf.predict_proba(xvalid_glove)

print("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

Parameters: { "silent" } are not used.



logloss: 1.881 
accuracy: 0.364


In [28]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, y_train_enc)
predictions = clf.predict_proba(xvalid_glove)

print("logloss: %0.3f " % multiclass_logloss(y_valid_enc, predictions))
print("accuracy: %0.3f" % accuracy_score(y_valid_enc, np.argmax(predictions, axis=1)))

Parameters: { "silent" } are not used.



logloss: 1.749 
accuracy: 0.387
