In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [125]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score
import cudf
from cuml.feature_extraction.text import TfidfVectorizer

In [126]:
def display_scores(predictions, df):
    accuracy = accuracy_score((predictions > 0.5).astype(int), df['sentiment'].values.get())
    precision = precision_score((predictions > 0.5).astype(int), df['sentiment'].values.get())
    auc = roc_auc_score((predictions > 0.5).astype(int), df['sentiment'].values.get())
    recall = recall_score((predictions > 0.5).astype(int), df['sentiment'].values.get())
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Auc: {auc}')
    print(f'Recall: {recall}')

In [52]:
%%time
sentiment140 = pd.read_csv('../data/external/sentiment140/training.1600000.processed.noemoticon.zip', encoding='latin-1', header=None, names=['sentiment', 'id', 'date', 'no_query', 'user', 'tweet'])

#MOVE DATAFRAME TO GPU
sentiment140 = cudf.from_pandas(sentiment140)
sentiment140 = sentiment140[['tweet', 'user', 'sentiment']]
sentiment140['sentiment'] = (sentiment140['sentiment']==4).astype(int)
print(type(sentiment140))
sentiment140

<class 'cudf.core.dataframe.DataFrame'>
CPU times: user 4.26 s, sys: 200 ms, total: 4.46 s
Wall time: 4.45 s


Unnamed: 0,tweet,user,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",_TheSpecialOne_,0
1,is upset that he can't update his Facebook by ...,scotthamilton,0
2,@Kenichan I dived many times for the ball. Man...,mattycus,0
3,my whole body feels itchy and like its on fire,ElleCTF,0
4,"@nationwideclass no, it's not behaving at all....",Karoli,0
...,...,...,...
1599995,Just woke up. Having no school is the best fee...,AmandaMarie1028,1
1599996,TheWDB.com - Very cool to hear old Walt interv...,TheWDBoards,1
1599997,Are you ready for your MoJo Makeover? Ask me f...,bpbabe,1
1599998,Happy 38th Birthday to my boo of alll time!!! ...,tinydiamondz,1


In [8]:
%%time
vec = TfidfVectorizer(stop_words='english')
tfidf_matrix = vec.fit_transform(sentiment140['tweet'])

CPU times: user 4.97 s, sys: 543 ms, total: 5.52 s
Wall time: 5.53 s


In [12]:
# MOVE SPARSE ROW MATRIX TO CPU BECAUSE THE XGBOOST DMATRIX CONSTRUCTOR DOESN'T SUPPORT THE CUPY SPARSE ROW MATRIX
tfidf_cpu = tfidf_matrix.get()

In [59]:
X_train, X_valid, y_train, y_valid = train_test_split(tfidf_cpu, sentiment140['sentiment'], test_size=0.25)
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [152]:
params_lr_nought_1 = {'tree_method': 'gpu_hist', 'eval_metric': 'auc', 'objective': 'binary:logistic', 'learning_rate':0.1}
params_lr_nought_3 = {'tree_method': 'gpu_hist', 'eval_metric': 'auc', 'objective': 'binary:logistic', 'learning_rate':0.3}

In [153]:
eval_list = [(dvalid, 'validation'), (dtrain, 'train')]
num_round = 200

In [154]:
%%time
bst_lr_nought_1 = xgb.train(params_lr_nought_1, dtrain, num_round, eval_list)

[0]	validation-auc:0.59333	train-auc:0.59439
[1]	validation-auc:0.60595	train-auc:0.60686
[2]	validation-auc:0.61200	train-auc:0.61297
[3]	validation-auc:0.63267	train-auc:0.63331
[4]	validation-auc:0.63243	train-auc:0.63315
[5]	validation-auc:0.63880	train-auc:0.63960
[6]	validation-auc:0.65321	train-auc:0.65426
[7]	validation-auc:0.65869	train-auc:0.65990
[8]	validation-auc:0.66205	train-auc:0.66325
[9]	validation-auc:0.66237	train-auc:0.66356
[10]	validation-auc:0.66530	train-auc:0.66647
[11]	validation-auc:0.67117	train-auc:0.67222
[12]	validation-auc:0.67364	train-auc:0.67482
[13]	validation-auc:0.67858	train-auc:0.67994
[14]	validation-auc:0.68135	train-auc:0.68267
[15]	validation-auc:0.68707	train-auc:0.68841
[16]	validation-auc:0.68935	train-auc:0.69077
[17]	validation-auc:0.69485	train-auc:0.69641
[18]	validation-auc:0.69877	train-auc:0.70035
[19]	validation-auc:0.69906	train-auc:0.70065
[20]	validation-auc:0.70122	train-auc:0.70277
[21]	validation-auc:0.70122	train-auc:0.7027

In [157]:
%%time
bst_lr_nought_3 = xgb.train(params_lr_nought_3, dtrain, num_round, eval_list)

[0]	validation-auc:0.59333	train-auc:0.59439
[1]	validation-auc:0.62009	train-auc:0.62106
[2]	validation-auc:0.64635	train-auc:0.64695
[3]	validation-auc:0.66263	train-auc:0.66390
[4]	validation-auc:0.67398	train-auc:0.67515
[5]	validation-auc:0.67734	train-auc:0.67849
[6]	validation-auc:0.68832	train-auc:0.68945
[7]	validation-auc:0.69871	train-auc:0.69980
[8]	validation-auc:0.70585	train-auc:0.70745
[9]	validation-auc:0.70954	train-auc:0.71128
[10]	validation-auc:0.71246	train-auc:0.71430
[11]	validation-auc:0.71532	train-auc:0.71720
[12]	validation-auc:0.71844	train-auc:0.72033
[13]	validation-auc:0.72594	train-auc:0.72785
[14]	validation-auc:0.73010	train-auc:0.73211
[15]	validation-auc:0.73773	train-auc:0.73979
[16]	validation-auc:0.74073	train-auc:0.74272
[17]	validation-auc:0.74436	train-auc:0.74638
[18]	validation-auc:0.74656	train-auc:0.74872
[19]	validation-auc:0.74987	train-auc:0.75234
[20]	validation-auc:0.75423	train-auc:0.75678
[21]	validation-auc:0.75468	train-auc:0.7573

In [163]:
%%time
bst_lr_nought_3_400_round = xgb.train(params_lr_nought_3, dtrain, 400, eval_list)

[0]	validation-auc:0.59333	train-auc:0.59439
[1]	validation-auc:0.62009	train-auc:0.62106
[2]	validation-auc:0.64635	train-auc:0.64695
[3]	validation-auc:0.66263	train-auc:0.66390
[4]	validation-auc:0.67398	train-auc:0.67515
[5]	validation-auc:0.67734	train-auc:0.67849
[6]	validation-auc:0.68832	train-auc:0.68945
[7]	validation-auc:0.69871	train-auc:0.69980
[8]	validation-auc:0.70585	train-auc:0.70745
[9]	validation-auc:0.70954	train-auc:0.71128
[10]	validation-auc:0.71246	train-auc:0.71430
[11]	validation-auc:0.71532	train-auc:0.71720
[12]	validation-auc:0.71844	train-auc:0.72033
[13]	validation-auc:0.72594	train-auc:0.72785
[14]	validation-auc:0.73010	train-auc:0.73211
[15]	validation-auc:0.73773	train-auc:0.73979
[16]	validation-auc:0.74073	train-auc:0.74272
[17]	validation-auc:0.74436	train-auc:0.74638
[18]	validation-auc:0.74656	train-auc:0.74872
[19]	validation-auc:0.74987	train-auc:0.75234
[20]	validation-auc:0.75423	train-auc:0.75678
[21]	validation-auc:0.75468	train-auc:0.7573

In [72]:
test140 = pd.read_csv('../data/external/sentiment140/testdata.manual.2009.06.14.csv', encoding='latin-1', header=None, names=['sentiment', 'id', 'date', 'no_query', 'user', 'tweet'])
test140 = cudf.from_pandas(test140)
test140 = test140[['tweet', 'user', 'sentiment']]
test140['sentiment'] = (test140['sentiment']==4).astype(int)
tfidf_test = vec.transform(test140['tweet'])
test_cpu = tfidf_test.get()
dtest = xgb.DMatrix(test_cpu)

In [164]:
predictions_lr_nought_1 = bst_lr_nought_1.predict(dtest)
predictions_lr_nought_3 = bst_lr_nought_3.predict(dtest)
predictions_lr_nought_3_400_round = bst_lr_nought_3_400_round.predict(dtest)

In [167]:
display_scores(predictions_lr_nought_1, test140)

Accuracy: 0.5421686746987951
Precision: 0.8681318681318682
Auc: 0.6299967500812479
Recall: 0.43646408839779005


In [166]:
display_scores(predictions_lr_nought_3, test140)

Accuracy: 0.5923694779116466
Precision: 0.8901098901098901
Auc: 0.6694231315714692
Recall: 0.46956521739130436


In [165]:
display_scores(predictions_lr_nought_3_400_round, test140)

Accuracy: 0.6124497991967871
Precision: 0.8681318681318682
Auc: 0.6714147754707871
Recall: 0.4831804281345566


In [179]:
bst_lr_nought_1.save_model('../models/xgboost/bst_lr_nought_1.save')
bst_lr_nought_1.dump_model('../models/xgboost/bst_lr_nought_1.dump')

In [180]:
bst_lr_nought_3.save_model('../models/xgboost/bst_lr_nought_3.save')
bst_lr_nought_3.dump_model('../models/xgboost/bst_lr_nought_3.dump')

In [181]:
bst_lr_nought_3_400_round.save_model('../models/xgboost/bst_lr_nought_3_400_round.save')
bst_lr_nought_3_400_round.dump_model('../models/xgboost/bst_lr_nought_3_400_round.dump')

In [188]:
train_kaggle = pd.read_csv('../data/external/kaggle/train.csv')
train_kaggle = cudf.from_pandas(train_kaggle)
tfidf_kaggle_train = vec.transform(train_kaggle['text'])
tfidf_kaggle_train_cpu = tfidf_kaggle_train.get()
dkaggle_train = xgb.DMatrix(tfidf_kaggle_train_cpu)

In [203]:
train_sent_lr_nought_1 = bst_lr_nought_1.predict(dkaggle_train)
train_sent_lr_nought_3 = bst_lr_nought_3.predict(dkaggle_train)
train_sent_lr_nought_3_400_round = bst_lr_nought_3_400_round.predict(dkaggle_train)

In [196]:
test_kaggle = pd.read_csv('../data/external/kaggle/test.csv')
test_kaggle = cudf.from_pandas(test_kaggle)
tfidf_kaggle_test = vec.transform(test_kaggle['text'])
tfidf_kaggle_test_cpu = tfidf_kaggle_test.get()
dkaggle_test = xgb.DMatrix(tfidf_kaggle_test_cpu)

In [197]:
test_sent_lr_nought_1 = bst_lr_nought_1.predict(dkaggle_test)
test_sent_lr_nought_3 = bst_lr_nought_3.predict(dkaggle_test)
test_sent_lr_nought_3_400_round = bst_lr_nought_3_400_round.predict(dkaggle_test)

In [205]:
with open('../data/features/train_sent_lr_nought_1.npy', 'wb') as f:
    np.save(f, train_sent_lr_nought_1)
with open('../data/features/train_sent_lr_nought_3.npy', 'wb') as f:
    np.save(f, train_sent_lr_nought_3_400_round)
with open('../data/features/train_sent_lr_nought_3_400_round.npy', 'wb') as f:
    np.save(f, train_sent_lr_nought_3_400_round)
with open('../data/features/test_sent_lr_nought_1.npy', 'wb') as f:
    np.save(f, test_sent_lr_nought_1)
with open('../data/features/test_sent_lr_nought_3.npy', 'wb') as f:
    np.save(f, test_sent_lr_nought_3)
with open('../data/features/test_sent_lr_nought_3_400_round.npy', 'wb') as f:
    np.save(f, test_sent_lr_nought_3_400_round)