In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as sklm
import xgboost as xgb
import matplotlib
import lightgbm as lgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
submissions = pd.read_csv('SampleSubmission.csv')

In [3]:
print(train.shape)
print(test.shape)

(10001, 4)
(5177, 2)


In [4]:
test = test.fillna('It was a good movie')
test.isna().sum()

tweet_id     0
safe_text    0
dtype: int64

In [5]:
train.isna().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [6]:
y = np.array(train['label'])

In [7]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [8]:
combined = list(train.safe_text.values) + list(test.safe_text.values)
len(combined)

15178

In [9]:
x_train = np.array(combined[:10001])
print(len(x_train))
x_test = np.array(combined[10001:])
print(len(x_test))
x_test[:2]

10001
5177


array(['<user> <user> ... &amp; 4 a vaccine given 2 healthy peeps, FDA think just not worth the AE risk unfortunately.',
       'Students starting school without whooping cough vaccinations <url> #scpick'],
      dtype='<U152')

In [10]:
import numpy.random as nr
import sklearn.model_selection as ms
## Randomly sample cases to create independent training and test data
nr.seed(9988)
indx = range(np.array(x_train.shape[0]))
indx = ms.train_test_split(indx, test_size = 0.2)
x_train1 = x_train[indx[0]]
y_train1 = np.ravel(y[indx[0]])
x_test1 = x_train[indx[1]]
y_test1 = np.ravel(y[indx[1]])

In [11]:
print(x_train1.shape)
print(y_train1.shape)
print(x_test1.shape)
print(y_test1.shape)

(8000,)
(8000,)
(2001,)
(2001,)


In [12]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,1), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

tfv.fit(x_train)
x_train_tfv =  tfv.transform(x_train1)
x_test_tfv = tfv.transform(x_test1)


test_enc = tfv.transform(test.safe_text.values)

In [13]:
print(x_train_tfv.shape)
print(y_train1.shape)
print(x_test_tfv.shape)

(8000, 4047)
(8000,)
(2001, 4047)


In [14]:
x_train_tfv

<8000x4047 sparse matrix of type '<class 'numpy.float64'>'
	with 71792 stored elements in Compressed Sparse Row format>

In [31]:
lg = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.01, n_estimators=3000, max_depth=12)
eval_set = [(x_test_tfv, y_test1)]
lg.fit(x_train_tfv, y_train1, eval_set=eval_set, early_stopping_rounds=100)

[1]	valid_0's l2: 0.424953
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.423734
[3]	valid_0's l2: 0.422545
[4]	valid_0's l2: 0.421333
[5]	valid_0's l2: 0.4202
[6]	valid_0's l2: 0.419073
[7]	valid_0's l2: 0.417994
[8]	valid_0's l2: 0.416882
[9]	valid_0's l2: 0.415839
[10]	valid_0's l2: 0.414814
[11]	valid_0's l2: 0.413847
[12]	valid_0's l2: 0.41287
[13]	valid_0's l2: 0.411853
[14]	valid_0's l2: 0.410955
[15]	valid_0's l2: 0.410048
[16]	valid_0's l2: 0.409164
[17]	valid_0's l2: 0.408258
[18]	valid_0's l2: 0.407428
[19]	valid_0's l2: 0.406586
[20]	valid_0's l2: 0.405785
[21]	valid_0's l2: 0.404947
[22]	valid_0's l2: 0.404173
[23]	valid_0's l2: 0.403448
[24]	valid_0's l2: 0.402648
[25]	valid_0's l2: 0.401893
[26]	valid_0's l2: 0.401137
[27]	valid_0's l2: 0.400423
[28]	valid_0's l2: 0.399686
[29]	valid_0's l2: 0.39899
[30]	valid_0's l2: 0.398309
[31]	valid_0's l2: 0.397656
[32]	valid_0's l2: 0.396934
[33]	valid_0's l2: 0.396211
[34]	valid_0's l2: 0.395611

[286]	valid_0's l2: 0.353829
[287]	valid_0's l2: 0.353806
[288]	valid_0's l2: 0.353769
[289]	valid_0's l2: 0.353708
[290]	valid_0's l2: 0.353667
[291]	valid_0's l2: 0.35363
[292]	valid_0's l2: 0.353593
[293]	valid_0's l2: 0.353555
[294]	valid_0's l2: 0.353533
[295]	valid_0's l2: 0.353504
[296]	valid_0's l2: 0.353492
[297]	valid_0's l2: 0.353485
[298]	valid_0's l2: 0.353443
[299]	valid_0's l2: 0.35341
[300]	valid_0's l2: 0.353381
[301]	valid_0's l2: 0.353372
[302]	valid_0's l2: 0.35333
[303]	valid_0's l2: 0.353261
[304]	valid_0's l2: 0.353209
[305]	valid_0's l2: 0.353189
[306]	valid_0's l2: 0.353181
[307]	valid_0's l2: 0.35315
[308]	valid_0's l2: 0.353143
[309]	valid_0's l2: 0.35312
[310]	valid_0's l2: 0.353098
[311]	valid_0's l2: 0.353091
[312]	valid_0's l2: 0.353061
[313]	valid_0's l2: 0.353027
[314]	valid_0's l2: 0.353002
[315]	valid_0's l2: 0.352924
[316]	valid_0's l2: 0.352885
[317]	valid_0's l2: 0.352864
[318]	valid_0's l2: 0.352895
[319]	valid_0's l2: 0.352892
[320]	valid_0's l2:

[569]	valid_0's l2: 0.34888
[570]	valid_0's l2: 0.348847
[571]	valid_0's l2: 0.348841
[572]	valid_0's l2: 0.348798
[573]	valid_0's l2: 0.348791
[574]	valid_0's l2: 0.348753
[575]	valid_0's l2: 0.348739
[576]	valid_0's l2: 0.348707
[577]	valid_0's l2: 0.348697
[578]	valid_0's l2: 0.348679
[579]	valid_0's l2: 0.348689
[580]	valid_0's l2: 0.348689
[581]	valid_0's l2: 0.348702
[582]	valid_0's l2: 0.348682
[583]	valid_0's l2: 0.348673
[584]	valid_0's l2: 0.348668
[585]	valid_0's l2: 0.348676
[586]	valid_0's l2: 0.348665
[587]	valid_0's l2: 0.348659
[588]	valid_0's l2: 0.348674
[589]	valid_0's l2: 0.348667
[590]	valid_0's l2: 0.348665
[591]	valid_0's l2: 0.348632
[592]	valid_0's l2: 0.348643
[593]	valid_0's l2: 0.348653
[594]	valid_0's l2: 0.348635
[595]	valid_0's l2: 0.348611
[596]	valid_0's l2: 0.348591
[597]	valid_0's l2: 0.348574
[598]	valid_0's l2: 0.348593
[599]	valid_0's l2: 0.348557
[600]	valid_0's l2: 0.348562
[601]	valid_0's l2: 0.348556
[602]	valid_0's l2: 0.348553
[603]	valid_0's

[852]	valid_0's l2: 0.34717
[853]	valid_0's l2: 0.347188
[854]	valid_0's l2: 0.347178
[855]	valid_0's l2: 0.34718
[856]	valid_0's l2: 0.347178
[857]	valid_0's l2: 0.347161
[858]	valid_0's l2: 0.347154
[859]	valid_0's l2: 0.347163
[860]	valid_0's l2: 0.347136
[861]	valid_0's l2: 0.347129
[862]	valid_0's l2: 0.347143
[863]	valid_0's l2: 0.347141
[864]	valid_0's l2: 0.347137
[865]	valid_0's l2: 0.347141
[866]	valid_0's l2: 0.347149
[867]	valid_0's l2: 0.347135
[868]	valid_0's l2: 0.34714
[869]	valid_0's l2: 0.347134
[870]	valid_0's l2: 0.347134
[871]	valid_0's l2: 0.347102
[872]	valid_0's l2: 0.347092
[873]	valid_0's l2: 0.347091
[874]	valid_0's l2: 0.347106
[875]	valid_0's l2: 0.347113
[876]	valid_0's l2: 0.347107
[877]	valid_0's l2: 0.347085
[878]	valid_0's l2: 0.347081
[879]	valid_0's l2: 0.347067
[880]	valid_0's l2: 0.347058
[881]	valid_0's l2: 0.347065
[882]	valid_0's l2: 0.34707
[883]	valid_0's l2: 0.347055
[884]	valid_0's l2: 0.34706
[885]	valid_0's l2: 0.347059
[886]	valid_0's l2:

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.01, max_depth=12,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=3000, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [33]:
xg = xgb.XGBRegressor(boosting_type='gbtree', learning_rate=0.01, n_estimators=3000, max_depth=12)
eval_set = [(x_test_tfv, y_test1)]
xg.fit(x_train_tfv, y_train1, eval_set=eval_set, early_stopping_rounds=100)

[0]	validation_0-rmse:0.68175
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.68026
[2]	validation_0-rmse:0.67882
[3]	validation_0-rmse:0.67739
[4]	validation_0-rmse:0.67598
[5]	validation_0-rmse:0.67459
[6]	validation_0-rmse:0.67327
[7]	validation_0-rmse:0.67197
[8]	validation_0-rmse:0.67071
[9]	validation_0-rmse:0.66940
[10]	validation_0-rmse:0.66820
[11]	validation_0-rmse:0.66699
[12]	validation_0-rmse:0.66584
[13]	validation_0-rmse:0.66464
[14]	validation_0-rmse:0.66351
[15]	validation_0-rmse:0.66237
[16]	validation_0-rmse:0.66128
[17]	validation_0-rmse:0.66023
[18]	validation_0-rmse:0.65911
[19]	validation_0-rmse:0.65809
[20]	validation_0-rmse:0.65703
[21]	validation_0-rmse:0.65599
[22]	validation_0-rmse:0.65500
[23]	validation_0-rmse:0.65401
[24]	validation_0-rmse:0.65302
[25]	validation_0-rmse:0.65207
[26]	validation_0-rmse:0.65115
[27]	validation_0-rmse:0.65025
[28]	validation_0-rmse:0.64933
[29]	validation_0-rmse:0.64844
[30]	validatio

[258]	validation_0-rmse:0.59555
[259]	validation_0-rmse:0.59555
[260]	validation_0-rmse:0.59552
[261]	validation_0-rmse:0.59547
[262]	validation_0-rmse:0.59542
[263]	validation_0-rmse:0.59537
[264]	validation_0-rmse:0.59533
[265]	validation_0-rmse:0.59530
[266]	validation_0-rmse:0.59528
[267]	validation_0-rmse:0.59520
[268]	validation_0-rmse:0.59515
[269]	validation_0-rmse:0.59508
[270]	validation_0-rmse:0.59505
[271]	validation_0-rmse:0.59505
[272]	validation_0-rmse:0.59495
[273]	validation_0-rmse:0.59492
[274]	validation_0-rmse:0.59487
[275]	validation_0-rmse:0.59484
[276]	validation_0-rmse:0.59486
[277]	validation_0-rmse:0.59482
[278]	validation_0-rmse:0.59474
[279]	validation_0-rmse:0.59472
[280]	validation_0-rmse:0.59468
[281]	validation_0-rmse:0.59467
[282]	validation_0-rmse:0.59460
[283]	validation_0-rmse:0.59459
[284]	validation_0-rmse:0.59454
[285]	validation_0-rmse:0.59450
[286]	validation_0-rmse:0.59451
[287]	validation_0-rmse:0.59448
[288]	validation_0-rmse:0.59445
[289]	va

[515]	validation_0-rmse:0.58921
[516]	validation_0-rmse:0.58922
[517]	validation_0-rmse:0.58923
[518]	validation_0-rmse:0.58921
[519]	validation_0-rmse:0.58917
[520]	validation_0-rmse:0.58913
[521]	validation_0-rmse:0.58914
[522]	validation_0-rmse:0.58912
[523]	validation_0-rmse:0.58913
[524]	validation_0-rmse:0.58909
[525]	validation_0-rmse:0.58909
[526]	validation_0-rmse:0.58910
[527]	validation_0-rmse:0.58908
[528]	validation_0-rmse:0.58905
[529]	validation_0-rmse:0.58905
[530]	validation_0-rmse:0.58902
[531]	validation_0-rmse:0.58898
[532]	validation_0-rmse:0.58896
[533]	validation_0-rmse:0.58895
[534]	validation_0-rmse:0.58892
[535]	validation_0-rmse:0.58889
[536]	validation_0-rmse:0.58888
[537]	validation_0-rmse:0.58888
[538]	validation_0-rmse:0.58887
[539]	validation_0-rmse:0.58883
[540]	validation_0-rmse:0.58879
[541]	validation_0-rmse:0.58878
[542]	validation_0-rmse:0.58879
[543]	validation_0-rmse:0.58878
[544]	validation_0-rmse:0.58881
[545]	validation_0-rmse:0.58882
[546]	va

[772]	validation_0-rmse:0.58638
[773]	validation_0-rmse:0.58640
[774]	validation_0-rmse:0.58640
[775]	validation_0-rmse:0.58637
[776]	validation_0-rmse:0.58636
[777]	validation_0-rmse:0.58636
[778]	validation_0-rmse:0.58635
[779]	validation_0-rmse:0.58634
[780]	validation_0-rmse:0.58633
[781]	validation_0-rmse:0.58634
[782]	validation_0-rmse:0.58634
[783]	validation_0-rmse:0.58632
[784]	validation_0-rmse:0.58631
[785]	validation_0-rmse:0.58632
[786]	validation_0-rmse:0.58632
[787]	validation_0-rmse:0.58632
[788]	validation_0-rmse:0.58630
[789]	validation_0-rmse:0.58631
[790]	validation_0-rmse:0.58633
[791]	validation_0-rmse:0.58630
[792]	validation_0-rmse:0.58629
[793]	validation_0-rmse:0.58626
[794]	validation_0-rmse:0.58626
[795]	validation_0-rmse:0.58625
[796]	validation_0-rmse:0.58627
[797]	validation_0-rmse:0.58627
[798]	validation_0-rmse:0.58627
[799]	validation_0-rmse:0.58626
[800]	validation_0-rmse:0.58625
[801]	validation_0-rmse:0.58622
[802]	validation_0-rmse:0.58621
[803]	va

[1028]	validation_0-rmse:0.58493
[1029]	validation_0-rmse:0.58495
[1030]	validation_0-rmse:0.58495
[1031]	validation_0-rmse:0.58493
[1032]	validation_0-rmse:0.58494
[1033]	validation_0-rmse:0.58495
[1034]	validation_0-rmse:0.58493
[1035]	validation_0-rmse:0.58489
[1036]	validation_0-rmse:0.58491
[1037]	validation_0-rmse:0.58489
[1038]	validation_0-rmse:0.58488
[1039]	validation_0-rmse:0.58487
[1040]	validation_0-rmse:0.58486
[1041]	validation_0-rmse:0.58486
[1042]	validation_0-rmse:0.58489
[1043]	validation_0-rmse:0.58490
[1044]	validation_0-rmse:0.58489
[1045]	validation_0-rmse:0.58489
[1046]	validation_0-rmse:0.58486
[1047]	validation_0-rmse:0.58484
[1048]	validation_0-rmse:0.58487
[1049]	validation_0-rmse:0.58488
[1050]	validation_0-rmse:0.58490
[1051]	validation_0-rmse:0.58488
[1052]	validation_0-rmse:0.58490
[1053]	validation_0-rmse:0.58490
[1054]	validation_0-rmse:0.58489
[1055]	validation_0-rmse:0.58488
[1056]	validation_0-rmse:0.58490
[1057]	validation_0-rmse:0.58490
[1058]	val

XGBRegressor(base_score=0.5, booster=None, boosting_type='gbtree',
       colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
       gamma=0, gpu_id=-1, importance_type='gain',
       interaction_constraints=None, learning_rate=0.01, max_delta_step=0,
       max_depth=12, min_child_weight=1, missing=nan,
       monotone_constraints=None, n_estimators=3000, n_jobs=0,
       num_parallel_tree=1, objective='reg:squarederror', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
       tree_method=None, validate_parameters=False, verbosity=None)

In [26]:
lgb_model = lgb.LGBMRegressor()

parameters = {'learning_rate': [0.01],
              'n_estimators': [3000],
              'max_depth': [12, 15, 17, 18],
              'subsample': [0.8, 0.9, 1.0]}

clf = GridSearchCV(lgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(n_splits=5, shuffle=True), 
                   scoring='neg_mean_squared_error',
                   verbose=2, refit=True)

In [27]:
clf.fit(x_train_tfv, y_train1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed: 19.0min
[Parallel(n_jobs=5)]: Done  60 out of  60 | elapsed: 36.5min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'learning_rate': [0.01], 'n_estimators': [3000], 'max_depth': [12, 15, 17, 18], 'subsample': [0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=2)

In [28]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Neg MSE score', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))



Neg MSE score -0.3451703732724872
learning_rate: 0.01
max_depth: 12
n_estimators: 3000
subsample: 0.8


In [32]:
predictions = lg.predict(x_test_tfv)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test1, predictions) ** 0.5

0.5886906949410237

In [34]:
predictions = xg.predict(x_test_tfv)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test1, predictions) ** 0.5

0.58460953489757

In [41]:
test_enc = tfv.transform(x_test)
pred1 = lg.predict(test_enc)
submissions.label = pred1
submissions.to_csv('NLPtestLG.csv', index = False)

In [37]:
test_enc = tfv.transform(x_test)
pred = xg.predict(test_enc)
submissions.label = pred
submissions.to_csv('NLPtestXG.csv', index = False)

In [39]:
avg = (pred + pred1)/2

In [40]:
submissions.label = avg
submissions.to_csv('NLPtestXG___LG.csv', index = False)

In [137]:
approx = []
for i in pred:
    if i > 0.5 and i < 1.5:
        i = 1
        approx.append(i)
    elif i < 0.5 and i > -0.5:
        i = 0
        approx.append(i)
    elif i < -0.5 and i< -1.5:
        i = -1
        approx.append(i)

In [144]:
approx.count(-1)

0

In [159]:
from sklearn.model_selection import KFold, StratifiedKFold

n_fold = 10
fold = StratifiedKFold(n_fold, shuffle=True, random_state=42)

avg = 0
test_oofs = []

for i, (tr,vr) in enumerate(fold.split(x_train_tfv, y_train1)):
    X,Y = x_train_tfv[tr], np.take(y_train1, tr, axis=0)
    x,y = x_train_tfv[vr], np.take(y_train1, vr, axis=0)
    
    
    lg = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.01, n_estimators=3000, max_depth=50)
    eval_set = [(x_test_tfv, y_test1)]
    #lg.fit(x_train_tfv, y_train1, eval_set=eval_set, early_stopping_rounds=50)
    lg.fit(X, Y, eval_set=eval_set, early_stopping_rounds=50)
    test_pred = lg.predict(x_test_tfv)
    pred = lg.predict(x)
    pred = process_prediction(pred)
    
    test_oofs.append(test_pred)
    
    score = rmse(y-1, pred)
    avg += score
    print(f"Fold {i}: ", score)

print("Avg score : {:.3f}".format(avg/n_fold))

[1]	valid_0's l2: 0.42486
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l2: 0.423502
[3]	valid_0's l2: 0.422173
[4]	valid_0's l2: 0.42086
[5]	valid_0's l2: 0.419585
[6]	valid_0's l2: 0.418335
[7]	valid_0's l2: 0.417114
[8]	valid_0's l2: 0.415917
[9]	valid_0's l2: 0.414748
[10]	valid_0's l2: 0.413588
[11]	valid_0's l2: 0.412483
[12]	valid_0's l2: 0.411373
[13]	valid_0's l2: 0.410367
[14]	valid_0's l2: 0.409344
[15]	valid_0's l2: 0.408292
[16]	valid_0's l2: 0.407305
[17]	valid_0's l2: 0.406316
[18]	valid_0's l2: 0.405396
[19]	valid_0's l2: 0.404402
[20]	valid_0's l2: 0.403465
[21]	valid_0's l2: 0.402509
[22]	valid_0's l2: 0.401619
[23]	valid_0's l2: 0.400744
[24]	valid_0's l2: 0.399928
[25]	valid_0's l2: 0.399098
[26]	valid_0's l2: 0.398281
[27]	valid_0's l2: 0.397455
[28]	valid_0's l2: 0.396689
[29]	valid_0's l2: 0.395854
[30]	valid_0's l2: 0.39512
[31]	valid_0's l2: 0.394366
[32]	valid_0's l2: 0.393635
[33]	valid_0's l2: 0.392912
[34]	valid_0's l2: 0.39213


[569]	valid_0's l2: 0.346008
[570]	valid_0's l2: 0.346047
[571]	valid_0's l2: 0.345998
[572]	valid_0's l2: 0.346028
[573]	valid_0's l2: 0.34604
[574]	valid_0's l2: 0.346032
[575]	valid_0's l2: 0.34603
[576]	valid_0's l2: 0.346042
[577]	valid_0's l2: 0.346059
[578]	valid_0's l2: 0.346057
[579]	valid_0's l2: 0.346099
[580]	valid_0's l2: 0.346118
[581]	valid_0's l2: 0.346156
[582]	valid_0's l2: 0.34619
[583]	valid_0's l2: 0.346208
[584]	valid_0's l2: 0.3462
[585]	valid_0's l2: 0.346178
[586]	valid_0's l2: 0.346167
[587]	valid_0's l2: 0.346179
[588]	valid_0's l2: 0.346179
[589]	valid_0's l2: 0.346202
[590]	valid_0's l2: 0.346195
[591]	valid_0's l2: 0.346229
[592]	valid_0's l2: 0.346242
[593]	valid_0's l2: 0.34624
[594]	valid_0's l2: 0.34623
[595]	valid_0's l2: 0.346204
[596]	valid_0's l2: 0.346196
[597]	valid_0's l2: 0.346211
[598]	valid_0's l2: 0.34622
[599]	valid_0's l2: 0.346224
[600]	valid_0's l2: 0.346259
[601]	valid_0's l2: 0.346261
[602]	valid_0's l2: 0.346238
[603]	valid_0's l2: 0.

IndexError: invalid index to scalar variable.

In [158]:
def process_prediction(preds):
    final_preds = []
    for pred in preds:
        argmax = pred.argmax()
        if argmax == 0:
            final_preds.append(-1*(pred[0]))
        elif argmax == 1:
            final_preds.append(0)
        else:
            final_preds.append(pred[2])
    
    return final_preds


def rmse(true, pred):
    return np.sqrt(mean_squared_error(true, pred))

In [112]:
x_train5.shape

(4000, 2)

In [117]:
train1 = train.drop(['tweet_id', 'label'], axis=1)

In [118]:
train1.head()

Unnamed: 0,safe_text,agreement
0,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,1.0
1,I'm 100% thinking of devoting my career to pro...,1.0
2,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",1.0
3,I mean if they immunize my kid with something ...,1.0
4,Thanks to <user> Catch me performing at La Nui...,1.0


In [120]:
train_array = np.array(train1)

In [122]:
train_array[:5]

array([['Me &amp; The Big Homie meanboy3000 #MEANBOY #MB #MBS #MMR #STEGMANLIFE @ Stegman St. <url>',
        1.0],
       ["I'm 100% thinking of devoting my career to proving autism isn't caused by vaccines due to the IDIOTIC posts I've seen about World Autism Day",
        1.0],
       ['#whatcausesautism VACCINES, DO NOT VACCINATE YOUR CHILD', 1.0],
       ["I mean if they immunize my kid with something that won't secretly kill him years down the line then I'm all for it, but I don't trust that",
        1.0],
       ['Thanks to <user> Catch me performing at La Nuit NYC 1134 1st ave. Show starts at 6! #jennifair #mmr… <url>',
        1.0]], dtype=object)

In [170]:
first = pd.read_csv('NLPtestXG.csv')
second = pd.read_csv('NLPtest.csv')
third = pd.read_csv('NLPtestRF.csv')

avg = (first.label + third.label)/2
submissions.label = avg
submissions.to_csv('average_lg_rf.csv', index = False)

In [50]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=400, max_depth=20, max_features='auto',
                            min_samples_leaf=1, min_samples_split=4, random_state=0)
rf.fit(x_train_tfv, y_train1)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [51]:
predictions = rf.predict(x_test_tfv)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test1, predictions) ** 0.5

0.599617380076707

In [52]:
test_enc = tfv.transform(x_test)
pred = rf.predict(test_enc)
submissions.label = pred
submissions.to_csv('NLPtestRF.csv', index = False)

In [171]:
cv_acc = cross_val_score(rf, x_train_tfv, y_train1, scoring='neg_mean_absolute_error')
cv_acc

array([-0.42472312, -0.42106128, -0.42286713])

In [172]:
cv_acc.mean()

-0.42288384353707914

In [47]:
rf = RandomForestRegressor()

parameters = {'max_depth': [12, 15, 17, 18],
              'n_estimators': [300, 400, 500],
              'min_samples_split': [2, 3, 4]}

clf = GridSearchCV(rf, parameters, n_jobs=5, 
                   cv=StratifiedKFold(n_splits=5, shuffle=True), 
                   scoring='neg_mean_squared_error',
                   verbose=10, refit=True)

In [46]:
rf = RandomForestRegressor()
rf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [48]:
clf.fit(x_train_tfv, y_train1)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:  1.6min
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  3.7min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  6.4min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  9.8min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed: 13.6min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 15.7min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed: 22.1min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed: 31.7min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed: 38.1min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed: 46.6min
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed: 57.3min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed: 66.1min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed: 76.0min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 89.1min
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed: 101.2min
[Parallel(n_jobs=5)]: Done 180 out of 180 | elapsed: 107.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'max_depth': [12, 15, 17, 18], 'n_estimators': [300, 400, 500], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=10)

In [49]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Neg MSE score', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))



Neg MSE score -0.34681892529146524
max_depth: 18
min_samples_split: 4
n_estimators: 400
