In [368]:
import numpy as np
from sklearn import datasets
from collections import Counter
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import completeness_score
from sklearn.feature_selection import SelectPercentile, mutual_info_classif
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression

In [90]:
train_emb_df = pd.read_pickle("./train_emb.pkl")  

In [91]:
train_emb_df.columns[train_emb_df.isna().any()]  #check NAN value

Index([], dtype='object')

In [92]:
train_emb_df = train_emb_df.rename(columns={'TFIDF': 'emb'})
train_emb_features = pd.DataFrame(train_emb_df['emb'].tolist())
train_emb_labels = train_emb_df['Sentiment']

In [95]:
dev_emb_df = pd.read_pickle("./dev_emb.pkl")  #dev_emb build

In [96]:
dev_emb_df.columns[dev_emb_df.isna().any()]

Index([], dtype='object')

In [97]:
dev_emb_df = dev_emb_df.rename(columns={'TFIDF': 'emb'})
dev_emb_features = pd.DataFrame(dev_emb_df['emb'].tolist())
dev_emb_labels = dev_emb_df['Sentiment']

In [149]:
#test emb build
test_emb_df = pd.read_pickle("./test_emb.pkl")
test_emb_df = test_emb_df.rename(columns={'TFIDF': 'emb'})
test_emb_features = pd.DataFrame(test_emb_df['emb'].tolist())

In [135]:
# train tfidf build
train_tfidf_df = pd.read_pickle("./train_tfidf.pkl")
train_tfidf_df.columns[train_tfidf_df.isna().any()]
train_tfidf_features = pd.DataFrame(train_tfidf_df['TFIDF'].tolist())
train_tfidf_labels = train_tfidf_df['Sentiment']

In [136]:
# dev tfidf build
dev_tfidf_df = pd.read_pickle("./dev_tfidf.pkl")
dev_tfidf_df.columns[train_tfidf_df.isna().any()]
dev_tfidf_features = pd.DataFrame(dev_tfidf_df['TFIDF'].tolist())
dev_tfidf_labels = dev_emb_df['Sentiment']

In [102]:
zeroR = DummyClassifier(strategy="most_frequent")   #zeroR baseline for emb
zeroR.fit(train_emb_features, train_emb_labels)

DummyClassifier(strategy='most_frequent')

In [103]:
zeroR.score(dev_emb_features, dev_emb_labels)

0.5

In [174]:
#zeroR baseline for tfidf
zeroR.fit(train_tfidf_features, train_tfidf_labels)
zeroR.score(dev_tfidf_features, dev_tfidf_labels)

0.5

In [424]:
#Gaussian naive bayes for emb
gnb = GaussianNB()
emb_gnb_pred = gnb.fit(train_emb_features, train_emb_labels).predict(dev_emb_features)
print("Number of mislabeled points out of a total %d points : %d" % (dev_emb_features.shape[0], (dev_emb_labels != emb_gnb_pred ).sum()))
gnb_emb=gnb.score(dev_emb_features,dev_emb_labels)

gnb_emb

Number of mislabeled points out of a total 4000 points : 1541


0.61475

In [247]:
#Gaussian naive bayes for tfidf
tfidf_gnb_pred = gnb.fit(train_tfidf_features, train_tfidf_labels).predict(dev_tfidf_features)
print("Number of mislabeled points out of a total %d points : %d" % (dev_tfidf_features.shape[0], (dev_tfidf_labels != tfidf_gnb_pred).sum()))
gnb_tfidf=gnb.score(dev_tfidf_features,dev_tfidf_labels)
gnb_tfidf

Number of mislabeled points out of a total 4000 points : 1419


0.64525

In [422]:
#Logistic regression for emb
emb_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(train_emb_features, train_emb_labels)
emb_lr_rpred = emb_lr_pred.predict(dev_emb_features)
lr_emb=emb_lr_pred.score(dev_emb_features, dev_emb_labels)
lr_emb

0.69825

In [298]:
#Logistic regression for tfidf
tfidf_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(train_tfidf_features, train_tfidf_labels)
tfidf_lr_rpred = tfidf_lr_pred.predict(dev_tfidf_features)
lr_tfidf=tfidf_lr_pred.score(dev_tfidf_features, dev_tfidf_labels)
lr_tfidf

0.67875

In [124]:
dev_emb_labels

0       negative
1       positive
2       negative
3       negative
4       positive
          ...   
3995    positive
3996    positive
3997    positive
3998    positive
3999    negative
Name: Sentiment, Length: 4000, dtype: object

In [131]:
# transform the label to 0,1, so we can use for calculating accuracy
zeroOneDev_emb_labels = dev_emb_labels.replace(to_replace = ['negative', 'positive'], value = [0, 1])

In [137]:
zeroOneDev_tfidf_labels = dev_tfidf_labels.replace(to_replace = ['negative', 'positive'], value = [0, 1])

In [309]:
#cluster kmeans for emb
kmeans_emb = KMeans(n_clusters=2, random_state=0).fit(dev_emb_features)
#print(kmeans.labels_[:4])
#print(zeroOneDev_emb_labels.tolist()[:4])
completeness_score(kmeans_emb.labels_ , zeroOneDev_emb_labels.tolist())

0.021060259873367557

In [288]:
#cluster kmeans for tfidf
kmeans_tfidf = KMeans(n_clusters=2, random_state=0).fit(dev_tfidf_features)
completeness_score(kmeans_tfidf.labels_ , zeroOneDev_tfidf_labels.tolist())

0.020629560978054903

In [440]:
#Multi layer neural network
#clf = MLPClassifier(random_state=1, hidden_layer_sizes=(150,100,50), max_iter=300).fit(train_emb_features, train_emb_labels)

mlp_gs = MLPClassifier(max_iter=500)

#parameter_space = {
    
       #     'activation' : ['identity', 'logistic', 'tanh', 'relu'],
       #     'solver' : ['lbfgs', 'sgd', 'adam'],
      #      'hidden_layer_sizes': [
        #     (1,),(2,),(3,),(4,),(5,)
        #     ]
        
#}
parameter_space = {
    
            'activation' : [ 'tanh'],
            'solver' : [ 'sgd'],
            'hidden_layer_sizes': [
             (4,4,4,),(4,4,4,1,),(4,4,4,2,),(4,4,4,4,),(4,4,4,5,),(4,4,4,7,),(4,4,4,9,)
             ]
        
}
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5)
clf.fit(train_emb_features, train_emb_labels)
print("Best parameters set found on development set:")
print(clf.best_params_)

#emb_MNN_rpred = clf.predict(dev_emb_features)


#clf.score(dev_emb_features, dev_emb_labels)



Best parameters set found on development set:
{'activation': 'tanh', 'hidden_layer_sizes': (4, 4, 4), 'solver': 'sgd'}


In [444]:
#Multi layer neural network
mlp = MLPClassifier(random_state=1, activation='tanh', solver='sgd', hidden_layer_sizes=(4,4,4,), max_iter=500).fit(train_emb_features, train_emb_labels)





emb_MNN_rpred = mlp.predict(dev_emb_features)


mlp.score(dev_emb_features, dev_emb_labels)

0.69975

In [445]:
#stacking classifier
estimators = [
('gnb', GaussianNB()),
    ('rl',LogisticRegression(max_iter=1000,random_state=0)),
    ('mlp',mlp)
           
 ]
stackclf = StackingClassifier(
 estimators=estimators, final_estimator=LogisticRegression())

stackmodel=stackclf.fit(train_emb_features, train_emb_labels)
emb_stack_rpred = stackmodel.predict(dev_emb_features)


stackmodel.score(dev_emb_features, dev_emb_labels)



0.6995

In [446]:
#sub-summary
print("ZeroR accuracy score for emb: "+str(zeroR.score(dev_emb_features, dev_emb_labels)))
print("ZeroR accuracy score for tfidf: "+str(zeroR.score(dev_tfidf_features, dev_tfidf_labels)))
print('\n')
print("Gaussian naive bayes accuracy score for emb: "+str(gnb_emb))
print('pricision: '+str(precision_score(emb_gnb_pred, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_gnb_pred, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_gnb_pred, dev_emb_labels, average='weighted')))
print('\n')
print("Gaussian naive bayes accuracy score for tfidf: "+str(gnb_tfidf))
print('pricision: '+str(precision_score(tfidf_gnb_pred, dev_tfidf_labels, average='macro')))
print('recall: '+str(recall_score(tfidf_gnb_pred, dev_tfidf_labels, average='macro')))
print("F1:"+str(f1_score(tfidf_gnb_pred, dev_tfidf_labels, average='weighted')))
print('\n')

print("Logistic regression accuracy score for emb: "+str(lr_emb))
print('pricision: '+str(precision_score(emb_lr_rpred, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_lr_rpred, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_lr_rpred, dev_emb_labels, average='weighted')))
print('\n')
print("Logistic regression accuracy score for tfidf: "+str(lr_tfidf))
print('pricision: '+str(precision_score(tfidf_lr_rpred, dev_tfidf_labels, average='macro')))
print('recall: '+str(recall_score(tfidf_lr_rpred, dev_tfidf_labels, average='macro')))
print("F1:"+str(f1_score(tfidf_lr_rpred, dev_tfidf_labels, average='weighted')))
print('\n')
print("cluster kmeans accuracy score for emb: "+str(completeness_score(kmeans_emb.labels_ , zeroOneDev_emb_labels.tolist())))
print('pricision: '+str(precision_score(kmeans_emb.labels_, zeroOneDev_emb_labels.tolist(), average='macro')))
print('recall: '+str(recall_score(kmeans_emb.labels_, zeroOneDev_emb_labels.tolist(), average='macro')))
print("F1:"+str(f1_score(kmeans_emb.labels_, zeroOneDev_emb_labels.tolist(), average='weighted')))
print('\n')
print("cluster kmeans accuracy score for tfidf: "+str(completeness_score(kmeans_tfidf.labels_ , zeroOneDev_tfidf_labels.tolist())))
print('pricision: '+str(precision_score(kmeans_tfidf.labels_, zeroOneDev_tfidf_labels.tolist(), average='macro')))
print('recall: '+str(recall_score(kmeans_tfidf.labels_, zeroOneDev_tfidf_labels.tolist(), average='macro')))
print("F1:"+str(f1_score(kmeans_tfidf.labels_, zeroOneDev_tfidf_labels.tolist(), average='weighted')))
print('\n')
print("multi layer neural perceptron accuracy score for emb: "+str(mlp.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(emb_MNN_rpred, dev_tfidf_labels, average='macro')))
print('recall: '+str(recall_score(emb_MNN_rpred, dev_tfidf_labels, average='macro')))
print("F1:"+str(f1_score(emb_MNN_rpred, dev_tfidf_labels, average='weighted')))
print('\n')
print("stacking gaussian NB and logistic Regression accuracy score for emb: "+str(stackmodel.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(emb_stack_rpred, dev_tfidf_labels, average='macro')))
print('recall: '+str(recall_score(emb_stack_rpred, dev_tfidf_labels, average='macro')))
print("F1:"+str(f1_score(emb_stack_rpred, dev_tfidf_labels, average='weighted')))
print('\n')

ZeroR accuracy score for emb: 0.5
ZeroR accuracy score for tfidf: 0.5


Gaussian naive bayes accuracy score for emb: 0.61475
pricision: 0.6147499999999999
recall: 0.6181032160354835
F1:0.6175040771901504


Gaussian naive bayes accuracy score for tfidf: 0.64525
pricision: 0.64525
recall: 0.6452660155782175
F1:0.645259778066383


Logistic regression accuracy score for emb: 0.69825
pricision: 0.69825
recall: 0.6985442922772279
F1:0.6983618586850089


Logistic regression accuracy score for tfidf: 0.67875
pricision: 0.67875
recall: 0.6787825831257748
F1:0.6787646376200517


cluster kmeans accuracy score for emb: 0.021060259873367557
pricision: 0.578
recall: 0.5927689019611109
F1:0.5954918869745591


cluster kmeans accuracy score for tfidf: 0.020629560978054903
pricision: 0.572
recall: 0.5981268671362219
F1:0.6025209912623897


multi layer neural perceptron accuracy score for emb: 0.69975
pricision: 0.6997500000000001
recall: 0.70158853786244
F1:0.7004361532378921


stacking gaussian NB and 

In [189]:
#feature selection
#select top 25% features
selector25 = SelectPercentile(mutual_info_classif, percentile=25)
train_emb_f_reduced25 = selector25.fit_transform(train_emb_features, train_emb_labels)
train_emb_f_reduced25.shape

(40000, 96)

In [190]:
#select top 50% features
selector50 = SelectPercentile(mutual_info_classif, percentile=50)
train_emb_f_reduced50 = selector50.fit_transform(train_emb_features, train_emb_labels)


In [191]:
#select top 75% features
selector75 = SelectPercentile(mutual_info_classif, percentile=75)
train_emb_f_reduced75 = selector75.fit_transform(train_emb_features, train_emb_labels)


In [323]:
#Logistic regression for emb after featureSelection top 25%
emb_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(train_emb_f_reduced25, train_emb_labels)
dev_emb_features_reduced25 = selector25.transform(dev_emb_features)
lr_emb_25=emb_lr_pred.score(dev_emb_features_reduced25, dev_emb_labels)
emb_lr_rpred25 = emb_lr_pred.predict(dev_emb_features_reduced25)

lr_emb_25

0.68425

In [324]:
#Logistic regression for emb after featureSelection top 50%
emb_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(train_emb_f_reduced50, train_emb_labels)
dev_emb_features_reduced50 = selector50.transform(dev_emb_features)
lr_emb_50=emb_lr_pred.score(dev_emb_features_reduced50, dev_emb_labels)
emb_lr_rpred50 = emb_lr_pred.predict(dev_emb_features_reduced50)
lr_emb_50

0.688

In [325]:
#Logistic regression for emb after featureSelection top 75%
emb_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(train_emb_f_reduced75, train_emb_labels)
dev_emb_features_reduced75 = selector75.transform(dev_emb_features)
lr_emb_75=emb_lr_pred.score(dev_emb_features_reduced75, dev_emb_labels)
emb_lr_rpred75 = emb_lr_pred.predict(dev_emb_features_reduced75)
lr_emb_75

0.693

In [358]:
#sub summary for feature selection
print('Logistic regression for emb after featureSelection top 25% accuracy score:'+ str(lr_emb_25))
print('pricision: '+str(precision_score(emb_lr_rpred25, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_lr_rpred25, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_lr_rpred25, dev_emb_labels, average='weighted')))
print('\n')
print('Logistic regression for emb after featureSelection top 50% accuracy score:'+ str(lr_emb_50))
print('pricision: '+str(precision_score(emb_lr_rpred50, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_lr_rpred50, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_lr_rpred50, dev_emb_labels, average='weighted')))
print('\n')
print('Logistic regression for emb after featureSelection top 75% accuracy score:'+ str(lr_emb_75))
print('pricision: '+str(precision_score(emb_lr_rpred75, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_lr_rpred75, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_lr_rpred75, dev_emb_labels, average='weighted')))
print('\n')

Logistic regression for emb after featureSelection top 25% accuracy score:0.68425
pricision: 0.68425
recall: 0.6846666541383998
F1:0.6844282032521469


Logistic regression for emb after featureSelection top 50% accuracy score:0.688
pricision: 0.688
recall: 0.6882718645724426
F1:0.6881126726748354


Logistic regression for emb after featureSelection top 75% accuracy score:0.693
pricision: 0.6930000000000001
recall: 0.6934837092731829
F1:0.6931919949968731




In [280]:
#boosting
clf = GradientBoostingClassifier(n_estimators=5000, learning_rate=0.05,
max_depth=3, random_state=0, n_iter_no_change=20,max_features='log2').fit(train_emb_features, train_emb_labels)
clf.score(dev_emb_features, dev_emb_labels)

0.69575

In [216]:
#semi-supervise --combine features
unlabeled_emb_df = pd.read_pickle("./unlabeled_emb.pkl")  
ublabeled_emb_features = pd.DataFrame(unlabeled_emb_df['TFIDF'].tolist())
ublabeled_emb_labels = pd.DataFrame(unlabeled_emb_df['Sentiment'].tolist())

combinef = pd.concat([train_emb_features,ublabeled_emb_features])
combinef.reset_index(drop=True,inplace=True)
combinef

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.141736,-0.544553,-0.185595,-0.226111,-0.258681,-0.707513,0.827305,-0.347276,0.306913,-0.178794,...,0.117562,-0.401929,0.283496,-0.023253,-0.071337,0.319748,0.582268,-0.480418,-0.091490,-0.270181
1,-0.159905,-0.274271,0.193665,-0.062508,0.319862,-0.233696,0.547739,-0.243858,0.152707,0.101391,...,-0.016827,-0.492055,0.451817,0.152209,-0.255176,-0.003595,0.271348,-0.077961,-0.140713,-0.099328
2,-0.184300,-0.820776,-0.198533,-0.126704,-0.218556,-0.581000,0.685292,0.000823,0.169189,0.041687,...,-0.235139,-0.354844,0.275892,-0.079320,-0.172949,0.141234,0.488196,-0.487415,0.268620,-0.263073
3,-0.118453,0.008326,0.502737,-0.263399,-0.555033,-0.100746,1.178759,0.401233,0.298557,0.144832,...,-0.038454,0.144181,0.059292,0.184386,-0.064873,0.448778,0.472825,-0.058554,-0.128914,-0.508223
4,-0.408979,0.232619,-0.017716,-0.248017,-0.224253,-0.033788,0.821358,0.391244,0.174423,-0.277722,...,0.441039,-0.675858,0.317422,0.092662,0.196140,0.441943,0.580520,0.128066,0.213399,-0.144494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139995,0.007890,0.073103,-0.280805,-0.335471,-0.175296,0.156183,1.132548,0.301846,0.038817,-0.126461,...,0.544730,-0.268315,-0.076898,-0.014175,-0.691903,0.097362,0.072452,0.046252,-0.308882,-0.567449
139996,-0.134720,-0.154996,-0.282151,-0.134100,-0.227636,-0.207565,0.390177,-0.184039,-0.075010,-0.255664,...,0.433176,-0.648247,0.116928,0.025290,-0.006734,0.253278,0.150709,-0.450959,-0.343677,0.168129
139997,-0.526653,-0.436127,-0.526713,-0.224558,0.187997,-0.368143,0.527435,0.058443,0.169143,-0.083301,...,-0.277252,-0.384371,0.128579,0.161634,0.192887,0.016388,0.309959,-0.494707,0.689926,-0.153628
139998,-0.293092,-0.206648,-0.210050,-0.214401,0.000266,0.173398,0.184987,0.299076,0.236207,-0.453665,...,0.493244,-0.332592,0.038494,0.451940,0.044022,0.272157,-0.041516,0.038367,-0.095991,-0.050408


In [223]:
#combine label and unlabel
ublabeled_emb_labels=ublabeled_emb_labels.fillna(-1)  #semi sklearn need unlabled to be -1
ublabeled_emb_labels
combinel =pd.concat([train_emb_labels,ublabeled_emb_labels]) 
combinel.reset_index(drop=True,inplace=True)
combinel.shape


(140000, 1)

In [330]:
#semi model with logestic regression build & predict w threshold 0.75
lr = LogisticRegression(max_iter=2000,random_state=0)
semi_lr_model = SelfTrainingClassifier(lr)
semi_lr_model.fit(combinef, combinel.values.ravel())
emb_semilr_rpred75 = semi_lr_model.predict(dev_emb_features)

In [311]:
semi_lr_model.score(dev_emb_features, dev_emb_labels)

0.70275

In [233]:
pd.DataFrame(semi_lr_model.transduction_).value_counts()

positive    57829
negative    52543
-1          29628
dtype: int64

In [234]:
#semi model with logestic regression build & predict w threshold 0.9
self_training_model9 = SelfTrainingClassifier(lr,threshold=0.9)
self_training_model9.fit(combinef, combinel.values.ravel())


SelfTrainingClassifier(base_estimator=LogisticRegression(max_iter=1000,
                                                         random_state=0),
                       threshold=0.9)

In [333]:
emb_semilr_rpred90= self_training_model9.predict(dev_emb_features)

In [235]:
self_training_model9.score(dev_emb_features, dev_emb_labels)

0.697

In [236]:
#semi model with logestic regression build & predict w threshold 0.65
self_training_model65 = SelfTrainingClassifier(lr,threshold=0.65)
self_training_model65.fit(combinef, combinel.values.ravel())

SelfTrainingClassifier(base_estimator=LogisticRegression(max_iter=1000,
                                                         random_state=0),
                       threshold=0.65)

In [336]:
emb_semilr_rpred65= self_training_model65.predict(dev_emb_features)

In [237]:
self_training_model65.score(dev_emb_features, dev_emb_labels)

0.7015

In [267]:
#semi model with naive bayes build & predict w threshold 0.75
gnb = GaussianNB()
semi_gnb_model = SelfTrainingClassifier(gnb)
semi_gnb_model.fit(combinef, combinel.values.ravel())

SelfTrainingClassifier(base_estimator=GaussianNB())

In [339]:
semi_gnb_model75pre=semi_gnb_model.predict(dev_emb_features)

In [268]:
semi_gnb_model.score(dev_emb_features, dev_emb_labels)

0.58825

In [269]:
pd.DataFrame(semi_gnb_model.transduction_).value_counts()

negative    78748
positive    61224
-1             28
dtype: int64

In [270]:
#semi model with naive bayes build & predict w threshold 0.85
gnb = GaussianNB()
semi_gnb_model85 = SelfTrainingClassifier(gnb,threshold=0.85)
semi_gnb_model85.fit(combinef, combinel.values.ravel())

SelfTrainingClassifier(base_estimator=GaussianNB(), threshold=0.85)

In [341]:
semi_gnb_model85pre=semi_gnb_model85.predict(dev_emb_features)

In [271]:
semi_gnb_model85.score(dev_emb_features, dev_emb_labels)

0.588

In [272]:
pd.DataFrame(semi_gnb_model85.transduction_).value_counts()

negative    79522
positive    60424
-1             54
dtype: int64

In [442]:
#semi model with multi layer percepcion build & predict w threshold 0.75
model = mlp_gs
semi_MLP_model = SelfTrainingClassifier(model)
semi_MLP_model.fit(combinef, combinel.values.ravel())
semi_MLP_model75pre=semi_MLP_model.predict(dev_emb_features)
semi_MLP_model.score(dev_emb_features, dev_emb_labels)

0.63725

In [344]:
#semi-summary
print('#semi model with logestic regression w threshold 0.75: '+str(semi_lr_model.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(emb_semilr_rpred75, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_semilr_rpred75, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_semilr_rpred75, dev_emb_labels, average='weighted')))
print('\n')
print('#semi model with logestic regression w threshold 0.9: '+str(self_training_model9.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(emb_semilr_rpred90, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_semilr_rpred90, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_semilr_rpred90, dev_emb_labels, average='weighted')))
print('\n')
print('#semi model with logestic regression w threshold 0.65: '+str(self_training_model65.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(emb_semilr_rpred65, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(emb_semilr_rpred65, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(emb_semilr_rpred65, dev_emb_labels, average='weighted')))
print('\n')
print('#semi model with gaussianNB w threshold 0.75: '+str(semi_gnb_model.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(semi_gnb_model75pre, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(semi_gnb_model75pre, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(semi_gnb_model75pre, dev_emb_labels, average='weighted')))
print('\n')
print('#semi model with gaussianNB w threshold 0.85: '+str(semi_gnb_model85.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(semi_gnb_model85pre, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(semi_gnb_model85pre, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(semi_gnb_model85pre, dev_emb_labels, average='weighted')))
print('\n')
print('#semi model with multi layer percepcion w threshold 0.85: '+str(semi_gnb_model85.score(dev_emb_features, dev_emb_labels)))
print('pricision: '+str(precision_score(semi_gnb_model85pre, dev_emb_labels, average='macro')))
print('recall: '+str(recall_score(semi_gnb_model85pre, dev_emb_labels, average='macro')))
print("F1:"+str(f1_score(semi_gnb_model85pre, dev_emb_labels, average='weighted')))
print('\n')



#semi model with logestic regression w threshold 0.75: 0.70275
pricision: 0.70275
recall: 0.7033993014199578
F1:0.7029874135489754


#semi model with logestic regression w threshold 0.9: 0.697
pricision: 0.6970000000000001
recall: 0.6971232020012508
F1:0.6970473511486169


#semi model with logestic regression w threshold 0.65: 0.7015
pricision: 0.7015
recall: 0.7023287385129491
F1:0.7018059773207765


#semi model with gaussianNB w threshold 0.75: 0.58825
pricision: 0.5882499999999999
recall: 0.5982827251544472
F1:0.5990330703640063


#semi model with gaussianNB w threshold 0.85: 0.588
pricision: 0.5880000000000001
recall: 0.5981092781275399
F1:0.5988938513338221




In [382]:
#feature Engineering
train_tweet_df = pd.read_pickle("./train.pkl")
train_tweetdev_df = pd.read_pickle("./dev.pkl")
train_tweet_df['text']
train_tweet = train_tweet_df['text'].tolist()
train_tweetdev = train_tweetdev_df['text'].tolist()

In [383]:
polarity =[]
devpolarity = []
analyser = SentimentIntensityAnalyzer()
for i in train_tweet:
    polarity.append(analyser.polarity_scores(i))
for i in train_tweetdev:
    devpolarity.append(analyser.polarity_scores(i))

In [384]:
polarityf = pd.DataFrame(polarity)
devpolarityf=pd.DataFrame(devpolarity)
polarityf

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.641,0.359,0.4215
1,0.000,1.000,0.000,0.0000
2,0.000,1.000,0.000,0.0000
3,0.000,0.682,0.318,0.6369
4,0.137,0.863,0.000,-0.2263
...,...,...,...,...
39995,0.000,1.000,0.000,0.0000
39996,0.333,0.667,0.000,-0.6115
39997,0.213,0.787,0.000,-0.5848
39998,0.000,0.678,0.322,0.2263


In [385]:
train_emb_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.141736,-0.544553,-0.185595,-0.226111,-0.258681,-0.707513,0.827305,-0.347276,0.306913,-0.178794,...,0.117562,-0.401929,0.283496,-0.023253,-0.071337,0.319748,0.582268,-0.480418,-0.091490,-0.270181
1,-0.159905,-0.274271,0.193665,-0.062508,0.319862,-0.233696,0.547739,-0.243858,0.152707,0.101391,...,-0.016827,-0.492055,0.451817,0.152209,-0.255176,-0.003595,0.271348,-0.077961,-0.140713,-0.099328
2,-0.184300,-0.820776,-0.198533,-0.126704,-0.218556,-0.581000,0.685292,0.000823,0.169189,0.041687,...,-0.235139,-0.354844,0.275892,-0.079320,-0.172949,0.141234,0.488196,-0.487415,0.268620,-0.263073
3,-0.118453,0.008326,0.502737,-0.263399,-0.555033,-0.100746,1.178759,0.401233,0.298557,0.144832,...,-0.038454,0.144181,0.059292,0.184386,-0.064873,0.448778,0.472825,-0.058554,-0.128914,-0.508223
4,-0.408979,0.232619,-0.017716,-0.248017,-0.224253,-0.033788,0.821358,0.391244,0.174423,-0.277722,...,0.441039,-0.675858,0.317422,0.092662,0.196140,0.441943,0.580520,0.128066,0.213399,-0.144494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,-0.430240,-0.382024,-0.316330,-0.309180,0.120179,-0.428284,0.831630,-0.048464,0.182245,0.094724,...,-0.013516,-0.120240,0.318467,0.132105,-0.052484,-0.055811,0.305198,-0.442307,0.294606,-0.031146
39996,-0.338988,-0.482711,0.209065,-0.207502,-0.181979,-0.258415,0.691391,0.322848,0.253605,-0.090219,...,-0.095638,-0.143568,0.266862,0.326890,-0.208161,0.205226,0.805715,-0.275289,0.198044,0.012185
39997,-0.293665,0.241879,-0.232575,-0.098800,-0.169655,-0.621847,0.139723,0.340485,0.038315,-0.112721,...,0.122672,-0.897764,0.242542,0.295738,-0.303057,0.061978,0.163468,-0.160094,-0.040386,-0.061228
39998,-0.034713,-0.587140,-0.586187,0.073634,0.016169,-0.229916,-0.416593,0.191784,0.398879,-0.663269,...,0.450470,-0.237810,0.212717,-0.538136,0.020622,0.078669,-0.449346,-0.086853,-0.273231,-0.576859


In [387]:
train_emb_features_more = pd.concat([train_emb_features, polarityf], axis=1)
dev_emb_features_more = pd.concat([dev_emb_features, devpolarityf], axis=1)
dev_emb_features_more

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,378,379,380,381,382,383,neg,neu,pos,compound
0,0.647439,-0.160090,-0.038481,-0.055743,-0.026119,-0.319144,0.016490,-0.523501,-0.423618,-0.144300,...,0.031740,0.009924,0.070147,-0.056241,-0.597363,-0.147419,0.000,1.000,0.000,0.0000
1,-0.464021,-0.016592,-0.115892,0.066312,-0.375523,-0.073790,0.489918,0.215952,-0.256112,0.171260,...,0.376908,0.371955,0.634615,-0.743679,0.213966,0.108167,0.000,1.000,0.000,0.0000
2,-0.213534,-0.138032,0.006368,0.138779,0.597589,-0.454876,-0.455043,0.716374,0.346029,0.130042,...,-0.134962,0.589568,-0.023502,0.311702,-0.610152,0.472914,0.000,1.000,0.000,0.0000
3,-0.116541,0.066684,-0.162548,0.283333,-0.324099,-0.180485,1.083803,-0.622489,0.022407,-0.292906,...,-0.295738,0.179404,0.937333,-0.181149,-0.435988,-0.094660,0.473,0.310,0.217,-0.5106
4,-0.073845,0.119823,-0.140898,-0.030531,-0.156140,-0.122156,0.522053,0.091225,0.184444,-0.130128,...,-0.309555,-0.061245,0.555611,-0.190059,-0.118419,0.462156,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.330185,0.128467,0.135211,-0.603940,-0.680551,0.436781,0.587363,0.700500,0.072278,-0.053482,...,-1.076681,0.068563,0.018803,-0.404385,-0.406231,0.345451,0.000,1.000,0.000,0.0000
3996,0.268135,-0.193721,0.505916,0.017008,0.780905,-0.351398,0.126403,0.176521,-0.070430,-0.027673,...,-0.309937,-0.249904,0.387103,-0.089303,-0.185726,0.358549,0.000,1.000,0.000,0.0000
3997,-0.410717,-0.416210,-0.008091,-0.155333,0.037716,-0.662711,0.769891,0.196149,0.079130,-0.289609,...,-0.249280,0.235375,0.659818,0.029833,0.191358,-0.102210,0.000,0.818,0.182,0.4404
3998,-0.138316,0.229265,0.170203,-0.347052,-0.093248,0.015622,0.594051,-0.099069,0.197505,-0.444591,...,0.083219,0.075853,0.293239,0.207108,0.382951,-0.078980,0.000,0.872,0.128,0.2942


In [388]:
emb_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(train_emb_features_more, train_emb_labels)
emb_lr_rpred = emb_lr_pred.predict(dev_emb_features_more)
lr_emb=emb_lr_pred.score(dev_emb_features_more, dev_emb_labels)
lr_emb

0.69175

In [390]:
emb_gnb_pred = gnb.fit(train_emb_features_more, train_emb_labels).predict(dev_emb_features_more)
print("Number of mislabeled points out of a total %d points : %d" % (dev_emb_features_more.shape[0], (dev_emb_labels != emb_gnb_pred ).sum()))
gnb_emb=gnb.score(dev_emb_features_more,dev_emb_labels)
gnb_emb

Number of mislabeled points out of a total 4000 points : 1519


0.62025

In [391]:
emb_lr_pred = LogisticRegression(max_iter=1000,random_state=0).fit(polarityf, train_emb_labels)
emb_lr_rpred = emb_lr_pred.predict(devpolarityf)
lr_emb=emb_lr_pred.score(devpolarityf, dev_emb_labels)
lr_emb

0.5695

In [312]:
kaggle=semi_lr_model.predict(test_emb_features)
kaggle = pd.DataFrame(kaggle, columns = ['Category'])

kaggle.to_csv("kaggle.csv",header=True ,index=True)