In [0]:
import pandas as pd
# Load the tab-separated train and dev splits (the dev split is used as the test set
# throughout this notebook). Both files are read from the current working directory.
X_train, X_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('all_train_data.tsv', 'all_dev_data.tsv')
)

## Subtask C

In [0]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

# Text normalisers instantiated once for the whole notebook.
# NOTE(review): neither object is referenced in the cells visible here — confirm they are still needed.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
  
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [0]:
# Separate the Subtask C label from the features and label-encode it.
#
# The cell is guarded so that re-running it is a no-op: once the label column has been
# dropped from the frames, reading it again would raise, and the previously computed
# y_train / y_test / le1 are still valid in the kernel.
TARGET_COLUMN = 'subtask_c'

if TARGET_COLUMN in X_train.columns:
    # Fit the encoder on the training labels only, then reuse the same mapping for dev
    # so both splits share identical integer codes.
    le1 = LabelEncoder()
    y_train = le1.fit_transform(X_train[TARGET_COLUMN])
    y_test = le1.transform(X_test[TARGET_COLUMN])

    # Keep only the feature columns in the frames used by the pipelines below.
    X_train = X_train.drop([TARGET_COLUMN], axis=1)
    X_test = X_test.drop([TARGET_COLUMN], axis=1)

In [0]:
# Sanity check: (rows, columns) of the dev-set feature frame.
X_test.shape

(776, 1)

In [0]:
# Trivial baseline: predict the class encoded as 1 for every dev tweet. Its macro-F1 is
# the floor that every trained model below should beat.
# Integer dtype keeps the predictions in the same label space as the encoded y_test.
y_pred = np.ones(len(y_test), dtype=int)

# f1_score takes (y_true, y_pred). The arguments used to be swapped, which is
# inconsistent with the other scoring helpers in this notebook and makes sklearn's
# undefined-metric warning refer to recall / true samples instead of
# precision / predicted samples.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(macro_f1)

0.25543190249072606


  'recall', 'true', average, warn_for)


In [0]:
def subtask_c_metrics(model):
    """Print the macro-averaged F1 score of ``model`` on the dev set.

    ``model`` must already be fitted and accept raw tweet text in ``predict``.
    Reads the notebook-level ``X_test`` and ``y_test``; returns nothing.
    """
    dev_predictions = model.predict(X_test['tweet'])
    print(f1_score(y_test, dev_predictions, average='macro'))

In [0]:
# Machine Learning
# Baseline: Multinomial Naive Bayes on raw uni+bigram counts (bag of words).

nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('clf', MultinomialNB()),
])
nb_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(nb_clf)

0.5008277440117469


In [0]:
# Baseline: linear SVM on raw uni+bigram counts (bag of words).

from sklearn.svm import LinearSVC

svc_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])
svc_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(svc_clf)

0.5087083296094379


In [0]:
# Baseline: multinomial Logistic Regression (LogReg) on raw 1-3 gram counts.

from sklearn.linear_model import LogisticRegression

lg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    # C=1e5 means the L2 penalty is very weak (C is the inverse regularisation strength).
    ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')),
])
lg_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(lg_clf)

0.508220225884794


In [0]:
# Linear SVM on TF-IDF-weighted 1-4 gram counts.

svm_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
svm_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(svm_clf)

0.538569860382148


In [0]:
# Multinomial Naive Bayes on TF-IDF-weighted 1-4 gram counts.
# Note: this rebinds `nb_clf`, replacing the count-based NB pipeline fitted earlier.

nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(nb_clf)

0.23906426740009948


In [0]:
# Multinomial Logistic Regression on TF-IDF-weighted 1-3 gram counts.
# Note: this rebinds `lg_clf`, replacing the count-based LogReg pipeline fitted earlier.

lg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')),
])
lg_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(lg_clf)

0.5339274979086942




In [0]:
# Hard-voting ensemble of LogReg, Naive Bayes and linear SVM on TF-IDF 1-4 grams.

from sklearn.ensemble import VotingClassifier

# Base learners that each cast one vote per tweet.
model1 = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
model2 = MultinomialNB()
model3 = LinearSVC()

# voting='hard' takes the majority of the predicted labels.
eclf1 = VotingClassifier(
    estimators=[('lr', model1), ('nb', model2), ('svm', model3)],
    voting='hard',
)
ensemble = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', eclf1),
])
ensemble.fit(X_train['tweet'], y_train)
subtask_c_metrics(ensemble)



0.5334608911869623


In [0]:
# XGBoost (default hyper-parameters) on TF-IDF-weighted 1-4 gram counts.
from xgboost import XGBClassifier

xg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])
xg_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(xg_clf)

0.4442300976033608


In [0]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (default hyper-parameters, unseeded) on TF-IDF-weighted uni+bigrams.
rg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])
rg_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(rg_clf)

0.4621590154226834


In [0]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (default hyper-parameters, unseeded) on TF-IDF-weighted uni+bigrams.
# Stored under its own name: this cell used to reuse `rg_clf`, which silently
# overwrote the random-forest pipeline with an AdaBoost one.
ada_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', AdaBoostClassifier()),
])
ada_clf.fit(X_train.tweet, y_train)
subtask_c_metrics(ada_clf)

0.4315198748596998


In [0]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting (default hyper-parameters, unseeded) on TF-IDF-weighted uni+bigrams.
# Note: `rg_clf` is rebound here, so after this cell it refers to the GBM pipeline.
rg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', GradientBoostingClassifier()),
])
rg_clf.fit(X_train['tweet'], y_train)
subtask_c_metrics(rg_clf)

0.4738127271426258


### Sampling Techniques

In [0]:
from sklearn.model_selection import StratifiedKFold
def lr_cv(pipeline, print_conf=False):
    """Fit ``pipeline`` on the training tweets and print its macro-F1 on the dev tweets.

    Despite the name, no cross-validation is performed: the pipeline is fitted once on
    the notebook-level ``X_train`` / ``y_train`` and scored once on ``X_test`` / ``y_test``.

    Parameters
    ----------
    pipeline : estimator with ``fit`` / ``predict`` that accepts raw tweet text.
    print_conf : bool, default False
        When true, also print the confusion matrix of the dev predictions.
    """
    fitted = pipeline.fit(X_train.tweet, y_train)
    dev_pred = fitted.predict(X_test.tweet)
    print("f1 score: {:.5f} ".format(f1_score(y_test, dev_pred, average='macro')))
    if print_conf:
        print(confusion_matrix(y_test, dev_pred))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

### Logistic Regression

In [0]:
# Logistic Regression on TF-IDF 1-2 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LogisticRegression(solver='lbfgs')

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.53361 


f1 score: 0.53386 


f1 score: 0.53872 


f1 score: 0.52361 


f1 score: 0.54319 




f1 score: 0.54126 


In [0]:
# Logistic Regression on TF-IDF 1-3 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LogisticRegression(solver='lbfgs')

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.52379 


f1 score: 0.52520 


f1 score: 0.52661 


f1 score: 0.49473 


f1 score: 0.52204 




f1 score: 0.52844 


In [0]:
# Logistic Regression on TF-IDF 1-4 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LogisticRegression(solver='lbfgs')

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.48803 


f1 score: 0.50012 


f1 score: 0.49483 


f1 score: 0.48456 


f1 score: 0.46920 




f1 score: 0.48785 


### SVM

In [0]:
# Linear SVM on TF-IDF 1-2 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LinearSVC()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.51210 
f1 score: 0.51341 
f1 score: 0.52141 
f1 score: 0.51887 
f1 score: 0.52129 




f1 score: 0.51958 


In [0]:
# Linear SVM on TF-IDF 1-3 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LinearSVC()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.49052 
f1 score: 0.51718 
f1 score: 0.50587 
f1 score: 0.49314 
f1 score: 0.48695 




f1 score: 0.48541 


In [0]:
# Linear SVM on TF-IDF 1-4 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LinearSVC()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.44682 
f1 score: 0.47394 
f1 score: 0.46861 
f1 score: 0.44161 
f1 score: 0.43493 




f1 score: 0.44884 


### Bagging Classifier

In [0]:
# Bagging classifier (defaults, unseeded) on TF-IDF 1-2 gram features under six
# resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.46273 
f1 score: 0.45828 
f1 score: 0.48728 
f1 score: 0.44990 
f1 score: 0.45810 




f1 score: 0.47354 


In [0]:
# Bagging classifier (defaults, unseeded) on TF-IDF 1-3 gram features under six
# resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.44491 
f1 score: 0.47695 
f1 score: 0.46545 
f1 score: 0.49250 
f1 score: 0.50584 




f1 score: 0.47463 


In [0]:
# Bagging classifier (defaults, unseeded) on TF-IDF 1-4 gram features under six
# resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.42307 
f1 score: 0.40289 
f1 score: 0.42068 
f1 score: 0.42844 
f1 score: 0.43269 




f1 score: 0.43063 


### XGBoost

In [0]:
# XGBoost (defaults) on TF-IDF 1-2 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

lr_cv(ROS_pipeline)
# Only the SMOTE run also prints its confusion matrix.
lr_cv(SMOTE_pipeline, True)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.49070 
f1 score: 0.47176 
[[ 96  20  99]
 [ 53 269 160]
 [ 19  16  44]]
f1 score: 0.49370 
f1 score: 0.50027 
f1 score: 0.49700 




f1 score: 0.47904 


In [0]:
# XGBoost (defaults) on TF-IDF 1-3 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = XGBClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.52654 
f1 score: 0.48877 
f1 score: 0.50428 
f1 score: 0.49592 
f1 score: 0.52342 




f1 score: 0.50102 


In [0]:
# XGBoost (defaults) on TF-IDF 1-4 gram features under six resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = XGBClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.44042 
f1 score: 0.47280 
f1 score: 0.42065 
f1 score: 0.45872 
f1 score: 0.46481 




f1 score: 0.46208 


### GBM

In [0]:
# Gradient boosting (defaults, unseeded) on TF-IDF 1-2 gram features under six
# resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = GradientBoostingClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.48990 
f1 score: 0.49562 
f1 score: 0.51358 
f1 score: 0.52018 
f1 score: 0.49275 




f1 score: 0.50242 


In [0]:
# Gradient boosting (defaults, unseeded) on TF-IDF 1-3 gram features under six
# resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = GradientBoostingClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.51707 
f1 score: 0.51612 
f1 score: 0.51057 
f1 score: 0.51733 
f1 score: 0.52909 




f1 score: 0.52595 


In [0]:
# Gradient boosting (defaults, unseeded) on TF-IDF 1-4 gram features under six
# resampling strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = GradientBoostingClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.45789 
f1 score: 0.47589 
f1 score: 0.44706 
f1 score: 0.47161 
f1 score: 0.45866 




f1 score: 0.47472 


### AdaBoost

In [0]:
# AdaBoost (defaults, unseeded) on TF-IDF 1-2 gram features under six resampling
# strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = AdaBoostClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.47743 
f1 score: 0.46684 
f1 score: 0.50378 
f1 score: 0.47275 
f1 score: 0.46772 




f1 score: 0.47035 


In [0]:
# AdaBoost (defaults, unseeded) on TF-IDF 1-3 gram features under six resampling
# strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = AdaBoostClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.47887 
f1 score: 0.46667 
f1 score: 0.50002 
f1 score: 0.49682 
f1 score: 0.51055 




f1 score: 0.48201 


In [0]:
# AdaBoost (defaults, unseeded) on TF-IDF 1-4 gram features under six resampling
# strategies.
# `tvec` and `model` are shared by all six pipelines; that is safe only because lr_cv
# refits a pipeline immediately before predicting with it.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = AdaBoostClassifier()

# Stochastic samplers are seeded for repeatability.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), model)
# NearMiss: `sampling_strategy` replaces the deprecated `ratio` keyword, and the
# deprecated `random_state` argument is dropped (imbalanced-learn >= 0.4).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=1), model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=2), model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version=3, n_neighbors_ver3=4), model)

for resampled_pipeline in (ROS_pipeline, SMOTE_pipeline, RUS_pipeline,
                           NM1_pipeline, NM2_pipeline, NM3_pipeline):
    lr_cv(resampled_pipeline)

f1 score: 0.43808 
f1 score: 0.47743 
f1 score: 0.41880 
f1 score: 0.46084 
f1 score: 0.46232 




f1 score: 0.43772 


### Ensemble

In [0]:
# Soft-voting ensemble (LogReg + gradient boosting + XGBoost) on TF-IDF uni+bigrams,
# trained on over-sampled data.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = GradientBoostingClassifier()
model3 = XGBClassifier()
# create the ensemble model
# model2 is a GradientBoostingClassifier, so it is labelled 'gbm'; it was previously
# mislabelled 'adaboost'.
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('gbm', model2), ('xgboost', model3)], voting='soft')

# Random over-sampling before the ensemble.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(ROS_pipeline,True)

# SMOTE over-sampling before the ensemble.
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(SMOTE_pipeline,True)


f1 score: 0.52582 
[[116  38  61]
 [ 74 357  51]
 [ 24  28  27]]


f1 score: 0.52303 
[[117  34  64]
 [ 75 332  75]
 [ 24  22  33]]


In [0]:
# Hard-voting ensemble (LogReg + gradient boosting + XGBoost) on TF-IDF uni+bigrams,
# trained on over-sampled data.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = GradientBoostingClassifier()
model3 = XGBClassifier()
# create the ensemble model
# model2 is a GradientBoostingClassifier, so it is labelled 'gbm'; it was previously
# mislabelled 'adaboost'.
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('gbm', model2), ('xgboost', model3)], voting='hard')

# Random over-sampling before the ensemble.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(ROS_pipeline,True)

# SMOTE over-sampling before the ensemble.
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(SMOTE_pipeline,True)


f1 score: 0.50315 
[[108  37  70]
 [ 69 336  77]
 [ 24  26  29]]


f1 score: 0.50092 
[[116  25  74]
 [ 79 295 108]
 [ 23  19  37]]


In [0]:
# Soft-voting ensemble of LogReg, AdaBoost and XGBoost on TF-IDF uni+bigram features.
tvec = TfidfVectorizer(ngram_range=(1, 2), max_features=100000, stop_words=None)
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
eclf1 = VotingClassifier(
    estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)],
    voting='soft',
)

# Random over-sampling, with the confusion matrix printed alongside the score.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), eclf1)
lr_cv(ROS_pipeline, print_conf=True)

# Same ensemble behind SMOTE over-sampling.
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), eclf1)
lr_cv(SMOTE_pipeline, print_conf=True)


f1 score: 0.53913 
[[124  42  49]
 [ 75 365  42]
 [ 23  31  25]]


f1 score: 0.53327 
[[120  37  58]
 [ 73 348  61]
 [ 22  27  30]]


In [0]:
# Hard-voting ensemble of LogReg, AdaBoost and XGBoost on TF-IDF uni+bigram features.
tvec = TfidfVectorizer(ngram_range=(1, 2), max_features=100000, stop_words=None)
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
eclf1 = VotingClassifier(
    estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)],
    voting='hard',
)

# Random over-sampling, with the confusion matrix printed alongside the score.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777), eclf1)
lr_cv(ROS_pipeline, print_conf=True)

# Same ensemble behind SMOTE over-sampling.
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), eclf1)
lr_cv(SMOTE_pipeline, print_conf=True)


f1 score: 0.51975 
[[119  40  56]
 [ 85 341  56]
 [ 24  27  28]]


f1 score: 0.51188 
[[118  28  69]
 [ 82 324  76]
 [ 25  23  31]]
