In [0]:
import pandas as pd
# Load the tab-separated training and dev splits (train first, then dev).
X_train, X_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('grp_train_data.tsv', 'grp_dev_data.tsv')
)

## Subtask C

In [0]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

# NOTE(review): scipy.stats.itemfreq was deprecated in SciPy 1.0 and removed in 1.3, so this
# import raises ImportError on current SciPy. itemfreq is not used in any visible cell.
from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

# Lemmatizer and stemmer instances; neither is referenced by the visible cells.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
  
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
from sklearn.metrics import f1_score
# Duplicate of the train_test_split import above.
from sklearn.model_selection import train_test_split

In [0]:
# Separate the `subtask_c` target from the features of both splits.
# NOTE: this cell overwrites X_train / X_test, so it can only be run once per kernel session.
y_train, y_test = X_train['subtask_c'], X_test['subtask_c']
X_train = X_train.drop(columns=['subtask_c'])
X_test = X_test.drop(columns=['subtask_c'])

# Integer-encode the labels: the encoder is fit on the training labels only and
# then reused, unchanged, for the dev labels.
le1 = LabelEncoder()
y_train = le1.fit_transform(y_train)
y_test = le1.transform(y_test)

In [0]:
# Dev-set dimensions (rows, columns) after the label column has been dropped.
X_test.shape

(776, 1)

In [0]:
# Trivial baseline: predict encoded class 1 for every dev tweet and score it.
# f1_score's signature is (y_true, y_pred); the arguments used to be swapped, which made
# sklearn's undefined-metric warning blame the wrong side ('recall' / 'true' samples).
# The macro F1 value itself is unaffected because per-class F1 is symmetric in
# precision and recall.
y_pred = np.ones_like(y_test)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(macro_f1)

0.25543190249072606


  'recall', 'true', average, warn_for)


In [0]:
def subtask_c_metrics(model):
    """Print the macro-averaged F1 of a fitted `model` on the dev set.

    `model` must expose `predict`. It is applied to the notebook-level
    `X_test.tweet` and its output is compared with the notebook-level `y_test`.
    Nothing is returned.
    """
    dev_predictions = model.predict(X_test.tweet)
    print(f1_score(y_test, dev_predictions, average='macro'))

In [0]:
# Machine Learning
# Multinomial Naive Bayes (NB) on raw uni-/bi-gram counts (bag of words).

nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('clf', MultinomialNB()),
])
nb_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(nb_clf)

0.4596072954594576


In [0]:
# Linear SVM on raw uni-/bi-gram counts (bag of words).

from sklearn.svm import LinearSVC

svc_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])
svc_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(svc_clf)

0.49457820451885737


In [0]:
# Logistic Regression (LogReg) on raw 1- to 3-gram counts (bag of words).

from sklearn.linear_model import LogisticRegression 

# C=1e5 is a very weak L2 penalty, i.e. close to an unregularised multinomial fit.
lg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')),
])
lg_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(lg_clf)

0.4957881704654459


In [0]:
# Linear SVM on TF-IDF-weighted 1- to 4-gram counts.

svm_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
svm_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(svm_clf)

0.49447311866100274


In [0]:
# Multinomial Naive Bayes on TF-IDF-weighted 1- to 4-gram counts.
# NOTE: rebinds `nb_clf`, replacing the bag-of-words NB pipeline fitted earlier.

nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(nb_clf)

0.43603190577360323


  'precision', 'predicted', average, warn_for)


In [0]:
# Logistic Regression on TF-IDF-weighted 1- to 3-gram counts.
# NOTE: rebinds `lg_clf`, replacing the bag-of-words LogReg pipeline fitted earlier.

lg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')),
])
lg_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(lg_clf)

0.5089303730608078




In [0]:
# Majority-vote ensemble of LogReg, Naive Bayes and linear SVM on TF-IDF 1- to 4-grams.

from sklearn.ensemble import VotingClassifier

# Base learners, configured as in the single-model cells.
model1 = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
model2 = MultinomialNB()
model3 = LinearSVC()

# Hard voting: the predicted class is the one most base learners agree on.
eclf1 = VotingClassifier(
    estimators=[('lr', model1), ('nb', model2), ('svm', model3)],
    voting='hard',
)
ensemble = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', eclf1),
])
ensemble.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(ensemble)



0.49430761801279727


In [0]:
# XGBoost (default hyper-parameters) on TF-IDF-weighted 1- to 4-gram counts.
from xgboost import XGBClassifier

xg_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])
xg_clf.fit(X_train.tweet, y_train)  # fit() returns the pipeline itself
subtask_c_metrics(xg_clf)

0.47570993914807297


In [0]:
# Random forest on TF-IDF-weighted uni-/bi-gram counts.
from sklearn.ensemble import RandomForestClassifier

# The forest is stochastic; it is seeded (same seed as the resamplers used later in
# this notebook) so the printed macro F1 is reproducible across runs.
rg_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=777)),
])
rg_clf = rg_clf.fit(X_train.tweet,y_train)
subtask_c_metrics(rg_clf)

0.44010633691060397


In [0]:
# AdaBoost on TF-IDF-weighted uni-/bi-gram counts.
# NOTE: the pipeline is still bound to `rg_clf`, replacing the model from the previous cell.
from sklearn.ensemble import AdaBoostClassifier

# Seeded (same seed as the resamplers used later in this notebook) so the printed
# macro F1 is reproducible across runs.
rg_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', AdaBoostClassifier(random_state=777)),
])
rg_clf = rg_clf.fit(X_train.tweet,y_train)
subtask_c_metrics(rg_clf)

0.4680981696498438


In [0]:
# Gradient boosting on TF-IDF-weighted uni-/bi-gram counts.
# NOTE: the pipeline is still bound to `rg_clf`, replacing the model from the previous cell.
from sklearn.ensemble import GradientBoostingClassifier

# Seeded (same seed as the resamplers used later in this notebook) so the printed
# macro F1 is reproducible across runs.
rg_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', GradientBoostingClassifier(random_state=777)),
])
rg_clf = rg_clf.fit(X_train.tweet,y_train)
subtask_c_metrics(rg_clf)

0.5089956375670662


### Sampling Techniques

In [0]:
from sklearn.model_selection import StratifiedKFold
def lr_cv(pipeline, print_conf=False):
    """Fit `pipeline` on the training tweets and report macro F1 on the dev set.

    Despite the name, no cross-validation is performed: the pipeline is fit once
    on the notebook-level `X_train.tweet` / `y_train` and scored once on
    `X_test.tweet` / `y_test`.

    Parameters
    ----------
    pipeline : estimator exposing `fit` and `predict`
        Fitted in place by this call.
    print_conf : bool, default False
        When truthy, also print the dev-set confusion matrix.
    """
    fitted = pipeline.fit(X_train.tweet, y_train)
    dev_predictions = fitted.predict(X_test.tweet)
    macro_f1 = f1_score(y_test, dev_predictions, average='macro')
    print("f1 score: {:.5f} ".format(macro_f1))
    if not print_conf:
        return
    print(confusion_matrix(y_test, dev_predictions))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

### Logistic Regression

In [0]:
# Logistic regression on TF-IDF 1-2 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.54492 


f1 score: 0.55583 


f1 score: 0.48659 


f1 score: 0.43587 


f1 score: 0.46275 


f1 score: 0.47978 


In [0]:
# Logistic regression on TF-IDF 1-3 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.54377 


f1 score: 0.54173 


f1 score: 0.47678 


f1 score: 0.39886 


f1 score: 0.48516 


f1 score: 0.42069 


In [0]:
# Logistic regression on TF-IDF 1-4 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.50954 


f1 score: 0.51354 


f1 score: 0.43896 


f1 score: 0.36574 


f1 score: 0.43413 


f1 score: 0.41786 


### SVM

In [0]:
# Linear SVM on TF-IDF 1-2 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.52071 
f1 score: 0.51508 
f1 score: 0.47344 
f1 score: 0.41580 
f1 score: 0.46023 
f1 score: 0.45689 


In [0]:
# Linear SVM on TF-IDF 1-3 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.52774 
f1 score: 0.52891 
f1 score: 0.47275 
f1 score: 0.38929 
f1 score: 0.48998 
f1 score: 0.42903 


In [0]:
# Linear SVM on TF-IDF 1-4 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.49524 
f1 score: 0.49977 
f1 score: 0.43896 
f1 score: 0.35665 
f1 score: 0.43162 
f1 score: 0.41912 


### Bagging Classifier

In [0]:
# Bagging classifier on TF-IDF 1-2 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
from sklearn.ensemble import BaggingClassifier
# Seeded like the resamplers so the bootstrap draws, and thus the scores, are reproducible.
model = BaggingClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.46770 
f1 score: 0.46746 
f1 score: 0.44918 
f1 score: 0.45982 
f1 score: 0.42081 
f1 score: 0.45390 


In [0]:
# Bagging classifier on TF-IDF 1-3 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
from sklearn.ensemble import BaggingClassifier
# Seeded like the resamplers so the bootstrap draws, and thus the scores, are reproducible.
model = BaggingClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.46783 
f1 score: 0.45258 
f1 score: 0.46603 
f1 score: 0.44666 
f1 score: 0.31747 
f1 score: 0.40687 


In [0]:
# Bagging classifier on TF-IDF 1-4 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
from sklearn.ensemble import BaggingClassifier
# Seeded like the resamplers so the bootstrap draws, and thus the scores, are reproducible.
model = BaggingClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.45499 
f1 score: 0.45395 
f1 score: 0.42150 
f1 score: 0.44072 
f1 score: 0.32971 
f1 score: 0.43277 


### XGBoost

In [0]:
# XGBoost on TF-IDF 1-2 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1;
# the SMOTE run also prints its confusion matrix.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline,True)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.52877 
f1 score: 0.46107 
[[176  35   4]
 [164 315   3]
 [ 53  23   3]]
f1 score: 0.47935 
f1 score: 0.48941 
f1 score: 0.33321 
f1 score: 0.46486 


In [0]:
# XGBoost on TF-IDF 1-3 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.54019 
f1 score: 0.45629 
f1 score: 0.47819 
f1 score: 0.46204 
f1 score: 0.30129 
f1 score: 0.42724 


In [0]:
# XGBoost on TF-IDF 1-4 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.47970 
f1 score: 0.45687 
f1 score: 0.43192 
f1 score: 0.44285 
f1 score: 0.24370 
f1 score: 0.41442 


### GBM

In [0]:
# Gradient boosting on TF-IDF 1-2 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
# Seeded like the resamplers so the reported scores are reproducible.
model = GradientBoostingClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.55438 
f1 score: 0.52733 
f1 score: 0.47673 
f1 score: 0.47678 
f1 score: 0.41130 
f1 score: 0.44598 


In [0]:
# Gradient boosting on TF-IDF 1-3 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
# Seeded like the resamplers so the reported scores are reproducible.
model = GradientBoostingClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.55221 
f1 score: 0.53462 
f1 score: 0.46507 
f1 score: 0.48101 
f1 score: 0.34566 
f1 score: 0.44091 


In [0]:
# Gradient boosting on TF-IDF 1-4 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
# Seeded like the resamplers so the reported scores are reproducible.
model = GradientBoostingClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.48491 
f1 score: 0.49571 
f1 score: 0.41812 
f1 score: 0.40569 
f1 score: 0.29484 
f1 score: 0.40658 


### AdaBoost

In [0]:
# AdaBoost on TF-IDF 1-2 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
# Seeded like the resamplers so the reported scores are reproducible.
model = AdaBoostClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.48228 
f1 score: 0.46401 
f1 score: 0.43174 
f1 score: 0.40540 
f1 score: 0.35179 
f1 score: 0.41677 


In [0]:
# AdaBoost on TF-IDF 1-3 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
# Seeded like the resamplers so the reported scores are reproducible.
model = AdaBoostClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.48888 
f1 score: 0.49385 
f1 score: 0.42684 
f1 score: 0.44360 
f1 score: 0.37683 
f1 score: 0.41680 


In [0]:
# AdaBoost on TF-IDF 1-4 grams under six class-rebalancing strategies.
# lr_cv fits each pipeline on the training tweets and prints its dev-set macro F1.
# NOTE: all six pipelines share the same `tvec` and `model` objects.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
# Seeded like the resamplers so the reported scores are reproducible.
model = AdaBoostClassifier(random_state=777)
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
# NearMiss: `sampling_strategy` is the current name of the old `ratio` argument and
# `random_state` is no longer accepted (both removed in imbalanced-learn 0.6).
NM1_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(sampling_strategy='not minority', version = 3, n_neighbors_ver3=4),model)

lr_cv(ROS_pipeline)
lr_cv(SMOTE_pipeline)
lr_cv(RUS_pipeline)
lr_cv(NM1_pipeline)
lr_cv(NM2_pipeline)
lr_cv(NM3_pipeline)

f1 score: 0.44804 
f1 score: 0.45963 
f1 score: 0.39424 
f1 score: 0.40201 
f1 score: 0.29271 
f1 score: 0.40453 


### Ensemble

In [0]:
# Soft-voting ensemble (LogReg + gradient boosting + XGBoost) on TF-IDF 1-2 grams,
# evaluated with random over-sampling and with SMOTE.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
# Seeded like the resamplers so the reported scores are reproducible.
model2 = GradientBoostingClassifier(random_state=777)
model3 = XGBClassifier()
# create the ensemble model
# The second member is gradient boosting, so it is labelled 'gbm' (it used to be
# mislabelled 'adaboost').
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('gbm', model2), ('xgboost', model3)], voting='soft')

# Both pipelines share `tvec` and `eclf1`; each is fit and scored before the next is built.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(SMOTE_pipeline,True)


f1 score: 0.55503 
[[141  41  33]
 [ 91 373  18]
 [ 28  31  20]]


f1 score: 0.52389 
[[155  51   9]
 [120 356   6]
 [ 37  31  11]]


In [0]:
# Hard-voting ensemble (LogReg + gradient boosting + XGBoost) on TF-IDF 1-2 grams,
# evaluated with random over-sampling and with SMOTE.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
# Seeded like the resamplers so the reported scores are reproducible.
model2 = GradientBoostingClassifier(random_state=777)
model3 = XGBClassifier()
# create the ensemble model
# The second member is gradient boosting, so it is labelled 'gbm' (it used to be
# mislabelled 'adaboost').
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('gbm', model2), ('xgboost', model3)], voting='hard')

# Both pipelines share `tvec` and `eclf1`; each is fit and scored before the next is built.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(SMOTE_pipeline,True)


f1 score: 0.54503 
[[143  41  31]
 [103 343  36]
 [ 29  26  24]]


f1 score: 0.53550 
[[166  42   7]
 [125 353   4]
 [ 38  30  11]]


In [0]:
# Soft-voting ensemble (LogReg + AdaBoost + XGBoost) on TF-IDF 1-2 grams,
# evaluated with random over-sampling and with SMOTE.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
# Seeded like the resamplers so the reported scores are reproducible.
model2 = AdaBoostClassifier(random_state=777)
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

# Both pipelines share `tvec` and `eclf1`; each is fit and scored before the next is built.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(SMOTE_pipeline,True)


f1 score: 0.56126 
[[141  44  30]
 [ 95 373  14]
 [ 27  31  21]]


f1 score: 0.50848 
[[159  47   9]
 [124 354   4]
 [ 41  30   8]]


In [0]:
# Hard-voting ensemble (LogReg + AdaBoost + XGBoost) on TF-IDF 1-2 grams,
# evaluated with random over-sampling and with SMOTE.
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
# Seeded like the resamplers so the reported scores are reproducible.
model2 = AdaBoostClassifier(random_state=777)
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='hard')

# Both pipelines share `tvec` and `eclf1`; each is fit and scored before the next is built.
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(SMOTE_pipeline,True)


f1 score: 0.52483 
[[139  41  35]
 [103 336  43]
 [ 32  25  22]]


f1 score: 0.50136 
[[160  40  15]
 [128 346   8]
 [ 42  29   8]]
