In [0]:
import pandas as pd
X_train = pd.read_table("new_train_data.tsv",sep='\t',index_col=False, error_bad_lines=False) 
X_test = pd.read_table("new_dev_data.tsv",sep='\t',index_col=False, error_bad_lines=False) 

In [0]:
X_train.head(10)

Unnamed: 0,tweet,subtask_b
0,anyone else think she is a lying opportunist ...,TIN
1,i love women like you we believe all women ex...,TIN
2,its a disgrace and are in danger of imploding...,TIN
3,but i thought california had gun control,TIN
4,mon bye bitches suffisait,TIN
5,you mean like all the liberals screaming at a...,TIN
6,and not all idiots grandstands like he did,TIN
7,conservatives the trailer trash party of canada,TIN
8,says the bitch ass boy hiding behind a keyboa...,TIN
9,bad gun control seems like missed his brains ...,TIN


### Machine Learning Models

In [0]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
  
pd.options.mode.chained_assignment = None

In [0]:
def lemmatize_sentences(sentence):
    tokens = sentence.split()
    lm_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lm_tokens)

In [0]:
def clean_text(content):
  content=content.str.lower()                                                         # Convert to Lowercase
  content=content.str.replace('@USER','')                                     # Remove triggerword tags
  content=content.str.replace('username','')                                          # Remove username tags
  content=content.str.replace('http\S+|www.\S+', '')                                  # Remove Links
  content=content.str.replace('\s+', ' ')                                             # Remove multiple spaces
  content=content.str.replace('[^A-Za-z\s]+', '')                                     # Remove irrelevant characters other than alphabets and space
  #content=content.apply(lemmatize_sentences)
  return content

In [0]:
X_train['tweet'] = clean_text(X_train['tweet'])
X_test['tweet'] = clean_text(X_test['tweet'])

In [0]:
X_train.isna().sum()

tweet        0
subtask_b    0
dtype: int64

## Subtask B

In [0]:
y_train=X_train.subtask_b
y_test=X_test.subtask_b
X_train = X_train.drop(['subtask_b'],axis=1)
X_test = X_test.drop(['subtask_b'],axis=1)

# Label encode the target variable of train & dev data
le1=LabelEncoder()
y_train=le1.fit_transform(y_train)
y_test=le1.transform(y_test)

In [0]:
from sklearn.metrics import f1_score
y_pred=np.zeros(len(y_test))
macro_f1= f1_score(y_pred,y_test, average='macro')
print(macro_f1)

0.46827794561933533


In [0]:
def subtask_b_metrics(model):
    y_pred = model.predict(X_test.tweet)
    macro_f1= f1_score(y_test, y_pred, average='macro')
    print(macro_f1)

In [0]:
# Machine Learning
# Training Naive Bayes (NB) classifier + BOW model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(nb_clf)

0.6083120748299319


In [0]:
# Training Linear SVM classifier + BOW model

from sklearn.svm import LinearSVC

svc_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', LinearSVC())])
svc_clf = svc_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(svc_clf)

0.5671616201646473


In [0]:
# Training Logisitic Regression(LogReg) classifier + BOW model 

from sklearn.linear_model import LogisticRegression 

lg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),  ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'))])
lg_clf = lg_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(lg_clf)

0.5722660706078412


In [0]:
# Training SVM + BOW + TF-IDF model

svm_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', LinearSVC())])
svm_clf = svm_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(svm_clf)

0.5779781315940917


In [0]:
# Training NB + BOW + TF-IDF model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(nb_clf)

0.38813648755606367


In [0]:
# Training LogReg + BOW + TF-IDF model

lg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'))])
lg_clf = lg_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(lg_clf)

0.5648882090267154


In [0]:
# Training Ensemble model

from sklearn.ensemble import VotingClassifier
model1 = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
model2 = MultinomialNB()
model3 = LinearSVC()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('lr', model1), ('nb', model2), ('svm', model3)], voting='hard')
ensemble = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', eclf1)])
ensemble = ensemble.fit(X_train.tweet, y_train)
subtask_b_metrics(ensemble)

0.580958715665674


In [0]:
# Training XGBoost + BOW + TF-IDF model
from xgboost import XGBClassifier

xg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', XGBClassifier())])
xg_clf = xg_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(xg_clf)

0.4000860532300695


In [0]:
from sklearn.ensemble import RandomForestClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(rg_clf)

0.5623150198836201


In [0]:
from sklearn.ensemble import AdaBoostClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', AdaBoostClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(rg_clf)

0.5013188874122148


In [0]:
from sklearn.ensemble import GradientBoostingClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', GradientBoostingClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train)
subtask_b_metrics(rg_clf)

0.4952866953925793


### Sampling Techniques

In [0]:
from sklearn.model_selection import StratifiedKFold
def lr_cv(pipeline,print_conf = False):
    
    lr_fit = pipeline.fit(X_train.tweet, y_train)
    prediction = lr_fit.predict(X_test.tweet)
    scores = lr_fit.score(X_test.tweet,y_test)
    f1=f1_score(y_test, prediction, average='macro')
    print("f1 score: {:.5f} ".format(f1))
    if(print_conf):
      print(confusion_matrix(y_test,prediction))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

### Logistic Regression

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.59832 
f1 score: 0.59879 
f1 score: 0.58909 
f1 score: 0.60165 
f1 score: 0.59156 
f1 score: 0.57817 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.59859 
f1 score: 0.58979 
f1 score: 0.59089 
f1 score: 0.57780 
f1 score: 0.59201 
f1 score: 0.56766 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.59426 
f1 score: 0.58651 
f1 score: 0.59975 
f1 score: 0.58435 
f1 score: 0.58203 


f1 score: 0.59073 


### SVM

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.58096 
f1 score: 0.57534 
f1 score: 0.59888 
f1 score: 0.59556 
f1 score: 0.57691 
f1 score: 0.53915 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.57198 
f1 score: 0.57198 
f1 score: 0.56591 
f1 score: 0.58347 
f1 score: 0.54907 
f1 score: 0.53608 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.58015 
f1 score: 0.55168 
f1 score: 0.58347 
f1 score: 0.58840 
f1 score: 0.57427 


f1 score: 0.58280 


### Bagging Classifier

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.56710 
f1 score: 0.54953 
f1 score: 0.58465 
f1 score: 0.54751 
f1 score: 0.57063 
f1 score: 0.55035 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.55528 
f1 score: 0.54716 
f1 score: 0.56831 
f1 score: 0.52791 
f1 score: 0.57036 
f1 score: 0.54110 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.58412 
f1 score: 0.55662 
f1 score: 0.58133 
f1 score: 0.57211 
f1 score: 0.58200 


f1 score: 0.59481 


### XGBoost

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.51661 
f1 score: 0.49714 
f1 score: 0.51241 
f1 score: 0.52304 
f1 score: 0.51885 
f1 score: 0.52845 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.49092 
f1 score: 0.44136 
f1 score: 0.48054 
f1 score: 0.50127 
f1 score: 0.47749 
f1 score: 0.48757 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.51415 
f1 score: 0.45424 
f1 score: 0.51144 
f1 score: 0.50294 
f1 score: 0.47864 


f1 score: 0.51006 


### GBM

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.54464 
f1 score: 0.52457 
f1 score: 0.52974 
f1 score: 0.52368 
f1 score: 0.54863 
f1 score: 0.53860 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

f1 score: 0.52940 
f1 score: 0.50184 
f1 score: 0.53416 
f1 score: 0.53497 
f1 score: 0.52148 
f1 score: 0.52306 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

### AdaBoost

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv( ROS_pipeline)
lr_cv( SMOTE_pipeline)
lr_cv( RUS_pipeline)
lr_cv( NM1_pipeline)
lr_cv( NM2_pipeline)
lr_cv( NM3_pipeline)

### Ensemble

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv( ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv( SMOTE_pipeline,True)


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='hard')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv( ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv( SMOTE_pipeline,True)


### Get valid predictions from best model

In [0]:
# Training NB + BOW + TF-IDF model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train)
y_pred = nb_clf.predict(X_test.tweet)
print(confusion_matrix(y_pred,y_test))

[[717  76]
 [ 58  29]]


In [0]:
f1_score(y_pred,y_test,average="macro")

0.6083120748299319

In [0]:
unt_prob = nb_clf.predict_proba(X_test.tweet)

### Get best model for positive label 

In [0]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1DnWHFdfNYEpvegCNNPzCWpNXUuK9t7FC',     # Id of file to be downloaded
                                    dest_path='data/offenseval-training-v1.tsv')     # Destination path
df2 = pd.read_table("data/offenseval-training-v1.tsv",sep='\t',index_col=False, error_bad_lines=False) 
df2['tweet'] = clean_text(df2['tweet'])
df2 = df2.loc[df2['subtask_a'] == 'OFF']
df2 = df2.drop(['subtask_a','subtask_c'],axis=1)

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df2, test_size=0.2, random_state=0, stratify=df2['subtask_b'])

# Label encode the target variable of train & dev data
le1=LabelEncoder()
train.subtask_b=le1.fit_transform(train.subtask_b)
valid.subtask_b=le1.transform(valid.subtask_b)

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)

lr_fit = SMOTE_pipeline.fit(train.tweet, train.subtask_b)
prediction = lr_fit.predict(valid.tweet)
confusion_matrix(prediction,valid.subtask_b)

array([[742,  81],
       [ 33,  24]])

In [0]:
f1_score(prediction,y_test,average="macro")

0.6124785611644185

In [0]:
tin_prob = lr_fit.predict_proba(valid.tweet)

### Combine the best models for both labels

In [0]:
final_prob = 0.5*(tin_prob+unt_prob)
final_preds = np.argmax(final_prob,axis=1)

In [0]:
confusion_matrix(final_preds,valid.subtask_b)

array([[729,  77],
       [ 46,  28]])

In [0]:
f1_score(final_preds,y_test,average="macro")

0.6175251502655488

In [0]:
# Combine y_pred and prediction to get best model with F1 score
final_preds=[]

for i in range(0,len(y_pred)):
    if(tin_prob[i,0] > unt_prob[i,1]):
      final_preds.append(0)
    else:
      final_preds.append(1)

In [0]:
f1_score(final_preds,y_test,average="macro")

0.6175251502655488

In [0]:
tin_prob

array([[0.82634   , 0.17365998],
       [0.6176636 , 0.38233638],
       [0.88697934, 0.11302069],
       ...,
       [0.67040837, 0.32959163],
       [0.7402159 , 0.25978407],
       [0.5558852 , 0.44411483]], dtype=float32)

In [0]:
unt_prob

array([[0.47060146, 0.52939854],
       [0.62479884, 0.37520116],
       [0.55012653, 0.44987347],
       ...,
       [0.48304247, 0.51695753],
       [0.47390017, 0.52609983],
       [0.15655667, 0.84344333]])

### Get prediction

In [0]:
from google.colab import files
files.upload()

Saving testset-taskb.tsv to testset-taskb.tsv




In [0]:
test = pd.read_table("testset-taskb.tsv",sep='\t',index_col=False, error_bad_lines=False)
test['tweet'] = clean_text(test['tweet'])

In [0]:
le1=LabelEncoder()
df2.subtask_b=le1.fit_transform(df2.subtask_b)

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)

lr_fit = SMOTE_pipeline.fit(df2.tweet, df2.subtask_b)
prediction1 = lr_fit.predict_proba(test.tweet)

In [0]:
X_train = pd.read_table("new_train_data.tsv",sep='\t',index_col=False, error_bad_lines=False) 
X_test = pd.read_table("new_dev_data.tsv",sep='\t',index_col=False, error_bad_lines=False) 
X_train['tweet'] = clean_text(X_train['tweet'])
X_test['tweet'] = clean_text(X_test['tweet'])
le1=LabelEncoder()
X_train.subtask_b=le1.fit_transform(X_train.subtask_b)
X_test.subtask_b=le1.transform(X_test.subtask_b)

new_train = pd.concat([X_train,X_test])

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(new_train.tweet,new_train.subtask_b)
prediction2 = nb_clf.predict_proba(test.tweet)

In [0]:
final_prob = 0.5*(prediction1+prediction2)
final_preds = np.argmax(final_prob,axis=1)

In [0]:
unique, counts = np.unique(final_preds, return_counts=True)

print(np.asarray((unique, counts)).T)

[[  0 228]
 [  1  12]]


In [0]:
results = pd.DataFrame({'id':test.id, 'subtask_b':final_preds})
results.to_csv('submission3.csv',sep=',',header=False,index=False)
files.download('submission3.csv')