In [0]:
%%capture
!pip install GoogleDriveDownloader
!pip install -U imbalanced-learn

In [0]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1DnWHFdfNYEpvegCNNPzCWpNXUuK9t7FC',     # Id of file to be downloaded
                                    dest_path='data/offenseval-training-v1.tsv')     # Destination path

Downloading 1DnWHFdfNYEpvegCNNPzCWpNXUuK9t7FC into data/offenseval-training-v1.tsv... Done.


In [0]:
import pandas as pd
df = pd.read_table("data/offenseval-training-v1.tsv",sep='\t',index_col=False, error_bad_lines=False) 

In [0]:
df.head(10)

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
5,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH
6,77444,@USER @USER Oh noes! Tough shit.,OFF,UNT,
7,52415,@USER was literally just talking about this lo...,OFF,TIN,GRP
8,45157,@USER Buy more icecream!!!,NOT,,
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF,TIN,IND


In [0]:
len(df.subtask_a.unique())

2

In [0]:
len(df.subtask_b.unique())

3

In [0]:
len(df.subtask_c.unique())

4

### Machine Learning Models

In [0]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
  
pd.options.mode.chained_assignment = None

In [0]:
def lemmatize_sentences(sentence):
    tokens = sentence.split()
    lm_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lm_tokens)

In [0]:
def clean_text(content):
  content=content.str.lower()                                                         # Convert to Lowercase
  content=content.str.replace('@USER','')                                     # Remove triggerword tags
  content=content.str.replace('username','')                                          # Remove username tags
  content=content.str.replace('http\S+|www.\S+', '')                                  # Remove Links
  content=content.str.replace('\s+', ' ')                                             # Remove multiple spaces
  content=content.str.replace('[^A-Za-z\s]+', '')                                     # Remove irrelevant characters other than alphabets and space
  #content=content.apply(lemmatize_sentences)
  return content

In [0]:
df['tweet'] = clean_text(df['tweet'])

In [0]:
df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [0]:
df.subtask_b.value_counts()

TIN    3876
UNT     524
Name: subtask_b, dtype: int64

In [0]:
df.subtask_c.value_counts()

IND    2407
GRP    1074
OTH     395
Name: subtask_c, dtype: int64

In [0]:
df = df.loc[df['subtask_a'] == 'OFF']
df = df.drop('subtask_a',axis=1)
df.head(10)

Unnamed: 0,id,tweet,subtask_b,subtask_c
0,86426,user she should ask a few native americans wha...,UNT,
1,90194,user user go home youre drunk user maga trump ...,TIN,IND
3,62688,user someone shouldvetaken this piece of shit ...,UNT,
5,97670,user liberals are all kookoo,TIN,OTH
6,77444,user user oh noes tough shit,UNT,
7,52415,user was literally just talking about this lol...,TIN,GRP
9,13384,user canada doesnt need another cuck we alread...,TIN,IND
12,28414,user you are a lying corrupt traitor nobody wa...,TIN,IND
19,28195,user user user gun control that is all these k...,TIN,OTH
20,56117,user user user user lol throwing the bullshit ...,TIN,IND


In [0]:
df.isna().sum()

id             0
tweet          0
subtask_b      0
subtask_c    524
dtype: int64

## Subtask B

In [0]:
taskb_data = df[['id','tweet','subtask_b']]

In [0]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(taskb_data, test_size=0.2, random_state=0, stratify=taskb_data['subtask_b'])

In [0]:
y_train1=X_train.subtask_b
y_test1=X_test.subtask_b
X_train = X_train.drop(['subtask_b'],axis=1)
X_test = X_test.drop(['subtask_b'],axis=1)

# Label encode the target variable of train & dev data
le1=LabelEncoder()
y_train1=le1.fit_transform(y_train1)
y_test1=le1.transform(y_test1)

In [0]:
y_pred=np.zeros(len(y_test1))
macro_f1= f1_score(y_pred,y_test1, average='macro')
print(macro_f1)

0.46827794561933533


In [0]:
def subtask_b_metrics(model):
    y_pred = model.predict(X_test.tweet)
    macro_f1= f1_score(y_test1, y_pred, average='macro')
    print(macro_f1)

In [0]:
# Machine Learning
# Training Naive Bayes (NB) classifier + BOW model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(nb_clf)

0.46827794561933533


In [0]:
# Training Linear SVM classifier + BOW model

from sklearn.svm import LinearSVC

svc_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', LinearSVC())])
svc_clf = svc_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(svc_clf)

0.5443974809458335


In [0]:
# Training Logisitic Regression(LogReg) classifier + BOW model 

from sklearn.linear_model import LogisticRegression 

lg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),  ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'))])
lg_clf = lg_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(lg_clf)

0.5547350427350428


In [0]:
# Training SVM + BOW + TF-IDF model

svm_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', LinearSVC())])
svm_clf = svm_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(svm_clf)

0.52848395940301


In [0]:
# Training NB + BOW + TF-IDF model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(nb_clf)

0.46827794561933533


In [0]:
# Training LogReg + BOW + TF-IDF model

lg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'))])
lg_clf = lg_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(lg_clf)

0.5485949045939987


In [0]:
# Training Ensemble model

from sklearn.ensemble import VotingClassifier
model1 = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
model2 = MultinomialNB()
model3 = LinearSVC()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('lr', model1), ('nb', model2), ('svm', model3)], voting='hard')
ensemble = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', eclf1)])
ensemble = ensemble.fit(X_train.tweet, y_train1)
subtask_b_metrics(ensemble)

0.52848395940301


In [0]:
# Training XGBoost + BOW + TF-IDF model
from xgboost import XGBClassifier

xg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', XGBClassifier())])
xg_clf = xg_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(xg_clf)

0.5495495495495496


In [0]:
from sklearn.ensemble import RandomForestClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(rg_clf)

0.5420244600572469


In [0]:
from sklearn.ensemble import AdaBoostClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', AdaBoostClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(rg_clf)

0.5161003380181463


In [0]:
from sklearn.ensemble import GradientBoostingClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', GradientBoostingClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train1)
subtask_b_metrics(rg_clf)

0.48971102113261966


### Sampling Techniques

In [0]:
from sklearn.model_selection import StratifiedKFold
def lr_cv(X, Y, pipeline,print_conf = False):
    
    X_train,X_test,y_train,y_test = train_test_split(X,Y, test_size=0.2, random_state=0, stratify=Y)
    lr_fit = pipeline.fit(X_train, y_train)
    prediction = lr_fit.predict(X_test)
    scores = lr_fit.score(X_test,y_test)
    f1=f1_score(y_test, prediction, average='macro')
    print("f1 score: {:.5f} ".format(f1))
    if(print_conf):
      print(confusion_matrix(y_test,prediction))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

### Logistic Regression

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.59444 
f1 score: 0.58289 
f1 score: 0.55445 
f1 score: 0.29314 
f1 score: 0.52075 
f1 score: 0.48694 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.60165 
f1 score: 0.59121 
f1 score: 0.56255 
f1 score: 0.27813 
f1 score: 0.52208 
f1 score: 0.49444 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.60694 
f1 score: 0.59943 
f1 score: 0.55724 
f1 score: 0.27284 
f1 score: 0.52035 
f1 score: 0.50338 


### SVM

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.57536 
f1 score: 0.57622 
f1 score: 0.52600 
f1 score: 0.36979 
f1 score: 0.53019 
f1 score: 0.48024 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.56754 
f1 score: 0.56336 
f1 score: 0.53835 
f1 score: 0.39920 
f1 score: 0.52530 
f1 score: 0.49158 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.57429 
f1 score: 0.56033 
f1 score: 0.54110 
f1 score: 0.35928 
f1 score: 0.51351 
f1 score: 0.50277 


### Bagging Classifier

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.58216 
f1 score: 0.57889 
f1 score: 0.51831 
f1 score: 0.31459 
f1 score: 0.46212 
f1 score: 0.45212 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.60165 
f1 score: 0.58415 
f1 score: 0.50990 
f1 score: 0.38452 
f1 score: 0.41564 
f1 score: 0.45080 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.57498 
f1 score: 0.58834 
f1 score: 0.51514 
f1 score: 0.38367 
f1 score: 0.43206 
f1 score: 0.43474 


### XGBoost

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline,True)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.56705 
f1 score: 0.61248 
[[742  33]
 [ 81  24]]
f1 score: 0.49739 
f1 score: 0.32533 
f1 score: 0.43760 
f1 score: 0.46158 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.57451 
f1 score: 0.60091 
f1 score: 0.48008 
f1 score: 0.33862 
f1 score: 0.26333 
f1 score: 0.45082 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.56831 
f1 score: 0.60227 
f1 score: 0.49423 
f1 score: 0.35569 
f1 score: 0.28028 
f1 score: 0.41711 


### GBM

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.57174 
f1 score: 0.58923 
f1 score: 0.51335 
f1 score: 0.31639 
f1 score: 0.44710 
f1 score: 0.43822 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.58618 
f1 score: 0.58705 
f1 score: 0.50043 
f1 score: 0.34052 
f1 score: 0.39805 
f1 score: 0.42305 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.59034 
f1 score: 0.59444 
f1 score: 0.51359 
f1 score: 0.35398 
f1 score: 0.38375 
f1 score: 0.42331 


### AdaBoost

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.54144 
f1 score: 0.57612 
f1 score: 0.47714 
f1 score: 0.38036 
f1 score: 0.46364 
f1 score: 0.44813 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.55162 
f1 score: 0.58783 
f1 score: 0.48627 
f1 score: 0.35225 
f1 score: 0.41325 
f1 score: 0.41706 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_b, ROS_pipeline)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_b, RUS_pipeline)
lr_cv(df.tweet, df.subtask_b, NM1_pipeline)
lr_cv(df.tweet, df.subtask_b, NM2_pipeline)
lr_cv(df.tweet, df.subtask_b, NM3_pipeline)

f1 score: 0.54064 
f1 score: 0.57394 
f1 score: 0.48128 
f1 score: 0.37718 
f1 score: 0.41228 
f1 score: 0.42864 


### Ensemble

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_b, ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_b, SMOTE_pipeline,True)


f1 score: 0.61231 
[[693  82]
 [ 69  36]]
f1 score: 0.58439 
[[727  48]
 [ 83  22]]


In [0]:
X_train,X_test,y_train,y_test = train_test_split(df.tweet,df.subtask_b, test_size=0.2, random_state=0, stratify=df.subtask_b)

le1=LabelEncoder()
y_train=le1.fit_transform(y_train)
y_test=le1.transform(y_test)

def get_prob(pipeline,print_conf=False):
  lr_fit = pipeline.fit(X_train, y_train)
  prob = lr_fit.predict_proba(X_test)
  return prob

In [0]:
# Best models

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = XGBClassifier()
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
prob1= get_prob(SMOTE_pipeline)
# 0.60227

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = XGBClassifier()
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
prob2= get_prob(SMOTE_pipeline)
#0.60091

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
prob3= get_prob(SMOTE_pipeline)
#0.61248

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
prob4= get_prob(ROS_pipeline)
#0.60165

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
prob5= get_prob(ROS_pipeline)
#0.60694

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
prob6= get_prob(ROS_pipeline)
#0.60165

In [0]:
final_probs = (prob1+prob5+prob4)/3
final_preds=np.argmax(final_probs,axis=1)
f1_score(final_preds,y_test,average='macro')

0.6077939804175545

In [0]:
confusion_matrix(final_preds,y_test)

array([[725,  78],
       [ 50,  27]])

In [0]:
from google.colab import files
files.upload()

Saving testset-taskb.tsv to testset-taskb.tsv




In [0]:
test = pd.read_table("testset-taskb.tsv",sep='\t',index_col=False, error_bad_lines=False)

In [0]:
test['tweet'] = clean_text(test['tweet'])

In [0]:
def get_prob(pipeline,print_conf=False):
  lr_fit = pipeline.fit(X_train, y_train)
  prob = lr_fit.predict_proba(X_test)
  return prob

In [0]:
unique, counts = np.unique(prediction, return_counts=True)

print(np.asarray((unique, counts)).T)

[['TIN' 231]
 ['UNT' 9]]


In [0]:
results = pd.DataFrame({'id':test.id, 'subtask_b':prediction})
results.to_csv('submission1.csv',sep=',',header=False,index=False)

In [0]:
from google.colab import files
files.download('submission1.csv')

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_fit = ROS_pipeline.fit(df.tweet, df.subtask_b)
prediction = lr_fit.predict(test.tweet)

In [0]:
unique, counts = np.unique(prediction, return_counts=True)

print(np.asarray((unique, counts)).T)

[['TIN' 209]
 ['UNT' 31]]


In [0]:
results = pd.DataFrame({'id':test.id, 'subtask_b':prediction})
results.to_csv('submission2.csv',sep=',',header=False,index=False)
files.download('submission2.csv')