In [0]:
%%capture
!pip install GoogleDriveDownloader
!pip install -U imbalanced-learn

In [0]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1DnWHFdfNYEpvegCNNPzCWpNXUuK9t7FC',     # Id of file to be downloaded
                                    dest_path='data/offenseval-training-v1.tsv')     # Destination path

In [0]:
import pandas as pd
df = pd.read_table("data/offenseval-training-v1.tsv",sep='\t',index_col=False, error_bad_lines=False) 

In [0]:
df.head(10)

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
5,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH
6,77444,@USER @USER Oh noes! Tough shit.,OFF,UNT,
7,52415,@USER was literally just talking about this lo...,OFF,TIN,GRP
8,45157,@USER Buy more icecream!!!,NOT,,
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF,TIN,IND


In [0]:
len(df.subtask_a.unique())

2

In [0]:
len(df.subtask_c.unique())

4

In [0]:
len(df.subtask_c.unique())

4

### Machine Learning Models

In [0]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
  
pd.options.mode.chained_assignment = None

In [0]:
def lemmatize_sentences(sentence):
    tokens = sentence.split()
    lm_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lm_tokens)

In [0]:
def clean_text(content):
  content=content.str.lower()                                                         # Convert to Lowercase
  content=content.str.replace('@USER','')                                     # Remove triggerword tags
  content=content.str.replace('username','')                                          # Remove username tags
  content=content.str.replace('http\S+|www.\S+', '')                                  # Remove Links
  content=content.str.replace('\s+', ' ')                                             # Remove multiple spaces
  content=content.str.replace('[^A-Za-z\s]+', '')                                     # Remove irrelevant characters other than alphabets and space
  #content=content.apply(lemmatize_sentences)
  return content

In [0]:
df['tweet'] = clean_text(df['tweet'])

In [0]:
df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [0]:
df.subtask_c.value_counts()

IND    2407
GRP    1074
OTH     395
Name: subtask_c, dtype: int64

In [0]:
df = df.loc[df['subtask_a'] == 'OFF']
df = df.loc[df['subtask_b'] == 'TIN']
df = df.drop(['subtask_a','subtask_b'],axis=1)
df.head(10)

Unnamed: 0,id,tweet,subtask_c
1,90194,user user go home youre drunk user maga trump ...,IND
5,97670,user liberals are all kookoo,OTH
7,52415,user was literally just talking about this lol...,GRP
9,13384,user canada doesnt need another cuck we alread...,IND
12,28414,user you are a lying corrupt traitor nobody wa...,IND
19,28195,user user user gun control that is all these k...,OTH
20,56117,user user user user lol throwing the bullshit ...,IND
22,12681,user user kind of like when conservatives wann...,GRP
23,82904,user user da fuck is going on people theres th...,GRP
25,77665,user tbh these days i just dont like people in...,IND


In [0]:
df.isna().sum()

id           0
tweet        0
subtask_c    0
dtype: int64

## Subtask B

In [0]:
taskc_data = df[['tweet','subtask_c']]

In [0]:
df.head(10)

Unnamed: 0,id,tweet,subtask_c
1,90194,user user go home youre drunk user maga trump ...,IND
5,97670,user liberals are all kookoo,OTH
7,52415,user was literally just talking about this lol...,GRP
9,13384,user canada doesnt need another cuck we alread...,IND
12,28414,user you are a lying corrupt traitor nobody wa...,IND
19,28195,user user user gun control that is all these k...,OTH
20,56117,user user user user lol throwing the bullshit ...,IND
22,12681,user user kind of like when conservatives wann...,GRP
23,82904,user user da fuck is going on people theres th...,GRP
25,77665,user tbh these days i just dont like people in...,IND


In [0]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(taskc_data, test_size=0.2, random_state=0, stratify=taskc_data['subtask_c'])

In [0]:
X_train.subtask_c.value_counts()

IND    1925
GRP     859
OTH     316
Name: subtask_c, dtype: int64

In [0]:
X_test.subtask_c.value_counts()

IND    482
GRP    215
OTH     79
Name: subtask_c, dtype: int64

In [0]:
y_train1=X_train.subtask_c
y_test1=X_test.subtask_c
X_train = X_train.drop(['subtask_c'],axis=1)
X_test = X_test.drop(['subtask_c'],axis=1)

# Label encode the target variable of train & dev data
le1=LabelEncoder()
y_train1=le1.fit_transform(y_train1)
y_test1=le1.transform(y_test1)

In [0]:
X_test.shape

(776, 1)

In [0]:
y_pred=np.ones(len(y_test1))
macro_f1= f1_score(y_pred,y_test1, average='macro')
print(macro_f1)

0.25543190249072606


  'recall', 'true', average, warn_for)


In [0]:
def subtask_c_metrics(model):
    y_pred = model.predict(X_test.tweet)
    macro_f1= f1_score(y_test1, y_pred, average='macro')
    print(macro_f1)

In [0]:
# Machine Learning
# Training Naive Bayes (NB) classifier + BOW model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(nb_clf)

0.42530956720650526


In [0]:
# Training Linear SVM classifier + BOW model

from sklearn.svm import LinearSVC

svc_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),  ('clf', LinearSVC())])
svc_clf = svc_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(svc_clf)

0.4858685498568707




In [0]:
# Training Logisitic Regression(LogReg) classifier + BOW model 

from sklearn.linear_model import LogisticRegression 

lg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),  ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'))])
lg_clf = lg_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(lg_clf)

0.48152401524635363




In [0]:
# Training SVM + BOW + TF-IDF model

svm_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', LinearSVC())])
svm_clf = svm_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(svm_clf)

0.5004648881778463


In [0]:
# Training NB + BOW + TF-IDF model

nb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
nb_clf = nb_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(nb_clf)

0.26215870463101726


  'precision', 'predicted', average, warn_for)


In [0]:
# Training LogReg + BOW + TF-IDF model

lg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'))])
lg_clf = lg_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(lg_clf)

0.5158238686615384




In [0]:
# Training Ensemble model

from sklearn.ensemble import VotingClassifier
model1 = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
model2 = MultinomialNB()
model3 = LinearSVC()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('lr', model1), ('nb', model2), ('svm', model3)], voting='hard')
ensemble = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', eclf1)])
ensemble = ensemble.fit(X_train.tweet, y_train1)
subtask_c_metrics(ensemble)



0.5016922996673492


In [0]:
# Training XGBoost + BOW + TF-IDF model
from xgboost import XGBClassifier

xg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4))), ('tfidf', TfidfTransformer()), ('clf', XGBClassifier())])
xg_clf = xg_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(xg_clf)

0.45491585708977017


In [0]:
from sklearn.ensemble import RandomForestClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(rg_clf)

0.4376947692764599


In [0]:
from sklearn.ensemble import AdaBoostClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', AdaBoostClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(rg_clf)

0.4964762935080742


In [0]:
from sklearn.ensemble import GradientBoostingClassifier

rg_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer()), ('clf', GradientBoostingClassifier())])
rg_clf = rg_clf.fit(X_train.tweet,y_train1)
subtask_c_metrics(rg_clf)

0.5034021520416911


### Sampling Techniques

In [0]:
from sklearn.model_selection import StratifiedKFold
def lr_cv(X, Y, pipeline,print_conf = False):
    
    X_train,X_test,y_train,y_test = train_test_split(X,Y, test_size=0.2, random_state=0, stratify=Y)
    lr_fit = pipeline.fit(X_train, y_train)
    prediction = lr_fit.predict(X_test)
    scores = lr_fit.score(X_test,y_test)
    f1=f1_score(y_test, prediction, average='macro')
    print("f1 score: {:.5f} ".format(f1))
    if(print_conf):
      print(confusion_matrix(y_test,prediction))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

### Logistic Regression

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.55392 


f1 score: 0.53837 


f1 score: 0.51381 


f1 score: 0.40173 


f1 score: 0.48344 


f1 score: 0.45878 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.54876 


f1 score: 0.53942 


f1 score: 0.51042 


f1 score: 0.38284 


f1 score: 0.47305 


f1 score: 0.44768 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LogisticRegression(solver='lbfgs')
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.53384 


f1 score: 0.53552 


f1 score: 0.52193 


f1 score: 0.39922 


f1 score: 0.46909 


f1 score: 0.45647 


### SVM

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.52890 
f1 score: 0.54475 
f1 score: 0.50122 
f1 score: 0.44525 
f1 score: 0.48035 
f1 score: 0.46342 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.52945 
f1 score: 0.52752 
f1 score: 0.51000 
f1 score: 0.46459 
f1 score: 0.47302 
f1 score: 0.45708 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = LinearSVC()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.50885 
f1 score: 0.51795 
f1 score: 0.50483 
f1 score: 0.45530 
f1 score: 0.45275 
f1 score: 0.45226 


### Bagging Classifier

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.47456 
f1 score: 0.47139 
f1 score: 0.47980 
f1 score: 0.37068 
f1 score: 0.44108 
f1 score: 0.37649 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.47760 
f1 score: 0.44967 
f1 score: 0.47946 
f1 score: 0.41221 
f1 score: 0.37949 
f1 score: 0.41435 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.51057 
f1 score: 0.44589 
f1 score: 0.46560 
f1 score: 0.42883 
f1 score: 0.34584 
f1 score: 0.40502 


### XGBoost

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline,True)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.53939 
f1 score: 0.46875 
[[ 87 121   7]
 [ 36 445   1]
 [ 14  60   5]]
f1 score: 0.50467 
f1 score: 0.37483 
f1 score: 0.34249 
f1 score: 0.41976 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.53231 
f1 score: 0.44266 
f1 score: 0.49000 
f1 score: 0.43767 
f1 score: 0.22345 
f1 score: 0.44465 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = XGBClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.53682 
f1 score: 0.44925 
f1 score: 0.50211 
f1 score: 0.48912 
f1 score: 0.22641 
f1 score: 0.42321 


### GBM

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.54684 
f1 score: 0.53032 
f1 score: 0.50494 
f1 score: 0.36906 
f1 score: 0.40544 
f1 score: 0.41320 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.53496 
f1 score: 0.54003 
f1 score: 0.48922 
f1 score: 0.41634 
f1 score: 0.37076 
f1 score: 0.42415 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = GradientBoostingClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.53222 
f1 score: 0.49646 
f1 score: 0.48432 
f1 score: 0.46353 
f1 score: 0.36568 
f1 score: 0.42301 


### AdaBoost

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.50633 
f1 score: 0.51674 
f1 score: 0.45686 
f1 score: 0.36441 
f1 score: 0.37868 
f1 score: 0.38526 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.52000 
f1 score: 0.50847 
f1 score: 0.45573 
f1 score: 0.40233 
f1 score: 0.34502 
f1 score: 0.42193 


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 4))
model = AdaBoostClassifier()
ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),model)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),model)
RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777),model)
NM1_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 1),model)
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
NM3_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 3, n_neighbors_ver3=4),model)

lr_cv(df.tweet, df.subtask_c, ROS_pipeline)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline)
lr_cv(df.tweet, df.subtask_c, RUS_pipeline)
lr_cv(df.tweet, df.subtask_c, NM1_pipeline)
lr_cv(df.tweet, df.subtask_c, NM2_pipeline)
lr_cv(df.tweet, df.subtask_c, NM3_pipeline)

f1 score: 0.51575 
f1 score: 0.50697 
f1 score: 0.45004 
f1 score: 0.42463 
f1 score: 0.40033 
f1 score: 0.39878 


### Ensemble

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = GradientBoostingClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline,True)


f1 score: 0.55828 
[[130  49  36]
 [ 67 391  24]
 [ 28  30  21]]


f1 score: 0.54885 
[[123  79  13]
 [ 53 420   9]
 [ 17  49  13]]


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = GradientBoostingClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='hard')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline,True)


f1 score: 0.56159 
[[131  47  37]
 [ 67 374  41]
 [ 27  26  26]]


f1 score: 0.52676 
[[122  83  10]
 [ 52 422   8]
 [ 20  50   9]]


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline,True)


f1 score: 0.56822 
[[135  48  32]
 [ 68 393  21]
 [ 28  30  21]]


f1 score: 0.53397 
[[124  81  10]
 [ 48 427   7]
 [ 20  50   9]]


In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='hard')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, ROS_pipeline,True)

SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777),eclf1)
lr_cv(df.tweet, df.subtask_c, SMOTE_pipeline,True)


f1 score: 0.54633 
[[130  45  40]
 [ 73 379  30]
 [ 30  28  21]]


f1 score: 0.52545 
[[129  75  11]
 [ 59 418   5]
 [ 26  45   8]]


In [0]:
from google.colab import files
files.upload()

Saving test_set_taskc.tsv to test_set_taskc.tsv




In [0]:
test_data = pd.read_csv('test_set_taskc.tsv',sep='\t')
test_data.tweet=clean_text(test_data.tweet)

In [0]:
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)

lr_fit = pipeline.fit(df.tweet,df.subtask_c)
prediction = lr_fit.predict(test_data.tweet)

In [0]:
unique, counts = np.unique(prediction, return_counts=True)

print(np.asarray((unique, counts)).T)

[['GRP' 91]
 ['IND' 107]
 ['OTH' 15]]


In [0]:
results = pd.DataFrame({'id':test_data.id, 'subtask_b':prediction})
results.to_csv('submission1.csv',sep=',',header=False,index=False)
files.download('submission1.csv')