In [0]:
import pandas as pd
df = pd.read_table("offenseval-training-v1.tsv",sep='\t',index_col=False, error_bad_lines=False) 

In [0]:
X_test.subtask_c.value_counts()

IND    482
GRP    215
OTH     79
Name: subtask_c, dtype: int64

### Machine Learning Models

In [0]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
  
pd.options.mode.chained_assignment = None

from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier


In [0]:
def lemmatize_sentences(sentence):
    tokens = sentence.split()
    lm_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lm_tokens)

In [0]:
def clean_text(content):
  content=content.str.lower()                                                         # Convert to Lowercase
  content=content.str.replace('@USER','')                                     # Remove triggerword tags
  content=content.str.replace('username','')                                          # Remove username tags
  content=content.str.replace('http\S+|www.\S+', '')                                  # Remove Links
  content=content.str.replace('\s+', ' ')                                             # Remove multiple spaces
  content=content.str.replace('[^A-Za-z\s]+', '')                                     # Remove irrelevant characters other than alphabets and space
  #content=content.apply(lemmatize_sentences)
  return content

In [0]:
df['tweet'] = clean_text(df['tweet'])

In [0]:
df = df.loc[df['subtask_a'] == 'OFF']
df = df.loc[df['subtask_b'] == 'TIN']
df = df.drop(['subtask_a','subtask_b'],axis=1)

In [0]:
taskc_data = df[['tweet','subtask_c']]

In [0]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(taskc_data, test_size=0.2, random_state=0, stratify=taskc_data['subtask_c'])

In [0]:
X_train.shape

(3100, 2)

In [0]:
X_train.to_csv('original_train.tsv',sep='\t',index=False)
X_test.to_csv('original_dev.tsv',sep='\t',index=False)

In [0]:
X_train = pd.read_csv('original_train.tsv',sep='\t')
X_test = pd.read_csv('original_dev.tsv',sep='\t')

le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
from sklearn.model_selection import StratifiedKFold
lr_fit = ROS_pipeline.fit(X_train.tweet, X_train.subtask_c)
model1_probs = lr_fit.predict_proba(X_test.tweet)

In [0]:
# Print f1 score and confusion matrix of best model

f1 score: 0.56822 
[[135  48  32]
 [ 68 393  21]
 [ 28  30  21]]


# Second model (All classes equal)

In [0]:
X_train = pd.read_csv('all_train_data.tsv',sep='\t') 
X_test = pd.read_csv('all_dev_data.tsv',sep='\t') 
le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 2))
model = LogisticRegression(solver='lbfgs')
NM2_pipeline = make_pipeline(tvec, NearMiss(ratio='not minority',random_state=777, version = 2),model)
lr_fit = NM2_pipeline.fit(X_train.tweet, X_train.subtask_c)
model2_probs = lr_fit.predict_proba(X_test.tweet)

# Third model (Make OTH classes equal)

In [0]:
X_train = pd.read_csv('oth_train_data.tsv',sep='\t') 
X_test = pd.read_csv('oth_dev_data.tsv',sep='\t') 
le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)
lr_fit = ROS_pipeline.fit(X_train.tweet, X_train.subtask_c)
model3_probs = lr_fit.predict_proba(X_test.tweet)

# Fourth model (Make grp classes equal)

In [0]:
X_train = pd.read_csv('grp_train_data.tsv',sep='\t') 
X_test = pd.read_csv('grp_dev_data.tsv',sep='\t') 
le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_fit = ROS_pipeline.fit(X_train.tweet, X_train.subtask_c)
model4_probs = lr_fit.predict_proba(X_test.tweet)

In [0]:
final_prob = (model2_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.54319 
[[124  51  40]
 [ 71 384  27]
 [ 26  32  21]]


In [0]:
final_prob = (model3_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.54665 
[[119  38  58]
 [ 63 365  54]
 [ 22  27  30]]


In [0]:
final_prob = (model4_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.56126 
[[141  44  30]
 [ 95 373  14]
 [ 27  31  21]]


In [0]:
final_prob = 0.5*(model1_probs + model2_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.54792 
[[129  50  36]
 [ 69 390  23]
 [ 30  30  19]]


In [0]:
final_prob = 0.5*(model1_probs + model3_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.55235 
[[129  45  41]
 [ 62 383  37]
 [ 26  31  22]]


In [0]:
final_prob = 0.5*(model1_probs + model4_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.57605 
[[142  45  28]
 [ 80 388  14]
 [ 27  31  21]]


In [0]:
final_prob = 0.5*(model2_probs + model3_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.54660 
[[125  48  42]
 [ 68 379  35]
 [ 27  29  23]]


In [0]:
final_prob = 0.5*(model2_probs + model4_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.54629 
[[137  47  31]
 [ 79 379  24]
 [ 29  32  18]]


In [0]:
final_prob = 0.5*(model3_probs + model4_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.55645 
[[135  43  37]
 [ 77 373  32]
 [ 25  31  23]]


In [0]:
final_prob = (model1_probs + model2_probs+model3_probs)/3
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.55762 
[[131  45  39]
 [ 69 381  32]
 [ 25  31  23]]


In [0]:
final_prob = (model1_probs + model2_probs+model4_probs)/3
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.56236 
[[140  44  31]
 [ 74 388  20]
 [ 30  30  19]]


In [0]:
final_prob = (model1_probs + model4_probs+model3_probs)/3
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.57492 
[[139  42  34]
 [ 73 386  23]
 [ 25  31  23]]


In [0]:
final_prob = (model4_probs + model2_probs+model3_probs)/3
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.54986 
[[135  42  38]
 [ 75 379  28]
 [ 29  30  20]]


In [0]:
final_prob = 0.25*(model1_probs + model2_probs+model3_probs+model4_probs)
final_preds = np.argmax(final_prob,axis=1)
f1=f1_score(X_test.subtask_c, final_preds, average='macro')
print("f1 score: {:.5f} ".format(f1))
print(confusion_matrix(X_test.subtask_c, final_preds))

f1 score: 0.55876 
[[135  44  36]
 [ 72 384  26]
 [ 28  30  21]]


# Get text predictions

In [0]:
test = pd.read_csv('test_set_taskc.tsv',sep='\t')
test.tweet = clean_text(test.tweet)

In [0]:
X_train = pd.read_csv('original_train.tsv',sep='\t')
X_test = pd.read_csv('original_dev.tsv',sep='\t')
X_train = X_train.append(X_test)
le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
from sklearn.model_selection import StratifiedKFold
lr_fit = ROS_pipeline.fit(X_train.tweet, X_train.subtask_c)
model1_probs = lr_fit.predict_proba(test.tweet)

In [0]:
X_train = pd.read_csv('oth_train_data.tsv',sep='\t') 
X_test = pd.read_csv('oth_dev_data.tsv',sep='\t') 
X_train = X_train.append(X_test)
le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)
lr_fit = ROS_pipeline.fit(X_train.tweet, X_train.subtask_c)
model3_probs = lr_fit.predict_proba(test.tweet)

In [0]:
X_train = pd.read_csv('grp_train_data.tsv',sep='\t') 
X_test = pd.read_csv('grp_dev_data.tsv',sep='\t') 
X_train = X_train.append(X_test)
le1=LabelEncoder()
X_train.subtask_c=le1.fit_transform(X_train.subtask_c)
X_test.subtask_c=le1.transform(X_test.subtask_c)
tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1,2))
model1 = LogisticRegression(solver='lbfgs')
model2 = AdaBoostClassifier()
model3 = XGBClassifier()
# create the ensemble model
eclf1 = VotingClassifier(estimators=[('LogReg', model1), ('adaboost', model2), ('xgboost', model3)], voting='soft')

ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),eclf1)
lr_fit = ROS_pipeline.fit(X_train.tweet, X_train.subtask_c)
model4_probs = lr_fit.predict_proba(test.tweet)

In [0]:
'''   
final_pred=[0]*len(first_pred)
for i in range(len(first_pred)):
  if(first_pred[i] == 0):
    final_pred[i]='GRP'
  elif(second_pred[i] == 1):
    final_pred[i]='IND'
  else:
    final_pred[i]='OTH'
    
'''
final_prob = (model1_probs + model4_probs+model3_probs)/3
prediction = np.argmax(final_prob,axis=1)

In [0]:
unique, counts = np.unique(prediction, return_counts=True)

print(np.asarray((unique, counts)).T)

[[  0  94]
 [  1 105]
 [  2  14]]


In [0]:
prediction = np.where(prediction == 0, 'GRP', 
                      np.where(prediction == 1, 'IND', 'OTH'))

In [0]:
results = pd.DataFrame({'id':test.id, 'subtask_b':prediction})
results.to_csv('submission3.csv',sep=',',header=False,index=False)
files.download('submission3.csv')