In [1]:
import pandas as pd
import os, csv, re, nltk


In [2]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [3]:
train_path = os.path.join('SADATA2 - SADATA.tsv')

eng_stopwords = set(nltk.corpus.stopwords.words("english"))

tknzr = nltk.TweetTokenizer(strip_handles=True)

text, label1, label2, label3 = [], [], [], []
with open(os.path.abspath(train_path), 'r', encoding='utf-8',errors='ignore')as f:
    line_count = 0
    data = csv.reader(f, delimiter='\t' )
    for row in data:
        if line_count == 0:
            pass
        else:
            # text_input = remove_emoji(p.tokenize(p.clean(row[1])))
            text_input = tknzr.tokenize(row[0].strip())
            text_input = remove_emoji(" ".join([t for t in text_input if not t.startswith('http')]))
            text_input = tknzr.tokenize(text_input)
            text.append(" ".join([t for t in text_input if t not in eng_stopwords]))
            label1.append(row[1])
            label2.append(row[2])
            label3.append(row[3])
        line_count += 1

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# 
X_train, X_test, y_train, y_test = train_test_split(text, label1, test_size=0.30,stratify=label1)

tfidf_transformer = TfidfVectorizer()
tfidf_transformer.fit_transform(text)

X_train_tfidf = tfidf_transformer.transform(X_train)
X_test_tfidf = tfidf_transformer.transform(X_test)

print(X_test_tfidf.shape)
print(X_train_tfidf.shape)

(462, 7046)
(1076, 7046)


In [7]:
gnb = MultinomialNB()
sgd = SGDClassifier(penalty='l2', alpha=1e-3, max_iter=50)
lr = LogisticRegression()
lin_svc = SVC(kernel='linear',probability=True)
model = XGBClassifier()
rf=RandomForestClassifier(n_estimators=100)

In [9]:
from sklearn.ensemble import VotingClassifier
model1 = VotingClassifier(estimators=[('xgb',model),('rf',rf),('gnb', gnb),('svc',lin_svc),('lr', lr)], voting='soft')
model1.fit(X_train_tfidf, y_train)
model1.score(X_test_tfidf,y_test)



0.79004329004329

In [10]:
y_pred = model1.predict(X_test_tfidf)
print("TASK A")
print('\n')
print("model1 accuracy : ",accuracy_score(y_test, y_pred, normalize = True))
print('\n')
print("model1 Classification Report : ",classification_report(y_test, y_pred))

TASK A


model1 accuracy :  0.79004329004329


model1 Classification Report :                precision    recall  f1-score   support

         HOF       0.79      0.77      0.78       225
         NOT       0.79      0.81      0.80       237

    accuracy                           0.79       462
   macro avg       0.79      0.79      0.79       462
weighted avg       0.79      0.79      0.79       462



In [11]:
X_trainB, X_testB, y_trainB, y_testB = train_test_split(text, label2, test_size=0.30,stratify=label2)
tfidf_transformerB = TfidfVectorizer()
tfidf_transformerB.fit_transform(text)
X_train_tfidfB = tfidf_transformerB.transform(X_trainB)
X_test_tfidfB = tfidf_transformerB.transform(X_testB)
gnbB = MultinomialNB()
sgdB = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=50)
lrB = LogisticRegression()
lin_svcB = SVC(kernel='linear',probability=True)
modelB = XGBClassifier()
rfB=RandomForestClassifier(n_estimators=48)

In [12]:
from sklearn.ensemble import VotingClassifier
model2 = VotingClassifier(estimators=[('xgb',modelB),('rf',rfB),('gnb', gnbB),('svc',lin_svcB),('lr', lrB)], voting='soft')
model2.fit(X_train_tfidfB, y_trainB)
model2.score(X_test_tfidfB,y_testB)



0.6103896103896104

In [13]:
y_predB = model2.predict(X_test_tfidfB)
print("TASK B")
print('\n')
print("model2 accuracy : ",accuracy_score(y_testB, y_predB, normalize = True))
print('\n')
print("model2 Classification Report : ",classification_report(y_testB, y_predB))

TASK B


model2 accuracy :  0.6103896103896104


model2 Classification Report :                precision    recall  f1-score   support

        HATE       0.53      0.32      0.39       130
        NONE       0.65      0.93      0.76       237
        OFFN       0.48      0.22      0.30        95

    accuracy                           0.61       462
   macro avg       0.55      0.49      0.49       462
weighted avg       0.58      0.61      0.56       462



In [15]:
X_trainC, X_testC, y_trainC, y_testC = train_test_split(text, label3, test_size=0.30,stratify=label3)


tfidf_transformerC = TfidfVectorizer()
tfidf_transformerC.fit_transform(text)

X_train_tfidfC = tfidf_transformerC.transform(X_trainC)
X_test_tfidfC = tfidf_transformerC.transform(X_testC)

print(X_test_tfidfC.shape)
print(X_train_tfidfC.shape)

gnbC = MultinomialNB()
sgdC = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=50)
lrC = LogisticRegression()
lin_svcC = SVC(kernel='linear',probability=True)
modelC = XGBClassifier()
rfC=RandomForestClassifier(n_estimators=48)

(462, 7046)
(1076, 7046)


In [16]:
from sklearn.ensemble import VotingClassifier
model3 = VotingClassifier(estimators=[('xgb',modelC),('rf',rfC),('gnb', gnbC),('svc',lin_svcC),('lr', lrC)], voting='soft')
model3.fit(X_train_tfidfC, y_trainC)
model3.score(X_test_tfidfC,y_testC)



0.7640692640692641

In [17]:
y_predC = model3.predict(X_test_tfidfC)
print("TASK C")
print('\n')
print("model3 accuracy : ",accuracy_score(y_testC, y_predC, normalize = True))
print('\n')
print("model3 Classification Report : ",classification_report(y_testC, y_predC))

TASK C


model3 accuracy :  0.7640692640692641


model3 Classification Report :                precision    recall  f1-score   support

        NONE       0.78      0.87      0.83       237
         TIN       0.74      0.75      0.74       195
         UNT       0.00      0.00      0.00        30

    accuracy                           0.76       462
   macro avg       0.51      0.54      0.52       462
weighted avg       0.72      0.76      0.74       462

