In [1]:
import nltk
import json
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def normalize_text(words):
    words = to_lowercase(words)
    return words

In [3]:
def tokenize(text):
    return nltk.word_tokenize(text)

In [4]:
def text_prepare(text):
    text = ' '.join([x for x in normalize_text(tokenize(text))])
    return text

In [5]:
df=pd.read_csv('sp+Ip+sn+In.csv')
df['text'] = [text_prepare(x) for x in df['text']]
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [6]:
data = df.text
data_label = df.label

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, data_label, test_size=0.2, random_state=0)

In [8]:
X_train=X_train.tolist()
y_train=y_train.tolist()
X_test=X_test.tolist()
y_test=y_test.tolist()

## Random Forest

In [9]:
tfidf_transformer = TfidfTransformer(norm = 'l2')
count_vec = CountVectorizer(analyzer="char",max_features = 6000,stop_words='english',ngram_range = (1,6))

train_x_t = count_vec.fit_transform(X_train)
train_x = tfidf_transformer.fit_transform(train_x_t)
train_x_char=train_x.toarray()

testx_t=count_vec.transform(X_test)
test_x= tfidf_transformer.transform(testx_t)
test_x_char=test_x.toarray()

##### tfidf + character n-grams

In [10]:
Random_Forest = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced') 
Random_Forest.fit(train_x_char,y_train)
pred=Random_Forest.predict(test_x_char)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  93.75
F1 Score:  93.82350708261895
Precision:  94.03408387974763


In [11]:
tfidf_transformer = TfidfTransformer(norm = 'l2')
count_vec = CountVectorizer(analyzer="word",max_features = 6000,stop_words='english',ngram_range = (1,2))

train_x_t = count_vec.fit_transform(X_train)
train_x_word = tfidf_transformer.fit_transform(train_x_t)

testx_t=count_vec.transform(X_test)
test_x_word= tfidf_transformer.transform(testx_t)

##### tfidf + word n-grams

In [12]:
Random_Forest = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced') 
Random_Forest.fit(train_x_word,y_train)
pred=Random_Forest.predict(test_x_word)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  86.3425925925926
F1 Score:  86.79022512298978
Precision:  87.96528837386191


In [13]:
with open('trainx_emb_80%.json','r') as file:
    train_X_emb=json.load(file)
with open('testx_emb_20%.json','r') as file:
    test_X_emb=json.load(file)

In [14]:
train_x_tfidf_elmo=[train_X_emb[i]+list(train_x_char[i]) for i in range(len(train_X_emb))]
test_x_tfidf_elmo=[test_X_emb[i]+list(test_x_char[i]) for i in range(len(test_X_emb))]

##### elmo embeddings

In [15]:
Random_Forest = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced') 
Random_Forest.fit(train_X_emb,y_train)
pred=Random_Forest.predict(test_X_emb)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  89.35185185185185
F1 Score:  89.68530349578737
Precision:  90.729342790768


##### tfidf + elmo embeddings

In [16]:
Random_Forest = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced') 
Random_Forest.fit(train_x_tfidf_elmo,y_train)
pred=Random_Forest.predict(test_x_tfidf_elmo)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  91.20370370370371
F1 Score:  91.50503708381206
Precision:  92.65508085098583


# SVM

In [17]:
tfidf_transformer = TfidfTransformer(norm = 'l2')
count_vec = CountVectorizer(analyzer="char",max_features = 6000,stop_words='english',ngram_range = (1,6))

train_x_t = count_vec.fit_transform(X_train)
train_x = tfidf_transformer.fit_transform(train_x_t)
train_x_char=train_x.toarray()

testx_t=count_vec.transform(X_test)
test_x= tfidf_transformer.transform(testx_t)
test_x_char=test_x.toarray()

##### tfidf + character n-grams

In [18]:
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(train_x_char,y_train)
pred=SVM.predict(test_x_char)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  93.05555555555556
F1 Score:  93.09833661669282
Precision:  93.18154013223912


In [19]:
tfidf_transformer = TfidfTransformer(norm = 'l2')
count_vec = CountVectorizer(analyzer="word",max_features = 6000,stop_words='english',ngram_range = (1,2))

train_x_t = count_vec.fit_transform(X_train)
train_x_word = tfidf_transformer.fit_transform(train_x_t)

testx_t=count_vec.transform(X_test)
test_x_word= tfidf_transformer.transform(testx_t)

##### tfidf + word n-grams

In [20]:
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(train_x_word,y_train)
pred=SVM.predict(test_x_word)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  84.72222222222221
F1 Score:  84.63701348119157
Precision:  84.58757749460459


In [21]:
with open('trainx_emb_80%.json','r') as file:
    train_X_emb=json.load(file)
with open('testx_emb_20%.json','r') as file:
    test_X_emb=json.load(file)

In [22]:
train_x_tfidf_elmo=[train_X_emb[i]+list(train_x_char[i]) for i in range(len(train_X_emb))]
test_x_tfidf_elmo=[test_X_emb[i]+list(test_x_char[i]) for i in range(len(test_X_emb))]

##### elmo embeddings

In [23]:
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(train_X_emb,y_train)
pred=SVM.predict(test_X_emb)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  88.42592592592592
F1 Score:  88.30325928076793
Precision:  88.33258874363663


##### tfidf + elmo embeddings

In [24]:
SVM = svm.SVC(C=1, kernel='linear')
SVM.fit(train_x_tfidf_elmo,y_train)
pred=SVM.predict(test_x_tfidf_elmo)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  88.42592592592592
F1 Score:  88.30325928076793
Precision:  88.33258874363663


## Logistic Regression

In [25]:
tfidf_transformer = TfidfTransformer(norm = 'l2')
count_vec = CountVectorizer(analyzer="char",max_features = 6000,stop_words='english',ngram_range = (1,6))

train_x_t = count_vec.fit_transform(X_train)
train_x = tfidf_transformer.fit_transform(train_x_t)
train_x_char=train_x.toarray()

testx_t=count_vec.transform(X_test)
test_x= tfidf_transformer.transform(testx_t)
test_x_char=test_x.toarray()

##### tfidf + char n-grams

In [26]:
LR = LogisticRegression(random_state=0)
LR.fit(train_x_char,y_train)
predictions_LR = LR.predict(test_x_char)
print("Accuracy: ",accuracy_score(y_test,predictions_LR)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy:  90.27777777777779
F1 Score:  88.30325928076793
Precision:  88.33258874363663


In [27]:
tfidf_transformer = TfidfTransformer(norm = 'l2')
count_vec = CountVectorizer(analyzer="word",max_features = 6000,stop_words='english',ngram_range = (1,2))

train_x_t = count_vec.fit_transform(X_train)
train_x_word = tfidf_transformer.fit_transform(train_x_t)

testx_t=count_vec.transform(X_test)
test_x_word= tfidf_transformer.transform(testx_t)

##### tfidf + word n-grams

In [28]:
LR = LogisticRegression(random_state=0)
LR.fit(train_x_word,y_train)
predictions_LR = LR.predict(test_x_word)
print("Accuracy: ",accuracy_score(y_test,predictions_LR)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy:  85.64814814814815
F1 Score:  88.30325928076793
Precision:  88.33258874363663


In [29]:
with open('trainx_emb_80%.json','r') as file:
    train_X_emb=json.load(file)
with open('testx_emb_20%.json','r') as file:
    test_X_emb=json.load(file)

In [30]:
train_x_tfidf_elmo=[train_X_emb[i]+list(train_x_char[i]) for i in range(len(train_X_emb))]
test_x_tfidf_elmo=[test_X_emb[i]+list(test_x_char[i]) for i in range(len(test_X_emb))]

##### elmo embeddings

In [31]:
LR = LogisticRegression(C=1, random_state=0, max_iter=600)
LR.fit(train_X_emb,y_train)
pred=LR.predict(test_X_emb)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  91.89814814814815
F1 Score:  91.90613234606465
Precision:  91.91522216265506


##### tfidf + elmo embeddings

In [32]:
LR = LogisticRegression(C=1, random_state=0, max_iter=600) 
LR.fit(train_x_tfidf_elmo,y_train)
pred=LR.predict(test_x_tfidf_elmo)
print("Accuracy Score: ",accuracy_score(pred,y_test)*100)
print("F1 Score: ",f1_score(pred,y_test, average="weighted")*100)
print("Precision: ",precision_score(pred, y_test, average="weighted")*100)

Accuracy Score:  91.89814814814815
F1 Score:  91.90613234606465
Precision:  91.91522216265506
