In [1]:
import json
import pickle

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
def to_lowercase(words):
    """Return a new list containing the lowercase form of every token in `words`.

    The input iterable is not modified; token order is preserved.
    """
    return [token.lower() for token in words]

def normalize_text(words):
    """Normalize a list of tokens; the only step currently applied is lowercasing."""
    return to_lowercase(words)

In [3]:
def tokenize(text):
    """Split `text` into tokens by delegating to NLTK's `word_tokenize`."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [4]:
def text_prepare(text):
    """Tokenize `text`, normalize the tokens, and re-join them with single spaces."""
    tokens = normalize_text(tokenize(text))
    return ' '.join(tokens)

In [5]:
# Load the labelled corpus; the CSV is expected to provide 'text' and 'label' columns.
df = pd.read_csv('sp+Ip+sn+In.csv')

# Run every document through the tokenize -> normalize -> re-join pipeline.
df['text'] = list(map(text_prepare, df['text']))

# Replace the raw labels with integer class ids; `le` is kept so the ids can
# be mapped back to the original labels with `le.inverse_transform`.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [6]:
# Features (prepared text) and targets (encoded labels) as separate Series.
data = df['text']
data_label = df['label']

In [7]:
# Hold out 20% of the documents for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data_label,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Convert each split from a pandas Series to a plain Python list.
X_train, y_train, X_test, y_test = (
    X_train.tolist(),
    y_train.tolist(),
    X_test.tolist(),
    y_test.tolist(),
)

## Random Forest

In [9]:
# Character-level TF-IDF features: counts of 1- to 8-character n-grams
# (vocabulary capped at 10,000 entries), re-weighted by TF-IDF with L2 row
# normalisation.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == "word", so with the char analyzer it was ignored and merely
# triggered a UserWarning. The extracted features are unchanged.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(analyzer="char", max_features=10000, ngram_range=(1, 8))

# Fit the vocabulary and IDF weights on the training split only, so nothing
# from the test documents leaks into the feature definition.
train_x_t = count_vec.fit_transform(X_train)
train_x = tfidf_transformer.fit_transform(train_x_t)
# Densified because a later cell concatenates these rows with the ELMo embeddings.
train_x_char = train_x.toarray()

# Project the test split onto the vocabulary / IDF weights learned above.
testx_t = count_vec.transform(X_test)
test_x = tfidf_transformer.transform(testx_t)
test_x_char = test_x.toarray()

##### TF-IDF + character n-grams

In [10]:
# Random forest on the dense character n-gram TF-IDF features.
# class_weight='balanced' re-weights classes inversely to their frequency in
# y_train; random_state fixes the forest so the score is reproducible.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_x_char, y_train)
pred = Random_Forest.predict(test_x_char)
# accuracy_score's documented signature is (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but the correct order keeps this cell
# consistent with the logistic-regression cells and safe to adapt to
# order-sensitive metrics.
print("Accuracy Score: ", accuracy_score(y_test, pred) * 100)

Accuracy Score:  93.05555555555556


In [11]:
# Word-level TF-IDF features: unigrams and bigrams with English stop words
# removed, vocabulary capped at 10,000 entries, L2-normalised rows.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(
    analyzer="word",
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)

# Learn the vocabulary and IDF weights from the training split only.
train_x_t = count_vec.fit_transform(X_train)
train_x_word = tfidf_transformer.fit_transform(train_x_t)

# Reuse the fitted vectorizer/transformer for the held-out split.
testx_t = count_vec.transform(X_test)
test_x_word = tfidf_transformer.transform(testx_t)

##### TF-IDF + word n-grams

In [12]:
# Random forest on the (sparse) word n-gram TF-IDF features.
# class_weight='balanced' re-weights classes inversely to their frequency in
# y_train; random_state fixes the forest so the score is reproducible.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_x_word, y_train)
pred = Random_Forest.predict(test_x_word)
# accuracy_score's documented signature is (y_true, y_pred). The value is the
# same either way, but the correct order matches the logistic-regression cells
# and stays correct if the metric is later swapped for an asymmetric one.
print("Accuracy Score: ", accuracy_score(y_test, pred) * 100)

Accuracy Score:  86.57407407407408


In [13]:
# Load the precomputed embeddings for the two splits (ELMo, per the section
# headings of this notebook).
# NOTE(review): the rows are assumed to be in the same order as X_train /
# X_test produced by the split above — TODO confirm against the script that
# generated these files.
with open('trainx_emb_80%.json', 'r') as train_emb_file:
    train_X_emb = json.load(train_emb_file)

with open('testx_emb_20%.json', 'r') as test_emb_file:
    test_X_emb = json.load(test_emb_file)

In [14]:
# Combined feature matrices: each row is a document's embedding followed by
# its dense character-level TF-IDF vector.
# Fail early, with a clear message, if the embedding files do not line up with
# the TF-IDF matrices; otherwise the mismatch only surfaces later as an
# IndexError or as a shape error inside the classifier.
if len(train_X_emb) != len(train_x_char) or len(test_X_emb) != len(test_x_char):
    raise ValueError(
        "Embedding and TF-IDF row counts differ: "
        "train {} vs {}, test {} vs {}".format(
            len(train_X_emb), len(train_x_char), len(test_X_emb), len(test_x_char)
        )
    )

# np.hstack joins the two blocks column-wise in one vectorised step instead of
# building a Python list of ~10,000 boxed floats for every document.
train_x_tfidf_elmo = np.hstack([np.asarray(train_X_emb), train_x_char])
test_x_tfidf_elmo = np.hstack([np.asarray(test_X_emb), test_x_char])

##### ELMo embeddings

In [15]:
# Random forest on the precomputed embeddings alone.
# class_weight='balanced' re-weights classes inversely to their frequency in
# y_train; random_state fixes the forest so the score is reproducible.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_X_emb, y_train)
pred = Random_Forest.predict(test_X_emb)
# accuracy_score's documented signature is (y_true, y_pred). The value is the
# same either way, but the correct order matches the logistic-regression cells
# and stays correct if the metric is later swapped for an asymmetric one.
print("Accuracy Score: ", accuracy_score(y_test, pred) * 100)

Accuracy Score:  89.58333333333334


##### TF-IDF + ELMo embeddings

In [16]:
# Random forest on the concatenated embedding + character TF-IDF features.
# class_weight='balanced' re-weights classes inversely to their frequency in
# y_train; random_state fixes the forest so the score is reproducible.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_x_tfidf_elmo, y_train)
pred = Random_Forest.predict(test_x_tfidf_elmo)
# accuracy_score's documented signature is (y_true, y_pred). The value is the
# same either way, but the correct order matches the logistic-regression cells
# and stays correct if the metric is later swapped for an asymmetric one.
print("Accuracy Score: ", accuracy_score(y_test, pred) * 100)

Accuracy Score:  91.20370370370371


## Logistic Regression

In [17]:
# Rebuild the character-level TF-IDF features for the logistic-regression
# experiments: 1- to 8-character n-grams, vocabulary capped at 10,000 entries,
# TF-IDF weighted with L2 row normalisation.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == "word", so with the char analyzer it was ignored and merely
# triggered a UserWarning. The extracted features are unchanged.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(analyzer="char", max_features=10000, ngram_range=(1, 8))

# Fit the vocabulary and IDF weights on the training split only.
train_x_t = count_vec.fit_transform(X_train)
train_x = tfidf_transformer.fit_transform(train_x_t)
train_x_char = train_x.toarray()

# Project the test split onto the vocabulary / IDF weights learned above.
testx_t = count_vec.transform(X_test)
test_x = tfidf_transformer.transform(testx_t)
test_x_char = test_x.toarray()

##### TF-IDF + character n-grams

In [18]:
# Logistic regression (default hyper-parameters) on the dense character
# n-gram TF-IDF features. `fit` returns the estimator itself, so construction
# and training are chained.
LR = LogisticRegression(random_state=0).fit(train_x_char, y_train)
predictions_LR = LR.predict(test_x_char)
print("Accuracy -> ", accuracy_score(y_test, predictions_LR) * 100)

Accuracy ->  91.43518518518519


In [19]:
# Rebuild the word-level TF-IDF features for the logistic-regression
# experiments: unigrams and bigrams with English stop words removed,
# vocabulary capped at 10,000 entries, L2-normalised rows.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(
    analyzer="word",
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)

# Learn the vocabulary and IDF weights from the training split only.
train_x_t = count_vec.fit_transform(X_train)
train_x_word = tfidf_transformer.fit_transform(train_x_t)

# Reuse the fitted vectorizer/transformer for the held-out split.
testx_t = count_vec.transform(X_test)
test_x_word = tfidf_transformer.transform(testx_t)

##### TF-IDF + word n-grams

In [20]:
# Logistic regression (default hyper-parameters) on the sparse word n-gram
# TF-IDF features. `fit` returns the estimator itself, so construction and
# training are chained.
LR = LogisticRegression(random_state=0).fit(train_x_word, y_train)
predictions_LR = LR.predict(test_x_word)
print("Accuracy -> ", accuracy_score(y_test, predictions_LR) * 100)

Accuracy ->  84.95370370370371


##### ELMo embeddings

In [21]:
# Logistic regression on the precomputed embeddings alone.
# max_iter is raised above scikit-learn's default of 100 to give the solver
# more iterations, and C=0.5 applies stronger regularisation than the default
# C=1.0.
LR = LogisticRegression(random_state=0, max_iter=600, C=0.5)
LR.fit(train_X_emb, y_train)
pred = LR.predict(test_X_emb)
# accuracy_score's documented signature is (y_true, y_pred). The value is the
# same either way, but the correct order matches the other logistic-regression
# cells and stays correct if the metric is later swapped for an asymmetric one.
print("Accuracy Score: ", accuracy_score(y_test, pred) * 100)

Accuracy Score:  92.5925925925926


##### TF-IDF + ELMo embeddings

In [None]:
# Logistic regression on the concatenated embedding + character TF-IDF
# features.
# max_iter is raised above scikit-learn's default of 100 to give the solver
# more iterations, and C=0.5 applies stronger regularisation than the default
# C=1.0.
LR = LogisticRegression(random_state=0, max_iter=800, C=0.5)
LR.fit(train_x_tfidf_elmo, y_train)
pred = LR.predict(test_x_tfidf_elmo)
# accuracy_score's documented signature is (y_true, y_pred). The value is the
# same either way, but the correct order matches the other logistic-regression
# cells and stays correct if the metric is later swapped for an asymmetric one.
print("Accuracy Score: ", accuracy_score(y_test, pred) * 100)