In [1]:
from sklearn import svm
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
import nltk
import pickle
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

In [3]:
def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in ``words``.

    ``words`` is any iterable of strings (typically the output of a tokenizer);
    the input itself is not modified.
    """
    return [token.lower() for token in words]

def normalize_text(words):
    """Normalize a list of tokens.

    The only normalization step applied is lowercasing, delegated to
    ``to_lowercase``; the normalized tokens are returned as a new list.
    """
    return to_lowercase(words)

In [4]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [5]:
def text_prepare(text):
    """Tokenize ``text``, normalize the tokens, and join them back with single spaces.

    Returns the cleaned text as one string.
    """
    tokens = tokenize(text)
    normalized = normalize_text(tokens)
    return ' '.join(normalized)

In [6]:
# Load the labelled corpus and run every document through text_prepare
# (tokenize -> lowercase -> re-join).
df = pd.read_csv('sp+Ip+sn+In.csv')
df['text'] = df['text'].apply(text_prepare)

# Replace the raw labels with integer codes; `le` is kept so the codes can be
# mapped back to the original labels with le.inverse_transform.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [7]:
# Features (cleaned text) and targets (encoded labels) as separate Series.
data = df['text']
data_label = df['label']

In [None]:
# data=pd.read_csv('sp+Ip+sn+In.csv')
# data_label = data['label']
# # del data['label']

In [8]:
# Hold out 20% of the documents for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the precomputed embedding files loaded further down are presumably
# aligned row-for-row with this exact split - confirm before changing test_size or random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data_label,
    test_size=0.2,
    random_state=0,
)

In [None]:
# pickle.dump( X_train, open( "X_train.p", "wb" ) )
# pickle.dump( X_test, open( "X_test.p", "wb" ) )
# pickle.dump( y_train, open( "y_train.p", "wb" ) )
# pickle.dump( y_test, open( "y_test.p", "wb" ) )

In [9]:
# Convert the four pandas Series produced by the split into plain Python lists.
# This cell is not re-runnable on its own: after the first run the names hold
# lists, which have no .tolist() method.
X_train, y_train, X_test, y_test = (
    split_part.tolist() for split_part in (X_train, y_train, X_test, y_test)
)

In [None]:
# pickle.dump( X_train, open( "X_train.p", "wb" ) )
# pickle.dump( X_test, open( "X_test.p", "wb" ) )
# pickle.dump( y_train, open( "y_train.p", "wb" ) )
# pickle.dump( y_test, open( "y_test.p", "wb" ) )

In [None]:
# X_train = pickle.load( open( "X_train.p", "rb" ) )
# y_train = pickle.load( open( "y_train.p", "rb" ) )
# X_test = pickle.load( open( "X_test.p", "rb" ) )
# y_test = pickle.load( open( "y_test.p", "rb" ) )

In [10]:
# Character n-gram TF-IDF features: counts of 1- to 8-character n-grams, with the
# vocabulary capped at 10,000 entries, re-weighted by TF-IDF with L2 row normalization.
# `stop_words` is not passed here: scikit-learn only applies it when
# analyzer == 'word', so with analyzer="char" it never affected the features.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(analyzer="char", max_features=10000, ngram_range=(1, 8))

# Fit the vocabulary on the training split only, then reuse it for the test split.
trainx_t = count_vec.fit_transform(X_train)
testx_t = count_vec.transform(X_test)

# Same pattern for the IDF weights: fit on train, apply to test.
train_x = tfidf_transformer.fit_transform(trainx_t)
test_x = tfidf_transformer.transform(testx_t)

# Densify the sparse matrices; the dense rows are later concatenated with the
# embedding vectors element-wise as Python lists.
train_x_char = train_x.toarray()
test_x_char = test_x.toarray()

##### tfidf + character n-grams

In [11]:
# Train a class-weighted random forest (800 trees, fixed seed) on the dense
# character n-gram TF-IDF features and report accuracy on the held-out split, in percent.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_x_char, y_train)
pred = Random_Forest.predict(test_x_char)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print("Accuracy Score: ",accuracy_score(y_test, pred)*100)

Accuracy Score:  93.05555555555556


In [12]:
# Word-level TF-IDF features: unigrams and bigrams with English stop words
# removed, vocabulary capped at 10,000 entries, L2-normalized rows.
# Note: this rebinds tfidf_transformer, count_vec, trainx_t and testx_t from
# the character n-gram cell.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(
    analyzer="word",
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)

# Vocabulary is learned from the training split only.
trainx_t = count_vec.fit_transform(X_train)
testx_t = count_vec.transform(X_test)

# IDF weights are likewise fitted on train and applied to test; the results stay sparse.
train_x_word = tfidf_transformer.fit_transform(trainx_t)
test_x_word = tfidf_transformer.transform(testx_t)

##### tfidf + word n-grams

In [13]:
# Train a class-weighted random forest (800 trees, fixed seed) on the word
# n-gram TF-IDF features and report accuracy on the held-out split, in percent.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_x_word, y_train)
pred = Random_Forest.predict(test_x_word)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print("Accuracy Score: ",accuracy_score(y_test, pred)*100)

Accuracy Score:  86.57407407407408


In [14]:
# Load the precomputed embedding features for the train and test splits from JSON.
# NOTE(review): the rows are presumably in the same order as X_train / X_test - confirm.
with open('trainx_emb_80%.json', 'r') as train_emb_file:
    Train_X_emb = json.load(train_emb_file)

with open('testx_emb_20%.json', 'r') as test_emb_file:
    Test_X_emb = json.load(test_emb_file)

In [15]:
# Build the combined feature rows: each embedding vector followed by the dense
# character n-gram TF-IDF row for the same document.
# The embeddings are loaded from separate files, so verify they line up
# row-for-row with the TF-IDF matrices before pairing them; a mismatch would
# otherwise surface later as a confusing IndexError or a sample-count error in fit().
assert len(Train_X_emb) == len(train_x_char), (
    "train embeddings and char TF-IDF features have different numbers of rows"
)
assert len(Test_X_emb) == len(test_x_char), (
    "test embeddings and char TF-IDF features have different numbers of rows"
)

train_x_tfidf_elmo = [emb + list(tfidf_row) for emb, tfidf_row in zip(Train_X_emb, train_x_char)]
test_x_tfidf_elmo = [emb + list(tfidf_row) for emb, tfidf_row in zip(Test_X_emb, test_x_char)]

##### elmo embeddings

In [16]:
# Train a class-weighted random forest (800 trees, fixed seed) on the
# precomputed embedding features alone and report held-out accuracy, in percent.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(Train_X_emb, y_train)
pred = Random_Forest.predict(Test_X_emb)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print("Accuracy Score: ",accuracy_score(y_test, pred)*100)

Accuracy Score:  89.58333333333334


##### tfidf + elmo embeddings

In [17]:
# Train a class-weighted random forest (800 trees, fixed seed) on the
# concatenated embedding + character n-gram TF-IDF features and report
# held-out accuracy, in percent.
Random_Forest = RandomForestClassifier(n_estimators=800, random_state=0, class_weight='balanced')
Random_Forest.fit(train_x_tfidf_elmo, y_train)
pred = Random_Forest.predict(test_x_tfidf_elmo)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print("Accuracy Score: ",accuracy_score(y_test, pred)*100)

Accuracy Score:  91.20370370370371
