In [1]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV and JSON files read later in
# this notebook are addressed through that mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import json
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score

In [3]:
import nltk
# Download NLTK's 'punkt' package up front.
# NOTE(review): presumably this is the data nltk.word_tokenize (used in
# tokenize() below) relies on — confirm for the installed NLTK version.
nltk.download('punkt')
import pickle
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`.

    The input iterable is not modified.
    """
    return [token.lower() for token in words]

def normalize_text(words):
    """Normalise a list of tokens; the only step applied is lowercasing."""
    return to_lowercase(words)

In [5]:
def tokenize(text):
    """Split `text` into tokens by delegating to nltk.word_tokenize."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [6]:
def text_prepare(text):
    """Tokenise `text`, normalise the tokens and re-join them with single spaces."""
    tokens = normalize_text(tokenize(text))
    return ' '.join(tokens)

In [8]:
# Read the labelled corpus from the mounted Drive folder.
df = pd.read_csv("/content/gdrive/My Drive/data_set/sp+Ip+sn+In.csv")

# Run every document through text_prepare (tokenise, lowercase, re-join).
df['text'] = list(map(text_prepare, df['text']))

# Replace the raw class labels with integer codes; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [9]:
# Features are the preprocessed texts, targets are the encoded labels.
data, data_label = df['text'], df['label']

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the embedding JSON files loaded further down are fitted against
# y_train / y_test from this split, so they presumably assume exactly these
# settings — confirm before changing test_size or random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data_label,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Convert each part of the split into a plain Python list via .tolist().
X_train, y_train, X_test, y_test = (
    part.tolist() for part in (X_train, y_train, X_test, y_test)
)

### TF-IDF character n-grams

In [12]:
# Character n-gram TF-IDF features.
# Counts are taken over character n-grams of length 1-8, with the vocabulary
# capped at 10,000 entries. stop_words is deliberately not passed:
# scikit-learn only applies it when analyzer == 'word', so with
# analyzer="char" it had no effect on the features and only triggered a
# UserWarning.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(analyzer="char", max_features=10000, ngram_range=(1, 8))

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
trainx_t = count_vec.fit_transform(X_train)
testx_t = count_vec.transform(X_test)

# Same for the IDF weights: fitted on train, applied to test.
train_x = tfidf_transformer.fit_transform(trainx_t)
test_x = tfidf_transformer.transform(testx_t)

# Dense copies: a later cell indexes these row by row and concatenates each
# row with the matching embedding list.
train_x_char = train_x.toarray()
test_x_char = test_x.toarray()

### TF-IDF (char) + SVM

In [13]:
# Linear-kernel SVM on the dense character n-gram TF-IDF features.
# `degree` and `gamma` are not passed: scikit-learn ignores both when
# kernel='linear', so listing them only suggested tuning that never happened.
SVM = svm.SVC(C=1.0, kernel='linear', class_weight='balanced')
SVM.fit(train_x_char, y_train)
pred = SVM.predict(test_x_char)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print("Tfidf(char)+SVM Accuracy Score -> ",accuracy_score(y_test, pred)*100)


Tfidf(char)+SVM Accuracy Score ->  91.66666666666666


### TF-IDF word n-grams

In [14]:
# Word-level TF-IDF: unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 entries.
# NOTE: this rebinds tfidf_transformer, count_vec, trainx_t and testx_t, which
# the character n-gram cell above also assigns.
tfidf_transformer = TfidfTransformer(norm='l2')
count_vec = CountVectorizer(
    analyzer="word",
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)

# Vocabulary and IDF weights are fitted on the training texts only...
trainx_t = count_vec.fit_transform(X_train)
train_x_word = tfidf_transformer.fit_transform(trainx_t)

# ...and then applied unchanged to the test texts.
testx_t = count_vec.transform(X_test)
test_x_word = tfidf_transformer.transform(testx_t)

### TF-IDF (word) + SVM

In [15]:
# Linear-kernel SVM on the sparse word n-gram TF-IDF features.
# `degree` and `gamma` are not passed: scikit-learn ignores both when
# kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear', class_weight='balanced')
SVM.fit(train_x_word, y_train)
pred = SVM.predict(test_x_word)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print("Tfidf(word)+SVM Accuracy Score -> ",accuracy_score(y_test, pred)*100)

Tfidf(word)+SVM Accuracy Score ->  84.25925925925925


In [17]:
# Load the precomputed embedding vectors for the train and test portions.
# NOTE(review): presumably one vector per document, in the same order as
# X_train / X_test — verify against whatever produced these files.
with open('/content/gdrive/My Drive/data/trainx_emb_80%.json', 'r') as train_emb_file:
    Train_X_emb = json.load(train_emb_file)

with open('/content/gdrive/My Drive/data/testx_emb_20%.json', 'r') as test_emb_file:
    Test_X_emb = json.load(test_emb_file)

In [18]:
# Build the combined feature vectors: each document's loaded embedding
# followed by its character n-gram TF-IDF row.
# The two sources are paired purely by position, so fail loudly on a
# row-count mismatch instead of silently dropping rows or hitting a bare
# IndexError halfway through.
if len(Train_X_emb) != len(train_x_char):
    raise ValueError(
        "Train embeddings (%d rows) and char TF-IDF matrix (%d rows) are misaligned"
        % (len(Train_X_emb), len(train_x_char))
    )
if len(Test_X_emb) != len(test_x_char):
    raise ValueError(
        "Test embeddings (%d rows) and char TF-IDF matrix (%d rows) are misaligned"
        % (len(Test_X_emb), len(test_x_char))
    )

train_x_tfidf_elmo = [emb + list(row) for emb, row in zip(Train_X_emb, train_x_char)]
test_x_tfidf_elmo = [emb + list(row) for emb, row in zip(Test_X_emb, test_x_char)]

### ELMo + SVM

In [19]:
# Linear-kernel SVM trained on the loaded embedding vectors alone.
# `degree` and `gamma` are not passed: scikit-learn ignores both when
# kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear', class_weight='balanced')
SVM.fit(Train_X_emb, y_train)
pred = SVM.predict(Test_X_emb)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print("Elmo+SVM Accuracy Score -> ",accuracy_score(y_test, pred)*100)


Elmo+SVM Accuracy Score ->  88.42592592592592


### ELMo + TF-IDF + SVM

In [20]:
# Linear-kernel SVM on the concatenated embedding + character TF-IDF vectors.
# `degree` and `gamma` are not passed: scikit-learn ignores both when
# kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear', class_weight='balanced')
SVM.fit(train_x_tfidf_elmo, y_train)
pred = SVM.predict(test_x_tfidf_elmo)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print("Elmo+tfidf+SVM Accuracy Score -> ",accuracy_score(y_test, pred)*100)

Elmo+tfidf+SVM Accuracy Score ->  88.6574074074074
