In [1]:
#Libraries 
import pickle

import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from textblob import Word

In [2]:
def _clean_text_column(column):
    """Blank out digits, 1-2 character words and punctuation, then lemmatize.

    Parameters
    ----------
    column : pandas.Series
        Raw text values. Missing values are treated as empty strings so the
        lemmatization step does not fail on NaN.

    Returns
    -------
    pandas.Series
        Cleaned text: each surviving whitespace-separated token is
        lemmatized with textblob's ``Word.lemmatize`` and the tokens are
        re-joined with single spaces.
    """
    cleaned = column.fillna('').astype(str)
    # Order matters: digits first, then short words, then punctuation.
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently disable all three steps.
    for pattern in (r'\d+', r'(\b\w{1,2}\b)', r'[^\w\s]'):
        cleaned = cleaned.str.replace(pattern, ' ', regex=True)
    return cleaned.apply(
        lambda text: " ".join(Word(token).lemmatize() for token in text.split())
    )


def clean_dataset(X):
    """Add cleaned, lemmatized copies of the 'title' and 'text' columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'title' and 'text' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with two new columns,
        'clean_title' and 'clean_text'. The frame is modified in place.
    """
    X['clean_title'] = _clean_text_column(X['title'])
    X['clean_text'] = _clean_text_column(X['text'])
    return X
    
def split_dataset(X, y):
    """Split features and labels 80/20 with a fixed seed.

    Returns the tuple (X_train, X_test, y_train, y_test); every piece has its
    index reset to 0..n-1 and the old index dropped.
    """
    # train_test_split yields its pieces in exactly the order we return them.
    pieces = train_test_split(X, y, test_size=0.20, random_state=1000)
    return tuple(piece.reset_index(drop=True) for piece in pieces)

def count_vectorizer_train(train_data, ngram):
    """Fit one CountVectorizer per text column and build the training matrix.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'clean_title' and 'clean_text' columns.
    ngram : tuple
        Passed to ``CountVectorizer`` as ``ngram_range``.

    Returns
    -------
    tuple
        ``(vectorized_train, countVec_title, countVec_text, tokens_title,
        tokens_text)``: the dense feature frame (title columns followed by
        text columns), the two fitted vectorizers, and their vocabularies as
        lists of strings.
    """
    frames, vectorizers, vocabularies = [], [], []
    for column in ('clean_title', 'clean_text'):
        # min_df=0.01 drops terms that occur in fewer than 1% of documents.
        vectorizer = CountVectorizer(lowercase=True, stop_words='english', min_df=0.01,
                                     ngram_range=ngram, strip_accents='ascii')
        # astype('U') forces every cell to a unicode string before vectorizing.
        matrix = vectorizer.fit_transform(train_data[column].values.astype('U'))

        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, 'get_feature_names_out'):
            tokens = vectorizer.get_feature_names_out().tolist()
        else:
            tokens = vectorizer.get_feature_names()

        frames.append(pd.DataFrame(matrix.toarray(), columns=tokens))
        vectorizers.append(vectorizer)
        vocabularies.append(tokens)

    # Combine the title and text features of the training data side by side.
    # NOTE: a term present in both vocabularies yields two columns with the
    # same label.
    vectorized_train = pd.concat(frames, axis=1)

    return vectorized_train, vectorizers[0], vectorizers[1], vocabularies[0], vocabularies[1]


def count_vectorizer_test(test_data, countVec_title, countVec_text, tokens_title, tokens_text):
    """Vectorize held-out data with already-fitted count vectorizers.

    Only ``transform`` is called, so the vocabulary learned on the training
    data is reused. Returns one frame with the title columns followed by the
    text columns.
    """
    def to_frame(vectorizer, column, tokens):
        # Transform a single text column and label its features.
        documents = test_data[column].values.astype('U')
        return pd.DataFrame(vectorizer.transform(documents).toarray(), columns=tokens)

    return pd.concat(
        [
            to_frame(countVec_title, 'clean_title', tokens_title),
            to_frame(countVec_text, 'clean_text', tokens_text),
        ],
        axis=1,
    )

def tfidf_vectorizer_train(train_data, ngram):
    """Fit one TfidfVectorizer per text column and build the training matrix.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'clean_title' and 'clean_text' columns.
    ngram : tuple
        Passed to ``TfidfVectorizer`` as ``ngram_range``.

    Returns
    -------
    tuple
        ``(vectorized_train, tfidfVec_title, tfidfVec_text, tokens_title,
        tokens_text)``: the dense feature frame (title columns followed by
        text columns), the two fitted vectorizers, and their vocabularies as
        lists of strings.
    """
    frames, vectorizers, vocabularies = [], [], []
    for column in ('clean_title', 'clean_text'):
        # min_df=0.01 drops terms that occur in fewer than 1% of documents.
        vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', min_df=0.01,
                                     ngram_range=ngram, strip_accents='ascii')
        # astype('U') forces every cell to a unicode string before vectorizing.
        matrix = vectorizer.fit_transform(train_data[column].values.astype('U'))

        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, 'get_feature_names_out'):
            tokens = vectorizer.get_feature_names_out().tolist()
        else:
            tokens = vectorizer.get_feature_names()

        frames.append(pd.DataFrame(matrix.toarray(), columns=tokens))
        vectorizers.append(vectorizer)
        vocabularies.append(tokens)

    # Combine the title and text features of the training data side by side.
    # NOTE: a term present in both vocabularies yields two columns with the
    # same label.
    vectorized_train = pd.concat(frames, axis=1)

    return vectorized_train, vectorizers[0], vectorizers[1], vocabularies[0], vocabularies[1]
    
def tfidf_vectorizer_test(test_data, tfidfVec_title, tfidfVec_text, tokens_title, tokens_text):
    """Vectorize held-out data with already-fitted TF-IDF vectorizers.

    Only ``transform`` is called, so the vocabulary and IDF weights come from
    the earlier fit. Returns one frame with the title columns followed by the
    text columns.
    """
    # (source column, fitted vectorizer, feature labels), title first.
    plan = (
        ('clean_title', tfidfVec_title, tokens_title),
        ('clean_text', tfidfVec_text, tokens_text),
    )
    blocks = [
        pd.DataFrame(
            fitted.transform(test_data[column].values.astype('U')).toarray(),
            columns=labels,
        )
        for column, fitted, labels in plan
    ]
    return pd.concat(blocks, axis=1)

In [3]:
# Load the raw dataset; the file is expected to provide 'title', 'text' and
# 'label' columns, which are the only ones read below.
data = pd.read_csv("newData_w_title.csv")

#Divide data into features and labels
# X starts with the two raw text columns; clean_dataset then adds the
# 'clean_title' and 'clean_text' columns used by the vectorizers.
X = data.loc[:,['title','text']]
X = clean_dataset(X)
# y is kept as a one-column DataFrame, so later cells select y_train['label'].
y = pd.DataFrame(data['label'])

In [4]:
#Split the data into train and test 
X_train, X_test, y_train, y_test = split_dataset(X, y)

from sklearn.model_selection import GridSearchCV 
param_grid = {'C': [0.1,1, 10]} 
grid = GridSearchCV(LinearSVC(dual=False,max_iter=10000),param_grid,refit=True,verbose=2) 
grid.fit(train_data, y_train['label']) 
print(grid.best_estimator_)

In [5]:
#Tfidf Vectorize
ngram = (1,3)
train_data, tfidfVec_title, tfidfVec_text, tokens_title, tokens_text = tfidf_vectorizer_train(X_train, ngram)
test_data = tfidf_vectorizer_test(X_test, tfidfVec_title, tfidfVec_text, tokens_title, tokens_text)

In [None]:
# Base linear SVM. NOTE(review): C=1 is hard-coded; presumably it is the value
# reported by the grid search above - confirm against grid.best_estimator_.
svm = LinearSVC(C=1,dual=False,max_iter=10000)
# Wrap the SVM in a calibration layer, then in a one-vs-rest classifier.
clf0 = CalibratedClassifierCV(svm)
clf = OneVsRestClassifier(clf0)
# Fit on the TF-IDF training features against the 'label' column.
clf.fit(train_data, y_train['label'])

In [None]:
# Predict labels for the held-out TF-IDF features.
yhat = clf.predict(test_data)

In [None]:
# Print each held-out metric as a percentage with two decimals, in the order
# accuracy, precision, F1, recall.
for _template, _scorer in (
    ('Accuracy: %.2f', accuracy_score),
    ('Precision: %.2f', precision_score),
    ('F1 Score: %.2f', f1_score),
    ('Recall: %.2f', recall_score),
):
    print(_template % (_scorer(y_test['label'], yhat) * 100))

In [None]:
# Persist the two fitted vectorizers and the classifier. Each file is opened
# in a with-block so the handle is flushed and closed once the dump finishes.
# NOTE: only unpickle these files from a trusted location - pickle.load can
# execute arbitrary code.
with open("tfidf_1-3_vector_title.sav", "wb") as title_file:
    pickle.dump(tfidfVec_title, title_file)
with open("tfidf_1-3_vector_text.sav", "wb") as text_file:
    pickle.dump(tfidfVec_text, text_file)
with open('SVM_tfidf_1-3.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)