<a href="https://colab.research.google.com/github/amazingashis/Machine_Learning_Advanced/blob/main/ResearchPaperClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd 
import sklearn
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import tree
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2

from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support


import gensim, logging
from gensim.models import Word2Vec
from scipy import sparse

In [2]:
# Colab-only step: mount Google Drive at /content/drive, which is where the
# default dataset path used by loadData() lives.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def loadData(filePath="/content/drive/MyDrive/Data/dataset.csv"):
    """Read the dataset CSV and return its title and conference columns.

    Parameters
    ----------
    filePath : str
        Path of a CSV file whose first row is a header containing at least
        the columns "Title" and "Conference".

    Returns
    -------
    tuple
        ``(titles, conferences)``: the "Title" column followed by the
        "Conference" column of the loaded frame.
    """
    frame = pd.read_csv(filePath, header=0)
    titles = frame["Title"]
    conferences = frame["Conference"]
    return titles, conferences

def preProcessing(features):
    """Lower-case each title, split it on whitespace and drop English stop words.

    Parameters
    ----------
    features : iterable of str
        Raw titles, e.g. the "Title" column as a pandas Series or a plain list.

    Returns
    -------
    clean_titles : list of str
        One space-joined string of the surviving words per title.
    clean_wordlist : list of list of str
        The same surviving words, kept as a token list per title.
    """
    stops = set(stopwords.words('english'))
    clean_wordlist = []
    clean_titles = []
    # Iterate over the values rather than looking up features[i]: integer
    # lookups on a Series go by index label, which fails as soon as the index
    # is not the default 0..n-1 (for example after filtering rows).
    for title in features:
        words = [w for w in title.lower().split() if w not in stops]
        clean_wordlist.append(words)
        clean_titles.append(" ".join(words))
    return clean_titles, clean_wordlist

In [4]:
def getDTMByTFIDF(features,nfeatures):
    """Build a dense TF-IDF document-term matrix from the given documents.

    Parameters
    ----------
    features : iterable of str
        Documents to vectorize.
    nfeatures : int or None
        Forwarded to TfidfVectorizer as ``max_features``.

    Returns
    -------
    tuple
        ``(dtm, vectorizer)``: the matrix converted with ``toarray()`` and
        the fitted TfidfVectorizer that produced it.
    """
    vectorizer = TfidfVectorizer(max_features=nfeatures)
    sparse_dtm = vectorizer.fit_transform(features)
    return sparse_dtm.toarray(), vectorizer

In [5]:
def featuresByChiSq(features,labels,nFeature=5000):
    """Keep the ``nFeature`` best columns according to the chi-squared score.

    Parameters
    ----------
    features : array-like
        Document-term matrix to select columns from.
    labels : array-like
        Class label of every row in ``features``.
    nFeature : int
        Passed to SelectKBest as ``k``.

    Returns
    -------
    tuple
        ``(reduced_matrix, selector)``: the transformed matrix and the fitted
        SelectKBest instance.
    """
    selector = SelectKBest(chi2, k=nFeature)
    selector.fit(features, labels)
    reduced = selector.transform(features)
    return reduced, selector

def featuresByInformationGain(features,labels):
    """Select columns using an entropy-criterion decision tree.

    A DecisionTreeClassifier with ``criterion="entropy"`` is fitted on the
    full input, then wrapped in SelectFromModel (``prefit=True``) to reduce
    ``features`` to the columns that selector keeps.

    Parameters
    ----------
    features : array-like
        Document-term matrix to select columns from.
    labels : array-like
        Class label of every row in ``features``.

    Returns
    -------
    The transformed feature matrix only; the fitted selector is not returned.
    """
    entropy_tree = tree.DecisionTreeClassifier(criterion="entropy").fit(features, labels)
    selector = SelectFromModel(entropy_tree, prefit=True)
    return selector.transform(features)

def featuresByLSA(features,ncomponents=100):
    """Project the matrix with TruncatedSVD and normalize the result.

    Parameters
    ----------
    features : array-like
        Document-term matrix to reduce.
    ncomponents : int
        Passed to TruncatedSVD as ``n_components``.

    Returns
    -------
    The output of a TruncatedSVD -> Normalizer pipeline fitted on ``features``.
    """
    reducer = make_pipeline(
        TruncatedSVD(n_components=ncomponents),
        Normalizer(copy=False),
    )
    return reducer.fit_transform(features)

In [6]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one title.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single title.
    model : object
        A trained word-vector model. Vectors are read from ``model.wv`` when
        that attribute exists, otherwise from ``model`` itself.
    num_features : int
        Length of each word vector (and of the returned vector).

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features`` holding the mean of the
        vectors of all words found in the vocabulary. When none of the words
        is known (or ``words`` is empty) the zero vector is returned instead
        of dividing by zero and producing NaNs.
    """
    # Read vectors through `.wv` instead of indexing the model object itself.
    vectors = getattr(model, "wv", model)

    # The vocabulary list is called `index_to_key` in gensim >= 4 and
    # `index2word` in earlier releases; accept whichever is present.
    vocab_list = getattr(vectors, "index_to_key", None)
    if vocab_list is None:
        vocab_list = vectors.index2word
    known_words = set(vocab_list)

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in known_words:
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec, vectors[word])

    # Nothing to average: keep the zero vector rather than emitting NaNs.
    if nwords == 0:
        return feature_vec

    return np.divide(feature_vec, nwords)

def getAvgFeatureVecs(title, model, num_features):
    """Compute one averaged word vector per title.

    Parameters
    ----------
    title : sequence of list of str
        Token lists, one per title.
    model : object
        Word-vector model handed through to ``makeFeatureVec``.
    num_features : int
        Width of each word vector.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(title), num_features)``; row ``i`` is
        ``makeFeatureVec(title[i], model, num_features)``.
    """
    averaged = np.zeros((len(title), num_features), dtype="float32")
    for row, tokens in enumerate(title):
        averaged[row] = makeFeatureVec(tokens, model, num_features)
    return averaged

In [7]:
def crossValidate(document_term_matrix,labels,classifier,nfold=2):
    """Run stratified k-fold cross-validation and return averaged scores.

    Parameters
    ----------
    document_term_matrix : array-like
        Feature matrix, indexable by an array of row positions.
    labels : array-like
        Class label of every row; converted to a NumPy array internally.
    classifier : str
        ``"RF"`` (RandomForestClassifier), ``"NB"`` (MultinomialNB) or
        ``"SVM"`` (LinearSVC).
    nfold : int
        Number of stratified folds.

    Returns
    -------
    tuple of float
        Mean weighted precision, recall and F-score across the folds.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    if classifier == "RF":
        clf = RandomForestClassifier()
    elif classifier == "NB":
        clf = MultinomialNB()
    elif classifier == "SVM":
        clf = LinearSVC()
    else:
        # Fail early with a clear message instead of crashing later on
        # `None.fit(...)`.
        raise ValueError(
            "Unknown classifier %r; expected one of 'RF', 'NB' or 'SVM'" % (classifier,)
        )

    # The fold indices are row positions. Indexing a pandas Series with them
    # would go by index label, so work on a plain array.
    labels = np.asarray(labels)

    precision = []
    recall = []
    fscore = []

    skf = StratifiedKFold(n_splits=nfold)
    for train_index, test_index in skf.split(document_term_matrix, labels):
        X_train, X_test = document_term_matrix[train_index], document_term_matrix[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = clf.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        p, r, f, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precision.append(p)
        recall.append(r)
        fscore.append(f)

    return np.mean(precision),np.mean(recall),np.mean(fscore)

In [8]:
!pip install nltk




In [9]:
from nltk.corpus import stopwords

In [10]:
# Fetch the NLTK stop-word corpus that preProcessing() reads through
# stopwords.words('english').
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# Load the raw titles and conference labels, strip stop words, then build the
# TF-IDF document-term matrix (nfeatures=None is forwarded to TfidfVectorizer
# as max_features).
titles, labels = loadData()
processed_titles, processed_titles_wordlist = preProcessing(titles)
dtm,vect = getDTMByTFIDF(processed_titles,None)

In [12]:
# Derive three alternative feature sets from the same TF-IDF matrix.
# NOTE(review): the chi-squared and decision-tree selectors are fitted on all
# rows and labels here, before any cross-validation split, so the held-out
# folds influence which features are kept. The reported scores are likely
# optimistic.
chisqDtm, chisqModel = featuresByChiSq(dtm,labels,2000)
igDtm = featuresByInformationGain(dtm,labels)
lsaDtm = featuresByLSA(dtm,100)

In [13]:
# Word2Vec hyper-parameters, trained on the tokenized titles.
num_features = 300    # Word vector dimensionality
min_word_count = 1    # Minimum word count
num_workers = 1       # Number of threads to run in parallel
context = 8           # Context window size
downsampling = 1e-5   # Downsample setting for frequent words

# NOTE(review): `size=` and `init_sims(...)` look like the pre-4.0 gensim
# spellings (later releases use `vector_size`). Confirm against the installed
# gensim version before re-running.
word2vec_model = Word2Vec(processed_titles_wordlist, workers=num_workers, 
            size=num_features, min_count = min_word_count, 
            window = context, sample = downsampling)
word2vec_model.init_sims(replace=True)

In [14]:
# One averaged Word2Vec vector (num_features wide) per pre-processed title.
wordVecs = getAvgFeatureVecs(processed_titles_wordlist, word2vec_model, num_features)

  


In [15]:
# Combine features from chiSq and word2Vec: column-wise concatenation, so each
# row holds the chi-squared-selected TF-IDF columns followed by the title's
# averaged word vector.
combinedFeatures = np.hstack([chisqDtm,wordVecs])

In [16]:
# 10-fold stratified CV of the "SVM" classifier on the chi-squared features.
precision, recall, fscore = crossValidate(chisqDtm,labels,"SVM",10)
print("ChiSq Features: for SVM",precision, recall, fscore)

ChiSq Features: 0.8089088432134339 0.7997641434262948 0.793880149041286


In [17]:
# 10-fold stratified CV of the "SVM" classifier on the combined
# chi-squared + Word2Vec features. The label names the combined set so this
# result is not confused with the chi-squared-only run.
precision, recall, fscore = crossValidate(combinedFeatures,labels,"SVM",10)
print ("Combined ChiSq + Word2Vec Features for SVM:",precision, recall, fscore)

ChiSq Features: 0.7965052170945573 0.787800796812749 0.7825461095083039


In [18]:
# 10-fold stratified CV of the "SVM" classifier on the decision-tree-selected
# (entropy criterion) features.
precision, recall, fscore = crossValidate(igDtm,labels,"SVM",10)
print ("Features By InformationGain for SVM:",precision, recall, fscore)

featuresByInformationGain: 0.7365104399728638 0.7347776892430279 0.727798035725409


In [19]:
# 10-fold stratified CV of the "SVM" classifier on the LSA features.
precision, recall, fscore = crossValidate(lsaDtm,labels,"SVM",10)
print ("Features By LSA for SVM",precision, recall, fscore)

Features By LSA 0.7367948984107847 0.7371314741035857 0.7338410722101127


In [20]:
# 10-fold stratified CV of the "RF" (random forest) classifier on the LSA
# features.
precision, recall, fscore = crossValidate(lsaDtm,labels,"RF",10)
print ("Features By LSA for Random Forest",precision, recall, fscore)

Features By LSA for Random Forest 0.7006823664830917 0.6996302788844622 0.6922297375990426


In [21]:
# 10-fold stratified CV of the "RF" (random forest) classifier on the
# decision-tree-selected (entropy criterion) features.
precision, recall, fscore = crossValidate(igDtm,labels,"RF",10)
print ("Features By InformationGain for Random Forest:",precision, recall, fscore)

Features By InformationGain for Random Forest: 0.7304839945249499 0.7176 0.71848275016834


In [22]:
# 10-fold stratified CV of the "RF" (random forest) classifier on the
# chi-squared features.
precision, recall, fscore = crossValidate(chisqDtm,labels,"RF",10)
print("ChiSq Features: for  Random Forest",precision, recall, fscore)

ChiSq Features: for  Random Forest 0.7344008003856545 0.7008509960159363 0.6883783967140573


In [23]:
# 10-fold stratified CV of the "NB" (multinomial naive Bayes) classifier on
# the decision-tree-selected (entropy criterion) features. The label
# previously read "Nave Bayes" (a dropped character).
precision, recall, fscore = crossValidate(igDtm,labels,"NB",10)
print ("Features By InformationGain for Naive Bayes:",precision, recall, fscore)

Features By InformationGain for Nave Bayes: 0.7128171358913977 0.6234645418326694 0.5898090189345938


In [24]:
# 10-fold stratified CV of the "NB" (multinomial naive Bayes) classifier on
# the chi-squared features. The label previously read "Nave Bayes" (a dropped
# character).
precision, recall, fscore = crossValidate(chisqDtm,labels,"NB",10)
print("ChiSq Features: for  Naive Bayes",precision, recall, fscore)

ChiSq Features: for  Nave Bayes 0.7707542103728363 0.7024302788844621 0.6783470032212775
