In [1]:
import os
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, numpy, string

In [2]:
path = "D:/Program Files/Jupyter/NLP/NLP_raw data"
folders = os.listdir(path)
labels, texts = [], []
for folder in folders:
    curr_path = path +"/"+folder
    files = os.listdir(curr_path)
    for file in files:
        f = open(curr_path+"/"+file)
        iter_f = iter(f)
        indic = 0
        str=""
        for line in iter_f:
            line = line.strip()
            if line:
                line = line.lower()
                str = str+line+" "
                indic = indic+1
                # store every five lines as one document
                if indic == 5:
                    labels.append(folder)
                    texts.append(str)
                    str=""
                    indic=0

In [3]:
#create a dataframe using texts and labels
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label']=labels

In [4]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2, shuffle=True)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [5]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [6]:
# Naive Bayes Classifier
nb_classifier = naive_bayes.MultinomialNB()
nb_classifier.fit(xtrain_tfidf, train_y)
nb_predictions = nb_classifier.predict(xvalid_tfidf)
nb_accuracy = metrics.accuracy_score(nb_predictions, valid_y)
print("Accuracy of NB for word level tfidf: ", nb_accuracy)

Accuracy of NB for word level tfidf:  0.810246394231


In [13]:
# Logistic Regression Classifier
logic_classifier = linear_model.LogisticRegression()
logic_classifier.fit(xtrain_tfidf, train_y)
logic_predictions = logic_classifier.predict(xvalid_tfidf)
logic_accuracy = metrics.accuracy_score(logic_predictions, valid_y)
print("Accuracy of logistic regression on word level tfidf: ", logic_accuracy)

Accuracy of logistic regression on word level tfidf:  0.840369591346


In [14]:
# Bagging (Random Forest)
rf_classifier = ensemble.RandomForestClassifier()
rf_classifier.fit(xtrain_tfidf, train_y)
rf_predictions = rf_classifier.predict(xvalid_tfidf)
rf_accuracy = metrics.accuracy_score(rf_predictions, valid_y)
print("Accuracy of random forest for word level tfidf: ", rf_accuracy)

Accuracy of random forest for word level tfidf:  0.687349759615


In [9]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(stop_words='english', token_pattern=r'\w{1,}', ngram_range=(1,2), max_features=8000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [10]:
# Naive Bayes Classifier for ngram level tf-idf 
nb_classifier = naive_bayes.MultinomialNB()
nb_classifier.fit(xtrain_tfidf_ngram, train_y)
nb_predictions = nb_classifier.predict(xvalid_tfidf_ngram)
nb_accuracy = metrics.accuracy_score(nb_predictions, valid_y)
print("Accuracy of NB on ngram level tfidf" , nb_accuracy)

Accuracy of NB on ngram level tfidf 0.827674278846


In [11]:
# Logistic Regression Classifier for ngram level tf-idf 
logic_classifier = linear_model.LogisticRegression()
logic_classifier.fit(xtrain_tfidf_ngram, train_y)
logic_predictions = logic_classifier.predict(xvalid_tfidf_ngram)
logic_accuracy = metrics.accuracy_score(logic_predictions, valid_y)
print("Accuracy of logistic regression on ngram level tfidf", logic_accuracy)

Accuracy of logistic regression on ngram level tfidf 0.860652043269


In [12]:
# Bagging (Random Forest)
rf_classifier = ensemble.RandomForestClassifier()
rf_classifier.fit(xtrain_tfidf_ngram, train_y)
rf_predictions = rf_classifier.predict(xvalid_tfidf_ngram)
rf_accuracy = metrics.accuracy_score(rf_predictions, valid_y)
print("Accuracy of random forest on ngram level tfidf", rf_accuracy)

Accuracy of random forest on ngram level tfidf 0.691105769231


In [3]:
"""Implementation of SVM """
# SVM would use lot of computational resources if we don't trim features


In [94]:
own_dictionary=[]
def feature_extraction():
    # a) Read all the books that belong to one author into a single string
    path = "D:/Program Files/Jupyter/NLP/NLP_raw data"
    folders = os.listdir(path)
    labels, texts = [], []
    for folder in folders:
        curr_path = path +"/"+folder
        files = os.listdir(curr_path)
        string=""
        indic =0
        for file in files:
            indic=indic+1
            if indic==3:
                break;
            f = open(curr_path+"/"+file)
            iter_f = iter(f)
            for line in iter_f:
                line = line.strip()
                if line:
                    line = line.lower()
                    string = string+line+" "
        texts.append(string)
        labels.append(folder)
    # b) select the most important features( features exists in at least four categories
# and frequency gap between the largest one and the fourth largest one to be more 
# than 0.02
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vect.fit_transform(texts)
features = tfidf_vect.vocabulary_    
    d={}
    for i in range(10):
        for word in features:
            tfidf_w = tfidf[i,features[word]]
            if tfidf_w !=0:
                if word in d:
                    d[word] = numpy.append(d[word], tfidf_w)
                else:
                    d[word] = numpy.array(tfidf_w)
    for word in d:
        if d[word].size >= 4:
            index = numpy.argsort(d[word])
            if float(d[word][index[d[word].size-1]])-float(d[word][index[d[word].size-4]])>0.02:
                own_dictionary.append(word)
feature_extraction()

In [95]:
print(len(own_dictionary))

324


In [96]:
tfidf_vect_svm = TfidfVectorizer(stop_words='english', vocabulary=own_dictionary)

In [97]:
tfidf_vect_svm.fit(texts)
xtrain_tfidf =  tfidf_vect_svm.transform(train_x)
xvalid_tfidf =  tfidf_vect_svm.transform(valid_x)

In [98]:
#train the svm model based on tfidf o
svm_classifier = svm.SVC()
svm_classifier.fit(xtrain_tfidf, train_y)
svm_predictions = svm_classifier.predict(xvalid_tfidf)
svm_accuracy = metrics.accuracy_score(svm_predictions, valid_y)
print("Accuracy of SVM on word level tfidf: ", svm_accuracy)

Accuracy of SVM on word level tfidf:  0.327824519231


In [68]:
print(xtrain_tfidf.shape)

(53246, 640)
