In [2]:
import pandas as pd
import numpy as np
import scipy
import math
from sklearn import tree
import pickle
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import collections
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.svm import SVC

In [3]:
# Labelled training rows and the unlabelled rows to predict for.
bookrating_train = pd.read_csv(filepath_or_buffer='book_rating_train.csv')
bookrating_test = pd.read_csv(filepath_or_buffer='book_rating_test.csv')

Two models: the first will be a classifier model using a decision tree (DT);
            the second will be a cluster model using kNN.

In [4]:
# Load the fitted vectorizer for the book names.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("train_name_countvectorizer.pkl", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_dict = vocab.vocabulary_

# Pre-computed sparse feature matrices, one train/test pair per text field.
train_name_vec = scipy.sparse.load_npz('train_name_vec.npz')
test_name_vec = scipy.sparse.load_npz('test_name_vec.npz')

train_auth_vec = scipy.sparse.load_npz('train_authors_vec.npz')
test_auth_vec = scipy.sparse.load_npz('test_authors_vec.npz')

train_desc_vec = scipy.sparse.load_npz('train_desc_vec.npz')
test_desc_vec = scipy.sparse.load_npz('test_desc_vec.npz')

In [4]:
# Hold out 20% of the author-vector rows for a quick evaluation.
# No random_state is passed, so the split differs on every run.
subtrain_X, subtest_X, subtrain_Y, subtest_Y = train_test_split(
    train_auth_vec,
    bookrating_train['rating_label'],
    test_size=0.2,
)

In [None]:
# Baseline decision tree with default hyper-parameters.
DTC = tree.DecisionTreeClassifier()
mtree = DTC.fit(X=subtrain_X, y=subtrain_Y)
#print(classification_report(subtest_Y, mtree.predict(subtest_X), labels=[3.0,4.0,5.0]))
# NOTE(review): the tree above is fitted on the author-vector split, while the
# cross-validation below scores the name vectors — confirm this is intended.
print(
    cross_val_score(
        DTC,
        train_name_vec,
        bookrating_train['rating_label'],
        cv=5,
        scoring='f1_macro',
    ).mean()
)

In [5]:
def dtree_tuning(vector, dataset):
    """Greedily tune a decision tree's depth and leaf-node limit.

    The depth is chosen first by mean 5-fold macro-F1; the leaf-node limit is
    then chosen with that depth held fixed.

    Parameters:
        vector: feature matrix passed to ``cross_val_score`` as X.
        dataset: object whose ``'rating_label'`` entry holds the targets.

    Returns:
        dict with keys ``'depth'`` and ``'max_leaf_node'`` holding the
        best-scoring candidate for each setting.
    """
    n_folds = 5  # cross-validation folds averaged for every candidate
    labels = dataset['rating_label']

    def mean_f1(**tree_kwargs):
        # Mean macro-F1 over the folds for one hyper-parameter combination.
        classifier = tree.DecisionTreeClassifier(**tree_kwargs)
        fold_scores = cross_val_score(classifier, vector, labels, cv=n_folds, scoring='f1_macro')
        return fold_scores.mean()

    depth_scores = {
        candidate: mean_f1(max_depth=candidate)
        for candidate in [None, 2, 3, 4, 5]
    }
    best_depth = max(depth_scores, key=depth_scores.get)

    leaf_scores = {
        candidate: mean_f1(max_depth=best_depth, max_leaf_nodes=candidate)
        for candidate in [None, 5, 6, 7, 8, 9, 10]
    }
    best_leaf_nodes = max(leaf_scores, key=leaf_scores.get)

    return {'depth': best_depth, 'max_leaf_node': best_leaf_nodes}

In [18]:
# Tune one decision tree per text field (name, authors, description).
name_params, auth_params, desc_params = (
    dtree_tuning(feature_matrix, bookrating_train)
    for feature_matrix in (train_name_vec, train_auth_vec, train_desc_vec)
)

print('name params =', name_params)
print('auth_params =', auth_params)
print('desc params =', desc_params)

name params = {'depth': None, 'max_leaf_node': None}
auth_params = {'depth': None, 'max_leaf_node': None}
desc params = {'depth': None, 'max_leaf_node': None}


In [6]:
def majority_voting(list1, list2, list3):
    """Combine three aligned prediction sequences by majority vote.

    For every position the value predicted by at least two of the inputs
    wins. When all three disagree, the value from ``list3`` is used, so the
    most trusted predictions should be passed last.

    Parameters:
        list1, list2, list3: indexable sequences; positions
            ``0 .. len(list1) - 1`` are read from each of them.

    Returns:
        list with one winning value per position of ``list1``.
    """
    predictions = []

    for idx in range(len(list1)):
        ballot = (list1[idx], list2[idx], list3[idx])
        tally = collections.Counter(ballot)

        if len(tally) == 3:
            # Three distinct votes: no majority, fall back to list3.
            winner = list3[idx]
        else:
            winner = tally.most_common(1)[0][0]

        predictions.append(winner)

    return predictions

In [37]:
# Split every feature matrix with the same seed so the three splits use the
# same seed and labels.
train_name_X, test_name_X, train_name_Y, test_name_Y = train_test_split(
    train_name_vec,
    bookrating_train['rating_label'],
    test_size=0.2,
    random_state=3,
)
train_auth_X, test_auth_X, train_auth_Y, test_auth_Y = train_test_split(
    train_auth_vec,
    bookrating_train['rating_label'],
    test_size=0.2,
    random_state=3,
)
train_desc_X, test_desc_X, train_desc_Y, test_desc_Y = train_test_split(
    train_desc_vec,
    bookrating_train['rating_label'],
    test_size=0.2,
    random_state=3,
)


In [38]:
# One decision tree per text field; each is fitted and evaluated on its own
# split of the training data.
DTC = tree.DecisionTreeClassifier(max_depth=3, max_leaf_nodes=7)
name_model = DTC.fit(train_name_X, train_name_Y)
name_preds = name_model.predict(test_name_X)

DTC = tree.DecisionTreeClassifier(max_depth=5, max_leaf_nodes=None)
auth_model = DTC.fit(train_auth_X, train_auth_Y)
auth_preds = auth_model.predict(test_auth_X)

DTC = tree.DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5)
# Fit the description tree with the description split's own labels
# (previously train_auth_Y was passed here).
desc_model = DTC.fit(train_desc_X, train_desc_Y)
desc_preds = desc_model.predict(test_desc_X)

# majority_voting falls back to its third argument on a three-way tie, so the
# author predictions are passed last.
preds = majority_voting(name_preds, desc_preds, auth_preds)

# accuracy_score expects (y_true, y_pred).
print('voted accuracy = ', accuracy_score(test_name_Y, preds))
print('name acc = ', accuracy_score(test_name_Y, name_preds))
print('auth acc =', accuracy_score(test_auth_Y, auth_preds))
print('desc acc = ', accuracy_score(test_desc_Y, desc_preds))

voted accuracy =  0.7066984608714503
name acc =  0.7090830262302189
auth acc = 0.7095165835681769
desc acc =  0.7064816822024713


In [39]:
# Compare the label distribution of the voted predictions (first line) with
# the distribution of the held-out true labels (second line).
# Disabled debugging code kept for reference:
#majority_voting(name_preds, auth_preds, desc_preds)
# for i in range(len(preds)):
#     if preds[i] != 4.0:
#         print(preds[i],',', test_name_Y[i])
print(collections.Counter(preds), collections.Counter(test_name_Y), sep='\n')

Counter({4.0: 4609, 5.0: 3, 3.0: 1})
Counter({4.0: 3256, 3.0: 1165, 5.0: 192})


In [15]:
# Fresh unseeded 80/20 split of the author vectors for the SVM experiment.
subtrain_X, subtest_X, subtrain_Y, subtest_Y = train_test_split(
    train_auth_vec,
    bookrating_train['rating_label'],
    test_size=0.2,
)
# Support-vector classifier with scikit-learn's default settings.
SVM = SVC()

In [16]:
# Train the SVM on the 80% training portion of the author vectors.
model = SVM.fit(X=subtrain_X, y=subtrain_Y)

In [17]:
# Score the fitted SVM on the held-out 20%: per-class report, then accuracy.
predis = model.predict(X=subtest_X)
print(classification_report(y_true=subtest_Y, y_pred=predis))
print(accuracy_score(y_true=subtest_Y, y_pred=predis))

              precision    recall  f1-score   support

         3.0       0.59      0.11      0.18      1203
         4.0       0.71      0.97      0.82      3176
         5.0       0.67      0.08      0.14       234

    accuracy                           0.70      4613
   macro avg       0.65      0.38      0.38      4613
weighted avg       0.67      0.70      0.62      4613

0.6995447647951442


In [16]:
# Mean 5-fold accuracy of a degree-4 polynomial SVM on the author vectors.
SVM = SVC(kernel='poly', degree=4, C=1.3)
print(
    cross_val_score(
        SVM,
        train_auth_vec,
        bookrating_train['rating_label'],
        cv=5,
        scoring='accuracy',
    ).mean()
)

0.7215886830630055


In [23]:
def svm_tuning(vector, dataset):
    """Greedily tune an SVC one hyper-parameter at a time.

    Each stage keeps the winners of the previous stages fixed and picks the
    candidate with the highest mean 5-fold macro-F1:
    C, then kernel, then (only for the ``'poly'`` kernel) degree, then
    class weighting.

    Parameters:
        vector: feature matrix passed to ``cross_val_score`` as X.
        dataset: object whose ``'rating_label'`` entry holds the targets.

    Returns:
        dict with keys ``'C'``, ``'kernel'``, ``'weight'`` and, only when the
        best kernel is ``'poly'``, ``'degree'``.
    """
    fold = 5  # cross-validation folds averaged for every candidate
    labels = dataset['rating_label']

    def _mean_f1(**svc_kwargs):
        # Mean macro-F1 over the folds for one hyper-parameter combination.
        return cross_val_score(
            SVC(**svc_kwargs), vector, labels, cv=fold, scoring='f1_macro'
        ).mean()

    params = {}

    # Stage 1: regularisation strength.
    c_vals = [0.8, 0.9, 1.0, 1.2, 1.4, 1.6]
    c_results = {c: _mean_f1(C=c) for c in c_vals}
    params['C'] = max(c_results, key=c_results.get)

    # Stage 2: kernel. The loop variable no longer shadows the candidate list.
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    kernel_results = {
        candidate: _mean_f1(C=params['C'], kernel=candidate)
        for candidate in kernels
    }
    params['kernel'] = max(kernel_results, key=kernel_results.get)

    # Stage 3: the polynomial degree is tuned only for the 'poly' kernel.
    if params['kernel'] == 'poly':
        degrees = [1, 2, 3, 4, 5, 6]
        degree_results = {
            candidate: _mean_f1(C=params['C'], kernel=params['kernel'], degree=candidate)
            for candidate in degrees
        }
        params['degree'] = max(degree_results, key=degree_results.get)

    # Stage 4: class weighting. The degree is loop-invariant, so it is resolved
    # once; 3 is used when no degree was tuned.
    degree = params.get('degree', 3)
    weights = [None, 'balanced']
    weight_results = {
        candidate: _mean_f1(
            C=params['C'],
            kernel=params['kernel'],
            degree=degree,
            class_weight=candidate,
        )
        for candidate in weights
    }
    params['weight'] = max(weight_results, key=weight_results.get)

    return params

In [None]:
# SVM tuning run for the three text fields, currently disabled.
# NOTE(review): presumably commented out because of its runtime — confirm.
# Re-enabling it would overwrite the name_params / auth_params / desc_params
# values produced by the decision-tree tuning cell.
# name_params = svm_tuning(train_name_vec, bookrating_train)
# auth_params = svm_tuning(train_auth_vec, bookrating_train)
# desc_params = svm_tuning(train_desc_vec, bookrating_train)

# print('name params =', name_params)
# print('auth_params =', auth_params)
# print('desc params =', desc_params)

In [22]:
# Final SVM: train on every labelled row of the author vectors, then predict
# the unlabelled test rows.
SVM = SVC(C=1.6, kernel='poly', degree=4, class_weight='balanced')
my_model = SVM.fit(train_auth_vec, bookrating_train['rating_label'])
predictions = my_model.predict(test_auth_vec)
headers = ['id', 'rating_label']
# 1-based ids derived from the number of predictions instead of the
# hard-coded range(1, 5767), so the id column always matches the predictions.
# NOTE(review): assumes submission ids are 1..N in test-file row order — confirm.
ids = list(range(1, len(predictions) + 1))

In [23]:
# Two-column submission table: row id and predicted rating label.
my_dict = dict(id=ids, rating_label=predictions)
dframe = pd.DataFrame(data=my_dict)

In [32]:
# Write the SVM submission file without the DataFrame index column.
dframe.to_csv(path_or_buf='predictions.csv', index=False)

In [35]:
# Round-trip the submission file, casting the label column to strings before
# writing it back to the same path.
df = pd.read_csv("predictions.csv").astype({"rating_label": str})
df.to_csv("predictions.csv", index=False)

In [9]:
# Final decision tree: train on every labelled row of the author vectors,
# then predict the unlabelled test rows.
DTC = tree.DecisionTreeClassifier(max_depth=5, max_leaf_nodes=None)
dtModel = DTC.fit(train_auth_vec, bookrating_train['rating_label'])
preds = dtModel.predict(test_auth_vec)
# 1-based ids derived from the number of predictions instead of the
# hard-coded range(1, 5767), so the id column always matches the predictions.
# NOTE(review): assumes submission ids are 1..N in test-file row order — confirm.
ids = list(range(1, len(preds) + 1))

In [10]:
# Assemble the decision-tree submission table and write it without the index.
my_dict = dict(id=ids, rating_label=preds)
df = pd.DataFrame(data=my_dict)
df.to_csv(path_or_buf='DT_predictions.csv', index=False)