In [None]:
# 1. Build vectorizers of six datasets.
import os

#--------------------------- Tf-idf weight function --------------------------- 
def build_vectorizer(input_file, output_vectorizer_file):
    """Fit a TF-IDF vectorizer on a corpus and pickle it to disk.

    Args:
        input_file: Despite its name this is the corpus itself, i.e. an
            iterable of raw text documents (the caller in this notebook passes
            the list of training sentences). It is handed unchanged to
            ``TfidfVectorizer.fit``.
        output_vectorizer_file: Path of the pickle file to write. Missing
            parent directories are created first.

    Returns:
        None. The fitted vectorizer is persisted to disk, not returned.
    """
    import os
    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer

    # train vectorizer
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, ngram_range=(1, 2), stop_words='english')
    vectorizer.fit(input_file)

    # Make sure the target directory exists so the dump below cannot fail
    # with FileNotFoundError on a fresh checkout.
    output_dir = os.path.dirname(output_vectorizer_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_vectorizer_file, 'wb') as f:
        pickle.dump(vectorizer, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Report only the size of the corpus: formatting `input_file` itself would
    # print every training document into the notebook output.
    try:
        corpus_summary = "{} documents".format(len(input_file))
    except TypeError:
        # e.g. a generator, which has no len()
        corpus_summary = "the given corpus"
    print("Vectorizer for {} is saved as {}".format(corpus_summary, output_vectorizer_file))

    return None
#-----------------------------------------------------------------------------


#---------------------- Build Tf-idf vectorizers ------------------------------
# Walk the data directory and pickle one vectorizer per "*train.txt" file.
# NOTE: get_tag_and_training_data is defined in a later cell of this notebook,
# so that cell has to be executed before this loop can run.
base_dir  = "./TODO_type_data_directory/"
for root, dirs, files in os.walk(base_dir, topdown=False):
    for name in files:
        # Only the training splits are used to fit a vectorizer.
        if not name.endswith('train.txt'):
            continue

        input_file = os.path.join(root, name)
        print(input_file)
        labels, texts, label_dict = get_tag_and_training_data(input_file)

        # "<name>.txt" -> "./model/<name>.vectorizer"
        output_file = './model/' + name.replace('.txt', '.vectorizer')
        build_vectorizer(texts, output_file)
#-----------------------------------------------------------------------------

In [2]:
# Import libraries and define a utility function for reading the input data.

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import accuracy_score

import pickle
        
# Fixed seed so that repeated runs reproduce the same results: it seeds
# NumPy's global random state here and is also passed as `random_state`
# to the classifier later in the notebook.
RANDOM_SEED = 2212
np.random.seed(RANDOM_SEED)

def get_tag_and_training_data(filename, label_dict=False):
    '''
    Read a labelled text file and return encoded labels, sentences and the label mapping.

    Every non-blank line is split on whitespace: the first token is the label,
    the remaining tokens are re-joined with single spaces to form the sentence.
    Blank (or whitespace-only) lines are skipped.

    Args:
        filename: path of a UTF-8 encoded text file.
        label_dict: falsy (the default) to build a fresh mapping, assigning
            indices 0, 1, 2, ... to labels in order of first appearance.
            Otherwise an existing mapping of the form
            {"label2index": {...}, "index2label": {...}} (e.g. the one returned
            for the training file), which is used as-is and never extended.

    Returns:
        (labels, texts, label_dict) where `labels` is the list of integer label
        indices, `texts` the list of sentences (same order as the file) and
        `label_dict` the mapping that was built or the one passed in. When a
        mapping is built, a dict is always returned, even for an empty file.

    Raises:
        KeyError: a label in the file is missing from the supplied `label_dict`.

    How to use:
    Y, X, label_dict = get_tag_and_training_data(".../train.txt")
    '''
    build_mapping = not label_dict
    if build_mapping:
        label_dict = {"label2index": {}, "index2label": {}}
    label2index = label_dict["label2index"]

    texts, labels = list(), list()

    with open(filename, encoding='utf8') as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: there is no label token to read.
                continue
            label = words[0]

            if build_mapping and label not in label2index:
                index = len(label2index)
                label2index[label] = index
                label_dict["index2label"][index] = label
                # Log every newly discovered label together with the mapping so far.
                print(index, label, label_dict)

            try:
                label_idx = label2index[label]
            except KeyError:
                raise KeyError(
                    "label {!r} in {} is not present in the supplied label_dict".format(label, filename)
                ) from None

            labels.append(label_idx)
            texts.append(' '.join(words[1:]))

    return labels, texts, label_dict

In [8]:
# Training-set sizes (instances per class) to evaluate. The empty string
# selects the un-suffixed "<target>_train.txt", i.e. 100% of the train data.
instance_list = [10, 50, 100, 200, 500, 1000, 2000, ""]

# Root directory holding one "Target_<name>" folder per dataset.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_root = "/home/sung_min/experiment_thesis/public_data"

# True : reuse the vectorizer pickled at ./model/<target>_train.vectorizer
#        (the same naming the vectorizer-building cell writes).
# False: fit a fresh vectorizer on each training subset instead.
loaded_vectorizer = True

target_list = ['agnews', 'dbpedia', 'amazon_six', 'tripadvisor', 'yahoo_a', 'yelp_f']
for target in target_list:

    target_dir = "{}/Target_{}".format(data_root, target)
    train_file_path_list = [
        "{}/{}_train{}.txt".format(target_dir, target, instance)
        for instance in instance_list
    ]
    test_file_path = "{}/{}_test.txt".format(target_dir, target)

    # The pickled vectorizer depends only on the target, so load it once per
    # target instead of once per training-set size.
    pretrained_vectorizer = None
    if loaded_vectorizer:
        # SECURITY: pickle.load can execute arbitrary code; only load
        # vectorizer files that were produced locally by this notebook.
        with open('./model/{}_train.vectorizer'.format(target), 'rb') as f:
            pretrained_vectorizer = pickle.load(f)

    for train_file_path, instance in zip(train_file_path_list, instance_list):

        title = target+'.report'

        # The label mapping is built from the training file and reused for the test file.
        y_train, X_train, label_dict = get_tag_and_training_data(train_file_path, label_dict=False)
        y_test,  X_test,  label_dict = get_tag_and_training_data(test_file_path, label_dict=label_dict)

        if loaded_vectorizer:
            vectorizer = pretrained_vectorizer
        else:
            # train vectorizer (TfidfVectorizer is not imported at notebook level)
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, ngram_range=(1, 2), stop_words='english')
            vectorizer.fit(X_train)

        X_train = vectorizer.transform(X_train)
        X_test = vectorizer.transform(X_test)
        logistic_clf = LogisticRegression(penalty='l2', random_state=RANDOM_SEED)
        logistic_clf.fit(X_train, y_train)
        y_pred = logistic_clf.predict(X_test)

        # Convert index to label
        y_true = [label_dict["index2label"][idx] for idx in y_test]
        y_pred = [label_dict["index2label"][idx] for idx in y_pred]

        accuracy = accuracy_score(y_true, y_pred)

        # Start a fresh description for every run; appending to an undefined or
        # left-over `description` either fails or mixes runs together. The first
        # line follows the format of the previously recorded reports (spelling
        # kept so new reports stay comparable with old ones).
        description = "Quick accurcy : {:f}%".format(accuracy * 100)
        description += "\ntarget : {}, instance per class : {}\n\n".format(target, instance if instance != "" else "total")

        txt1 = '\n\n\nDescription : '+ str(description) + '\n'
        txt2 = "*"*30 + title +"*"*30 + '\n'
        txt3 = str(metrics.classification_report(y_true, y_pred)) + '\n'
        txt4 = "Accuracy : "+ str(accuracy) + '\n'

0 SPORTS {'label2index': {'SPORTS': 0}, 'index2label': {0: 'SPORTS'}}
1 WORLD {'label2index': {'SPORTS': 0, 'WORLD': 1}, 'index2label': {0: 'SPORTS', 1: 'WORLD'}}
2 SCI/TECH {'label2index': {'SCI/TECH': 2, 'SPORTS': 0, 'WORLD': 1}, 'index2label': {0: 'SPORTS', 1: 'WORLD', 2: 'SCI/TECH'}}
3 BUSINESS {'label2index': {'SCI/TECH': 2, 'SPORTS': 0, 'WORLD': 1, 'BUSINESS': 3}, 'index2label': {0: 'SPORTS', 1: 'WORLD', 2: 'SCI/TECH', 3: 'BUSINESS'}}
Save file at  ./report//agnews.report



Description : Quick accurcy : 48.789474%
target : agnews, instance per class : 10


 ******************************agnews.report******************************
              precision    recall  f1-score   support

   BUSINESS       0.44      0.56      0.49      1900
   SCI/TECH       0.51      0.39      0.44      1900
     SPORTS       0.56      0.46      0.50      1900
      WORLD       0.48      0.54      0.51      1900

avg / total       0.50      0.49      0.49      7600

 Accuracy : 0.48789473684210527
*

Save file at  ./report//dbpedia.report



Description : Quick accurcy : 77.560000%
target : dbpedia, instance per class : 10


 ******************************dbpedia.report******************************
                         precision    recall  f1-score   support

                 ALBUM       0.80      0.97      0.88      5000
                ANIMAL       0.68      0.48      0.56      5000
                ARTIST       0.70      0.46      0.56      5000
               ATHLETE       0.83      0.87      0.85      5000
              BUILDING       0.75      0.65      0.70      5000
               COMPANY       0.71      0.62      0.66      5000
EDUCATIONALINSTITUTION       0.81      0.89      0.85      5000
                  FILM       0.87      0.96      0.91      5000
  MEANOFTRANSPORTATION       0.84      0.79      0.81      5000
          NATURALPLACE       0.81      0.82      0.82      5000
          OFFICEHOLDER       0.72      0.81      0.76      5000
                 PLANT     

Save file at  ./report//dbpedia.report



Description : Quick accurcy : 90.997143%
target : dbpedia, instance per class : 100


 ******************************dbpedia.report******************************
                         precision    recall  f1-score   support

                 ALBUM       0.91      0.98      0.95      5000
                ANIMAL       0.92      0.78      0.84      5000
                ARTIST       0.88      0.70      0.78      5000
               ATHLETE       0.95      0.96      0.96      5000
              BUILDING       0.93      0.88      0.90      5000
               COMPANY       0.87      0.86      0.86      5000
EDUCATIONALINSTITUTION       0.90      0.96      0.93      5000
                  FILM       0.93      0.97      0.95      5000
  MEANOFTRANSPORTATION       0.94      0.93      0.93      5000
          NATURALPLACE       0.93      0.98      0.95      5000
          OFFICEHOLDER       0.93      0.92      0.93      5000
                 PLANT    

Save file at  ./report//dbpedia.report



Description : Quick accurcy : 93.950000%
target : dbpedia, instance per class : 500


 ******************************dbpedia.report******************************
                         precision    recall  f1-score   support

                 ALBUM       0.94      0.98      0.96      5000
                ANIMAL       0.96      0.87      0.91      5000
                ARTIST       0.90      0.81      0.85      5000
               ATHLETE       0.97      0.97      0.97      5000
              BUILDING       0.96      0.91      0.94      5000
               COMPANY       0.88      0.91      0.89      5000
EDUCATIONALINSTITUTION       0.93      0.97      0.95      5000
                  FILM       0.94      0.97      0.96      5000
  MEANOFTRANSPORTATION       0.95      0.96      0.96      5000
          NATURALPLACE       0.95      0.98      0.97      5000
          OFFICEHOLDER       0.97      0.94      0.95      5000
                 PLANT    

Save file at  ./report//dbpedia.report



Description : Quick accurcy : 95.778571%
target : dbpedia, instance per class : 2000


 ******************************dbpedia.report******************************
                         precision    recall  f1-score   support

                 ALBUM       0.96      0.98      0.97      5000
                ANIMAL       0.97      0.94      0.96      5000
                ARTIST       0.92      0.88      0.90      5000
               ATHLETE       0.98      0.98      0.98      5000
              BUILDING       0.96      0.94      0.95      5000
               COMPANY       0.92      0.92      0.92      5000
EDUCATIONALINSTITUTION       0.95      0.98      0.96      5000
                  FILM       0.96      0.97      0.97      5000
  MEANOFTRANSPORTATION       0.97      0.97      0.97      5000
          NATURALPLACE       0.96      0.99      0.98      5000
          OFFICEHOLDER       0.97      0.95      0.96      5000
                 PLANT   

Save file at  ./report//amazon_six.report



Description : Quick accurcy : 85.088959%
target : amazon_six, instance per class : 50


 ******************************amazon_six.report******************************
                     precision    recall  f1-score   support

           CAMERAS       0.95      0.88      0.91     18925
           LAPTOPS       0.52      0.85      0.65      1655
       MOBILEPHONE       0.87      0.85      0.86      7291
           TABLETS       0.79      0.72      0.75      4902
               TVS       0.92      0.89      0.90      9422
VIDEO_SURVEILLANCE       0.49      0.74      0.59      2376

       avg / total       0.87      0.85      0.86     44571

 Accuracy : 0.8508895918871014
******************************END******************************


0 LAPTOPS {'label2index': {'LAPTOPS': 0}, 'index2label': {0: 'LAPTOPS'}}
1 TVS {'label2index': {'LAPTOPS': 0, 'TVS': 1}, 'index2label': {0: 'LAPTOPS', 1: 'TVS'}}
2 MOBILEPHONE {'label2index': {'LAPTOPS': 0, 

Save file at  ./report//amazon_six.report



Description : Quick accurcy : 89.683875%
target : amazon_six, instance per class : 2000


 ******************************amazon_six.report******************************
                     precision    recall  f1-score   support

           CAMERAS       0.98      0.91      0.94     18925
           LAPTOPS       0.69      0.88      0.78      1655
       MOBILEPHONE       0.93      0.87      0.90      7291
           TABLETS       0.81      0.85      0.83      4902
               TVS       0.97      0.92      0.94      9422
VIDEO_SURVEILLANCE       0.52      0.87      0.65      2376

       avg / total       0.92      0.90      0.90     44571

 Accuracy : 0.8968387516546633
******************************END******************************


0 CAMERAS {'label2index': {'CAMERAS': 0}, 'index2label': {0: 'CAMERAS'}}
1 TABLETS {'label2index': {'CAMERAS': 0, 'TABLETS': 1}, 'index2label': {0: 'CAMERAS', 1: 'TABLETS'}}
2 VIDEO_SURVEILLANCE {'label2in

Save file at  ./report//tripadvisor.report



Description : Quick accurcy : 58.379415%
target : tripadvisor, instance per class : 1000


 ******************************tripadvisor.report******************************
              precision    recall  f1-score   support

        1.0       0.51      0.82      0.63      2374
        2.0       0.36      0.35      0.35      2515
        3.0       0.40      0.43      0.41      4587
        4.0       0.54      0.46      0.50     11214
        5.0       0.73      0.73      0.73     14754

avg / total       0.59      0.58      0.58     35444

 Accuracy : 0.5837941541586729
******************************END******************************


0 2.0 {'label2index': {'2.0': 0}, 'index2label': {0: '2.0'}}
1 5.0 {'label2index': {'2.0': 0, '5.0': 1}, 'index2label': {0: '2.0', 1: '5.0'}}
2 3.0 {'label2index': {'2.0': 0, '5.0': 1, '3.0': 2}, 'index2label': {0: '2.0', 1: '5.0', 2: '3.0'}}
3 1.0 {'label2index': {'2.0': 0, '5.0': 1, '1.0': 3, '3.0': 2}, 'ind

Save file at  ./report//yahoo_a.report



Description : Quick accurcy : 48.738333%
target : yahoo_a, instance per class : 50


 ******************************yahoo_a.report******************************
                       precision    recall  f1-score   support

    BUSINESS&FINANCE       0.43      0.35      0.38      6000
  COMPUTERS&INTERNET       0.52      0.78      0.62      6000
 EDUCATION&REFERENCE       0.34      0.23      0.28      6000
 ENTERTAINMENT&MUSIC       0.44      0.38      0.41      6000
FAMILY&RELATIONSHIPS       0.44      0.74      0.55      6000
              HEALTH       0.56      0.55      0.55      6000
 POLITICS&GOVERNMENT       0.54      0.45      0.49      6000
 SCIENCE&MATHEMATICS       0.53      0.48      0.50      6000
     SOCIETY&CULTURE       0.43      0.37      0.40      6000
              SPORTS       0.64      0.55      0.59      6000

         avg / total       0.49      0.49      0.48     60000

 Accuracy : 0.48738333333333334
*****************

Save file at  ./report//yahoo_a.report



Description : Quick accurcy : 56.313333%
target : yahoo_a, instance per class : 200


 ******************************yahoo_a.report******************************
                       precision    recall  f1-score   support

    BUSINESS&FINANCE       0.52      0.37      0.43      6000
  COMPUTERS&INTERNET       0.56      0.84      0.67      6000
 EDUCATION&REFERENCE       0.47      0.29      0.36      6000
 ENTERTAINMENT&MUSIC       0.63      0.39      0.49      6000
FAMILY&RELATIONSHIPS       0.49      0.76      0.60      6000
              HEALTH       0.63      0.66      0.65      6000
 POLITICS&GOVERNMENT       0.66      0.56      0.61      6000
 SCIENCE&MATHEMATICS       0.52      0.64      0.58      6000
     SOCIETY&CULTURE       0.48      0.46      0.47      6000
              SPORTS       0.73      0.65      0.69      6000

         avg / total       0.57      0.56      0.55     60000

 Accuracy : 0.5631333333333334
*****************

Save file at  ./report//yahoo_a.report



Description : Quick accurcy : 61.948333%
target : yahoo_a, instance per class : 1000


 ******************************yahoo_a.report******************************
                       precision    recall  f1-score   support

    BUSINESS&FINANCE       0.56      0.42      0.48      6000
  COMPUTERS&INTERNET       0.64      0.84      0.73      6000
 EDUCATION&REFERENCE       0.52      0.39      0.45      6000
 ENTERTAINMENT&MUSIC       0.65      0.50      0.56      6000
FAMILY&RELATIONSHIPS       0.55      0.78      0.65      6000
              HEALTH       0.68      0.72      0.70      6000
 POLITICS&GOVERNMENT       0.68      0.66      0.67      6000
 SCIENCE&MATHEMATICS       0.59      0.70      0.64      6000
     SOCIETY&CULTURE       0.54      0.48      0.51      6000
              SPORTS       0.79      0.71      0.74      6000

         avg / total       0.62      0.62      0.61     60000

 Accuracy : 0.6194833333333334
****************

Save file at  ./report//yelp_f.report



Description : Quick accurcy : 26.552000%
target : yelp_f, instance per class : 10


 ******************************yelp_f.report******************************
              precision    recall  f1-score   support

          1       0.41      0.36      0.39     10000
          2       0.23      0.15      0.18     10000
          3       0.22      0.60      0.33     10000
          4       0.27      0.16      0.20     10000
          5       0.34      0.05      0.09     10000

avg / total       0.29      0.27      0.24     50000

 Accuracy : 0.26552
******************************END******************************


0 5 {'label2index': {'5': 0}, 'index2label': {0: '5'}}
1 3 {'label2index': {'3': 1, '5': 0}, 'index2label': {0: '5', 1: '3'}}
2 2 {'label2index': {'2': 2, '3': 1, '5': 0}, 'index2label': {0: '5', 1: '3', 2: '2'}}
3 4 {'label2index': {'2': 2, '3': 1, '4': 3, '5': 0}, 'index2label': {0: '5', 1: '3', 2: '2', 3: '4'}}
4 1 {'label2index': {'2