In [74]:
from csv import DictReader
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier


def read_data(name):
    text, targets = [], []

    with open('./../data/{}.csv'.format(name)) as f:
        for item in DictReader(f):
            text.append(item['text'].decode('utf8'))
            targets.append(item['category'])

    return text, targets


In [2]:
text_train, targets_train = read_data('train')

In [4]:
len(text_train), len(targets_train)

(14048, 14048)

In [5]:
text_test, targets_test = read_data('test')

In [6]:
len(text_test), len(targets_test)

(3599, 3599)

In [15]:
vect=TfidfVectorizer()
X = vect.fit_transform(text_train)

In [18]:
X.toarray().shape

(14048, 10696)

In [19]:
clf = LogisticRegression()
clf.fit(X, targets_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
vect1=TfidfVectorizer()
y = vect1.fit_transform(text_test)

In [21]:
y.toarray().shape

(3599, 5100)

In [22]:
preds = clf.predict(text_test)



ValueError: X has 3599 features per sample; expecting 10696

In [42]:
#train
import h5py
with h5py.File('./../pickles/final_training_data.h5','r') as hf:
    print('List of arrays in this file: \n', hf.keys())
    data = hf.get('dataset_1')
    train = np.array(data)
    print('Shape of the array dataset_1: \n', train.shape)

with h5py.File('./../pickles/final_test_data.h5','r') as hf:
    print('List of arrays in this file: \n', hf.keys())
    data = hf.get('dataset_1')
    test = np.array(data)
    print('Shape of the array dataset_1: \n', test.shape)

('List of arrays in this file: \n', [u'dataset_1'])
('Shape of the array dataset_1: \n', (14048, 50))
('List of arrays in this file: \n', [u'dataset_1'])
('Shape of the array dataset_1: \n', (3599, 50))


In [43]:
ytrain = pd.read_csv('./../pickles/training_targets.csv')
ytest = pd.read_csv('./../pickles/test_targets.csv')

In [None]:
model = make_pipeline(
        TfidfVectorizer(stop_words='english', ngram_range=(1,1)),
    TruncatedSVD(n_components=5000),
    LogisticRegression(class_weight='balanced'),
    ).fit(text_train, targets_train)

In [80]:
prediction = model.predict(text_test)

In [81]:
f1_score(targets_test, prediction, average='macro')

0.18388032350408906

In [68]:
print(classification_report(targets_test, prediction))

             precision    recall  f1-score   support

    animals       0.68      0.73      0.70        26
      faith       0.22      0.36      0.28        11
     family       0.60      0.84      0.70       110
    fashion       0.35      0.52      0.42        42
       food       0.47      0.70      0.56        63
      lgbtq       0.74      0.68      0.71       147
     meetup       0.77      0.78      0.78       737
   military       0.56      0.64      0.60        14
       misc       0.46      0.45      0.45       683
   personal       0.59      0.51      0.55       910
pop_culture       0.45      0.47      0.46       150
        qna       0.29      0.25      0.27       221
relationships       0.54      0.52      0.53       337
     school       0.50      0.64      0.56        74
     sports       0.22      0.60      0.32        10
     tatoos       0.72      0.87      0.79        15
       work       0.40      0.71      0.51        49

avg / total       0.57      0.57      0.56