In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss

%matplotlib inline

In [26]:
# Read the tweets and their tags. Both CSV files carry a column named
# "Unnamed: 2", which is discarded immediately after loading.
df_scripts = pd.read_csv('data/tweets.csv').drop(columns="Unnamed: 2")
df_tags = pd.read_csv('data/tweets_tags.csv').drop(columns="Unnamed: 2")

# Preview the first five tag rows.
df_tags.head()

Unnamed: 0,Id,Tag
0,0,sadness
1,1,sadness
2,2,happiness
3,3,neutral
4,4,worry


In [27]:
# Number of rows per tag. `sort` is a boolean flag that orders the groups by
# their key (the tag name); it does not sort by frequency, so pass True
# explicitly rather than a truthy string such as 'count'.
grouped_tags = df_tags.groupby("Tag", sort=True).size().reset_index(name='count')
print(grouped_tags)

         Tag  count
0  happiness   9270
1       hate   1433
2       love   3842
3    neutral   8638
4    sadness   5165
5      worry   8459


In [28]:
# Restrict the data to the `num_classes` most frequent tags.
num_classes = 6
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")

# Blank out every tag that is not among the most frequent ones, then drop
# all rows that contain a missing value (including the ones just blanked).
is_common_tag = df_tags.Tag.isin(most_common_tags.Tag.values)
df_tags = df_tags.assign(Tag=df_tags.Tag.where(is_common_tag)).dropna()

In [29]:
# Frequency of each remaining tag.
counts = df_tags.Tag.value_counts()

# First five and last five entries of `counts`. `Series.append` was removed in
# pandas 2.0, so the two slices are joined with `pd.concat`. When `counts` has
# fewer than ten entries the slices overlap and some tags appear twice.
firstlast = pd.concat([counts.iloc[:5], counts.iloc[-5:]])

In [30]:
def tags_for_question(question_id):
    """Return the tags stored in `df_tags` for a single `Id`.

    Parameters
    ----------
    question_id : scalar
        Value compared against the `Id` column of `df_tags`.

    Returns
    -------
    numpy.ndarray
        The `Tag` values of every matching row, in `df_tags` row order;
        empty when no row matches.
    """
    return df_tags.loc[df_tags['Id'] == question_id, 'Tag'].values


def add_tags_column(row):
    """Set `row['Tags']` to the tags of `row['Id']` and return the row.

    The row is modified in place and returned, so the function can be used
    with `DataFrame.apply(..., axis=1)`.
    """
    row['Tags'] = tags_for_question(row['Id'])
    return row


# Build the Id -> tags lookup in one pass over `df_tags`. Calling
# `tags_for_question` once per tweet would rescan the whole tag table for
# every row, which is quadratic in the number of tweets.
_tags_by_id = {tag_id: tags.values for tag_id, tags in df_tags.groupby('Id')['Tag']}
_no_tags = np.array([], dtype=object)  # used for tweets without any tag row

df_scripts = df_scripts.assign(
    Tags=[_tags_by_id.get(tag_id, _no_tags) for tag_id in df_scripts['Id']]
)
df_scripts[['Id', 'Body', 'Tags']].head()

Unnamed: 0,Id,Body,Tags
0,0,layin n bed with a headache ughhhh waitin on y...,[sadness]
1,1,funeral ceremony gloomy friday,[sadness]
2,2,wants to hang out with friends soon,[happiness]
3,3,we want to trade with someone who has houston ...,[neutral]
4,4,re pinging why didnt you go to prom bc my bf d...,[worry]


In [32]:
# Targets: one binary indicator column per tag.
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit_transform(df_scripts.Tags)

# Features: token counts of the tweet bodies re-weighted with TF-IDF.
# `astype('U')` casts every body to a unicode string before tokenising.
# NOTE(review): vocabulary and IDF weights are fitted on the full data set,
# i.e. before the train/test split performed in a later cell.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_counts = count_vect.fit_transform(df_scripts['Body'].astype('U'))
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [33]:
# Balance the tag distribution with random oversampling.
# `fit_sample` is the legacy method name and is no longer available in
# recent imbalanced-learn releases; `fit_resample` is its replacement.
ros = RandomOverSampler(random_state=42)
X_tfidf_resampled, Y_tfidf_resampled = ros.fit_resample(X_tfidf, Y)

In [34]:
# Hold out 20% of the resampled data for evaluation.
# NOTE(review): the split is taken after random oversampling, so copies of
# the same tweet may end up in both the training and the test part, which
# would make the scores optimistic; consider splitting first.
(x_train_tfidf, x_test_tfidf,
 y_train_tfidf, y_test_tfidf) = train_test_split(
    X_tfidf_resampled,
    Y_tfidf_resampled,
    test_size=0.2,
    random_state=42,
)

In [35]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Hamming score (label-based accuracy) for multi-label indicator data.

    Each row contributes |true ∩ pred| / |true ∪ pred|, where the sets hold
    the column indices of the non-zero entries of that row; a row in which
    both sets are empty contributes 1. The result is the mean over all rows.
    `normalize` and `sample_weight` are accepted but not used.

    Adapted from http://stackoverflow.com/q/32239577/395857
    '''
    def _row_score(true_row, pred_row):
        true_labels = set(np.where(true_row)[0])
        pred_labels = set(np.where(pred_row)[0])
        if not true_labels and not pred_labels:
            # Nothing expected and nothing predicted counts as a perfect match.
            return 1
        return len(true_labels & pred_labels) / float(len(true_labels | pred_labels))

    n_samples = y_true.shape[0]
    return np.mean([_row_score(y_true[idx], y_pred[idx]) for idx in range(n_samples)])

def print_score(y_pred, clf, y_true=None):
    """Print the Hamming loss and Hamming score of a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted label indicator matrix.
    clf : object
        Estimator that produced the predictions; only its class name is
        printed.
    y_true : array-like, optional
        Ground-truth label indicator matrix. Defaults to the notebook-level
        `y_test_tfidf` so existing two-argument calls keep working.
    """
    if y_true is None:
        y_true = y_test_tfidf
    print("Classifier: ", clf.__class__.__name__)
    # Ground truth goes first, following the (y_true, y_pred) signatures of
    # `hamming_loss` and `hamming_score`.
    print("Hamming Loss: {}".format(hamming_loss(y_true, y_pred)))
    print("Hamming Score: {}".format(hamming_score(y_true, y_pred)))
    print("---")

In [42]:
# Base estimators to compare. Each one is wrapped in a one-vs-rest
# meta-classifier so it can be trained on the label indicator matrix.
nb_clf = MultinomialNB()
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, max_iter=50, tol=None, random_state=42)
lr = LogisticRegression(solver='liblinear')

for classifier in [nb_clf, sgd, lr]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(x_train_tfidf, y_train_tfidf)
    # Predictions must be recomputed for every classifier; without this
    # assignment `y_pred` is undefined on a fresh kernel.
    y_pred = clf.predict(x_test_tfidf)
    print_score(y_pred, classifier)

ValueError: bad input shape (44496, 6)

In [38]:
# Load one text file as a list of lines with the line endings removed.
# NOTE(review): `lines` is not used by any other cell visible here, and the
# file is opened with the platform default encoding — confirm both are intended.
with open('data/parsed2_imsdb/Comedy/hangoverthe.txt') as script_file:
    lines = script_file.read().splitlines()

In [40]:
# Display the training-split label indicator matrix for inspection.
y_train_tfidf

array([[0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1]])