In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline

In [53]:
df_scripts = pd.read_csv('data/scripts.csv')
df_scripts = df_scripts.drop("Unnamed: 2", axis=1)
df_tags = pd.read_csv('data/tags.csv')
df_tags = df_tags.drop("Unnamed: 2", axis=1)
# df_tags.head(n=2)

Unnamed: 0,Id,Tag
0,0,Romance
1,0,Comedy


In [54]:
grouped_tags = df_tags.groupby("Tag", sort='count').size().reset_index(name='count')

In [55]:
num_classes = 9
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag if tag in most_common_tags.Tag.values else None)
df_tags = df_tags.dropna()

In [56]:
counts = df_tags.Tag.value_counts()
firstlast = counts[:5].append(counts[-5:])
# firstlast.reset_index(name="count")

Unnamed: 0,index,count
0,Drama,579
1,Thriller,373
2,Comedy,347
3,Action,290
4,Crime,201
5,Crime,201
6,Romance,192
7,Adventure,166
8,Horror,149
9,Mystery,107


In [57]:
def tags_for_question(question_id):
    return df_tags[df_tags['Id'] == question_id].Tag.values

def add_tags_column(row):
    row['Tags'] = tags_for_question(row['Id'])
    return row

df_scripts = df_scripts.apply(add_tags_column, axis=1)
df_scripts[['Id', 'Body', 'Tags']].head()

Unnamed: 0,Id,Body,Tags
0,0,"['ustache, the hopeful look of youth in his ey...","[Romance, Comedy]"
1,1,"['DAYA restaurant supply truck is curbside, ne...","[Romance, Comedy, Drama]"
2,2,['LGRIM\'S STALL - MORNINGSunlit illuminates t...,"[Romance, Drama]"
3,3,['GICIAN is entertaining guests. Playing cards...,"[Romance, Comedy]"
4,4,['TER SCREENThe game is in progress. As a sick...,"[Romance, Comedy, Adventure]"


In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_scripts.Tags)
Y = multilabel_binarizer.transform(df_scripts.Tags)

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(df_scripts.Body)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [59]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_tfidf_resampled, Y_tfidf_resampled = ros.fit_sample(X_tfidf, Y)

In [69]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf_resampled, Y_tfidf_resampled, test_size=0.2, random_state=42)

In [70]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

def print_score(y_pred, clf):
    print("Classifier: ", clf.__class__.__name__)
    print("Loss: {}".format(hamming_loss(y_pred, y_test_tfidf)))
    print("Score: {}".format(hamming_score(y_pred, y_test_tfidf)))
    print("---")

In [73]:
nb_clf = MultinomialNB()
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, max_iter=10, tol=None, random_state=42)
lr = LogisticRegression(solver='liblinear')
mn = MultinomialNB()

for classifier in [nb_clf, sgd, lr, mn]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(x_train_tfidf, y_train_tfidf)
    y_pred = clf.predict(x_test_tfidf)
    print_score(y_pred, classifier)

Classifier:  MultinomialNB
Loss: 0.10578969774372073
Score: 0.04789272030651341
---
Classifier:  SGDClassifier
Loss: 0.035334184759472115
Score: 0.7423371647509579
---
Classifier:  LogisticRegression
Loss: 0.06939123031077055
Score: 0.3773946360153257
---
Classifier:  MultinomialNB
Loss: 0.10578969774372073
Score: 0.04789272030651341
---


In [72]:
# Credit goes to Karina H from Kagel for much of the code:
# https://www.kaggle.com/roccoli/multi-label-classification-with-sklearn