In [51]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
from scipy.sparse import hstack
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
import itertools
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [5]:
# Load the semicolon-delimited dataset; the path is relative to the working directory.
df = pd.read_csv(
    '../data/dataset.csv',
    sep=';',
)

In [6]:
# Replace every missing value with an empty string, modifying `df` in place.
df.fillna(value='', inplace=True)

In [88]:
# Drop entries whose combined text (text0 + text1) is shorter than 30 characters.
# The lengths are computed column-wise instead of with a per-row Python lambda.
# NOTE(review): assumes both columns hold only strings at this point — confirm.
df = df[(df['text0'] + df['text1']).str.len() >= 30]

In [94]:
# Count the rows for each distinct value of the `meme` column.
df['meme'].value_counts()

Kermit-The-Frog-Drinking-Tea             118
What-If-I-Told-You                       115
Philosoraptor                            114
Scumbag-Steve                            113
The-Most-Interesting-Man-In-The-World    112
Joseph-Ducreux                           112
Bad-Luck-Brian                           111
One-Does-Not-Simply                      110
Good-Guy-Greg                            110
First-World-Problems                     107
Willy-Wonka                              106
Yo-Dawg                                  104
Winter-Is-Coming                         102
Conspiracy-Keanu                         101
Futurama-Fry                              95
Grumpy-Cat                                91
Insanity-Wolf                             88
Success-Kid                               86
X-X-Everywhere                            75
Y-U-No                                    63
Forever-Alone                             43
All-The-Things                            37
Ancient-Al

In [95]:
# Keep only the rows whose `meme` label is one of the six classes listed here.
df = df.loc[df['meme'].isin([
    'Grumpy-Cat',
    'Willy-Wonka',
    'All-The-Things',
    'Ancient-Aliens',
    'Y-U-No',
    'Kermit-The-Frog-Drinking-Tea',
])]

In [None]:
# Hyper-parameter grid as (name, candidate values) pairs; the loop below
# unpacks the candidates in this same order.
params = [
    ('n_neighbors', [2, 3, 5]),
    ('metric', ['minkowski']),
    ('text0_ngram_range', [(2,3), (1, 2)]),
    ('text1_ngram_range', [(2,3), (1, 2)]),
    ('text_ngram_range', [(2,4), (1, 3)])
]

meme_enc = LabelEncoder().fit(df.meme.unique())

y = meme_enc.transform(df.meme)


def _pos_tag_string(text):
    """Return the POS tags nltk assigns to `text`, joined by single spaces."""
    return ' '.join(tag for _, tag in nltk.pos_tag(word_tokenize(text)))


# None of these inputs depend on the grid, so they are built once instead of
# being recomputed on every iteration.
bag_of_tagram_text0 = df.text0.apply(_pos_tag_string)
bag_of_tagram_text1 = df.text1.apply(_pos_tag_string)
full_text = df.text0 + ' ' + df.text1

# Split BEFORE fitting the vectorizers: fitting TF-IDF on all rows would let
# the held-out rows influence the vocabulary and IDF weights. The same
# random_state / test_size as before are used for the row partition.
(tags0_train, tags0_test,
 tags1_train, tags1_test,
 text_train, text_test,
 y_train, y_test) = train_test_split(
    bag_of_tagram_text0, bag_of_tagram_text1, full_text, y,
    random_state=10, test_size=.3)

for n_neighbor, metric, text0_ngram_range, text1_ngram_range, text_ngram_range in itertools.product(
        *(candidates for _, candidates in params)):

    # Vectorizers learn their vocabulary from the training rows only.
    tfidf_text0 = TfidfVectorizer(ngram_range=text0_ngram_range, min_df=.01).fit(tags0_train)
    tfidf_text1 = TfidfVectorizer(ngram_range=text1_ngram_range, min_df=.01).fit(tags1_train)
    # min_df=1 keeps every term, as the previous integer 0 did, and is the
    # smallest integer value current scikit-learn accepts.
    tfidf_text = TfidfVectorizer(ngram_range=text_ngram_range, min_df=1).fit(text_train)

    # Feature matrix = POS n-grams of text0 | POS n-grams of text1 | word n-grams of the joined text.
    X_train = hstack((tfidf_text0.transform(tags0_train),
                      tfidf_text1.transform(tags1_train),
                      tfidf_text.transform(text_train)), format='csr')
    X_test = hstack((tfidf_text0.transform(tags0_test),
                     tfidf_text1.transform(tags1_test),
                     tfidf_text.transform(text_test)), format='csr')

    knn = KNeighborsClassifier(n_neighbors=n_neighbor, metric=metric)

    knn.fit(X_train, y_train)

    # Report the configuration followed by its held-out accuracy.
    print({'n_neighbor': n_neighbor,
           'metric': metric,
           'text0_ngram_range': text0_ngram_range,
           'text1_ngram_range': text1_ngram_range,
           'text_ngram_range': text_ngram_range})
    print("\n")
    print(accuracy_score(y_test, knn.predict(X_test)))

In [137]:
# Integer-encode the meme labels.
meme_enc = LabelEncoder()
meme_enc.fit(df.meme.unique())
y = meme_enc.transform(df.meme)


def _tags_as_text(text):
    """Tokenize `text`, POS-tag it with nltk and return the tags joined by spaces."""
    tagged = nltk.pos_tag(word_tokenize(text))
    return ' '.join(map(lambda par: par[1], tagged))


# POS-tag strings for each of the two text fields.
bag_of_tagram_text0 = df.text0.apply(_tags_as_text)
bag_of_tagram_text1 = df.text1.apply(_tags_as_text)

# text0 and text1 joined by a single space; used for both fitting and transforming.
joined_text = df.apply(lambda row: row['text0'] + ' ' + row['text1'], axis=1)

# POS-tag n-grams of text0.
tfidf_text0 = TfidfVectorizer(ngram_range=(1, 3), min_df=.02)
tfidf_text0.fit(bag_of_tagram_text0)
X0 = tfidf_text0.transform(bag_of_tagram_text0)

# POS-tag n-grams of text1.
tfidf_text1 = TfidfVectorizer(ngram_range=(2, 3), min_df=.02)
tfidf_text1.fit(bag_of_tagram_text1)
X1 = tfidf_text1.transform(bag_of_tagram_text1)

# Character n-grams (3 to 6 characters) of the joined text.
tfidf_text = TfidfVectorizer(ngram_range=(3, 6), analyzer='char')
tfidf_text.fit(joined_text)
X2 = tfidf_text.transform(joined_text)

# All three feature groups side by side.
X_all = hstack((X0, X1, X2))

# Model on the combined features.
knn_all = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn_all.fit(X_all, y)

# Model on the character n-gram features only; the fit call stays last so the
# cell still displays the fitted estimator.
knn_words = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn_words.fit(X2, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [138]:
# Per-class precision / recall / F1 of the combined-feature model.
# NOTE(review): X_all and y are the data knn_all was fitted on, so these are training-set scores.
print(classification_report(y_true=y, y_pred=knn_all.predict(X_all)))

             precision    recall  f1-score   support

          0       0.88      0.97      0.92        37
          1       0.71      0.97      0.82        33
          2       0.86      0.63      0.73        91
          3       0.85      0.97      0.91       118
          4       0.83      0.85      0.84       106
          5       0.75      0.62      0.68        63

avg / total       0.83      0.82      0.82       448



In [139]:
# Per-class precision / recall / F1 of the character n-gram model.
# NOTE(review): X2 and y are the data knn_words was fitted on, so these are training-set scores.
print(classification_report(y_true=y, y_pred=knn_words.predict(X2)))

             precision    recall  f1-score   support

          0       0.78      0.97      0.87        37
          1       0.85      0.88      0.87        33
          2       0.86      0.68      0.76        91
          3       0.88      0.97      0.92       118
          4       0.89      0.88      0.88       106
          5       0.80      0.76      0.78        63

avg / total       0.86      0.85      0.85       448



In [140]:
def qual_meme_all(text0, text1):
    """Predict the meme label for a pair of texts with the combined-feature model.

    `tfidf_text0` and `tfidf_text1` are fitted on space-joined POS-tag strings,
    so each input text is converted to its POS-tag string before being
    vectorized. `tfidf_text` receives the two raw texts joined by a space.

    Parameters:
        text0: first text of the meme (string).
        text1: second text of the meme (string).

    Returns:
        The decoded label that `knn_all` predicts for the pair.
    """
    def pos_tags(text):
        # Same representation the POS vectorizers were fitted on.
        return ' '.join(tag for _, tag in nltk.pos_tag(word_tokenize(text)))

    features = hstack((
        tfidf_text0.transform([pos_tags(text0)]),
        tfidf_text1.transform([pos_tags(text1)]),
        tfidf_text.transform([text0 + ' ' + text1]),
    ))
    return meme_enc.inverse_transform(knn_all.predict(features))[0]

def qual_meme_word(text0, text1):
    """Predict the meme label for a pair of texts using only `tfidf_text` features.

    The two texts are joined by a single space, vectorized with `tfidf_text`,
    classified by `knn_words`, and the predicted id is decoded with `meme_enc`.

    Returns:
        The decoded label for the pair.
    """
    combined = text0 + ' ' + text1
    features = tfidf_text.transform([combined])
    label_ids = knn_words.predict(features)
    return meme_enc.inverse_transform(label_ids)[0]

In [145]:
# Hand-written samples as (expected label, text0, text1) triples.
# NOTE(review): 'X-All-The-Things' differs from the 'All-The-Things' class name
# used when filtering the dataset — confirm which spelling is intended.
textos = [
    ('Grumpy-Cat', 'have a nice day', 'don\'t tell me what to do'),
    ('Grumpy-Cat', 'I pretend I don\'t care but deep down', 'I really still don\'t care'),
    ('X-All-The-Things', 'Lose', 'All the blood'),
    ('X-All-The-Things', 'Friday night', 'Damage all the organs'),
    ('Ancient-Aliens', 'Fall asleep on couch... woke up in bed.', 'aliens'),
    ('Ancient-Aliens', 'We are not saying it was done by aliens', 'But it was done by aliens'),
    ('Y-U-No', '9gag', 'y u no use me anymore'),
    ('Y-U-No', 'people', 'y u no read this in normal voice'),
    ('Kermit-The-Frog-Drinking-Tea', 'So you\'re blatantly pregnant and buying cigarettes', 'but that\'s none of my business'),
    ('Kermit-The-Frog-Drinking-Tea', 'Your phone\'s screen is brighter than your future', 'but that\'s none of my business')
]

# Print each sample followed by "expected->predicted" from the character n-gram model.
for expected, first_text, second_text in textos:
    print(first_text)
    print(second_text)
    print()
    predicted = qual_meme_word(first_text, second_text)
    print(expected + '->' + predicted)
    print("\n-------\n")

have a nice day
don't tell me what to do

Grumpy-Cat->Grumpy-Cat

-------

I pretend I don't care but deep down
I really still don't care

Grumpy-Cat->Kermit-The-Frog-Drinking-Tea

-------

Lose
All the blood

X-All-The-Things->All-The-Things

-------

Friday night
Damage all the organs

X-All-The-Things->All-The-Things

-------

Fall asleep on couch... woke up in bed.
aliens

Ancient-Aliens->Ancient-Aliens

-------

We are not saying it was done by aliens
But it was done by aliens

Ancient-Aliens->Ancient-Aliens

-------

9gag
y u no use me anymore

Y-U-No->Y-U-No

-------

people
y u no read this in normal voice

Y-U-No->Y-U-No

-------

So you're blatantly pregnant and buying cigarettes
but that's none of my business

Kermit-The-Frog-Drinking-Tea->Kermit-The-Frog-Drinking-Tea

-------

Your phone's screen is brighter than your future
but that's none of my business

Kermit-The-Frog-Drinking-Tea->Kermit-The-Frog-Drinking-Tea

-------

