In [197]:
import pandas as pd
import numpy as np
import spacy
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.pipeline import Pipeline as iPipeline
from nltk.tokenize import word_tokenize
from normalise import normalise
import warnings
import string
warnings.filterwarnings('ignore')

In [7]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [43]:
data = pd.read_csv('train.csv')

In [60]:
data_test = pd.read_csv('test.csv')

In [10]:
data_test.head()

Unnamed: 0,id,dialogue
0,0,Boy! Did you see the way Mama whopped that dep...
1,1,"Gordon, the insurance people are balking on th..."
2,2,Very fancy. Did you design the bottle? <BR> W...
3,3,It makes me so mad. Steven Schwimmer ready to ...
4,4,Something ought to loosen him up ... how comes...


In [11]:
unique_genres = set()
for genres in data.genres:
    genres_f = re.findall(r"\w{3,}", genres)
    for genre in genres_f:
        unique_genres.add(genre)

In [12]:
unique_genres

{'action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'drama',
 'family',
 'fantasy',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci',
 'sport',
 'thriller',
 'war',
 'western'}

In [13]:
data.head()

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"[u'drama', u'romance']"
1,1,1,Are you sure you're okay? You're pale. <BR> I...,[u'drama']
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,[u'comedy']
3,3,3,I could have lost my fucking hands. <BR> That ...,"[u'mystery', u'thriller']"
4,4,4,Stick with me on this Gloria. I need you... <...,"[u'crime', u'thriller']"


In [14]:
for genre in unique_genres:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x else 0)

In [33]:
data.dialogue = data.dialogue.apply(lambda x: re.sub(r'\d+', '', x))
data.dialogue = data.dialogue.apply(lambda x: x.strip())
data.dialogue = data.dialogue.apply(lambda x: x.replace('<BR>', ''))
#data.dialogue = data.dialogue.apply(lambda x: x.replace('-PRON-', ''))
data.dialogue = data.dialogue.apply(lambda x: re.findall(r'\w+', x))

In [94]:
stop_words = stopwords.words('english')
def drop_stop_words(tokenized_sentence: list, stop_words: list):
    return [t for t in tokenized_sentence if t not in stop_words]

def stem(tokenized_sentence: list, stemmer):
    return [stemmer.stem(word) for word in tokenized_sentence]
def lem(tokenized_sentence: list, wordnet_lemmatizer):
    lemmatize_words = np.vectorize(wordnet_lemmatizer.lemmatize)
    lemmatized_text = ' '.join(lemmatize_words(tokenized_sentence))
    return lemmatized_text

In [98]:
wordnet_lemmatizer = WordNetLemmatizer()

In [17]:
data.dialogue = data.dialogue.apply(lambda x: drop_stop_words(x, stop_words))

In [18]:
sp = spacy.load('en_core_web_sm')
stemmer = SnowballStemmer(language='english')

In [19]:
data.dialogue = data.dialogue.apply(lambda x: stem(x, stemmer))

In [20]:
data.dialogue = data.dialogue.apply(lambda x: lem(x, sp))
data_test.dialogue = data_test.dialogue.apply(lambda x: x.replace('-PRON-', ''))

In [147]:
for i in range(10):
    print(data['genres'].iloc[i])
    print(data[unique_genres].iloc[i])

[u'drama', u'romance']
music        0
horror       0
romance      1
history      0
thriller     0
sci          0
war          0
musical      0
crime        0
western      0
sport        0
comedy       0
mystery      0
action       0
family       0
animation    0
fantasy      0
biography    0
drama        1
adventure    0
Name: 0, dtype: int64
[u'drama']
music        0
horror       0
romance      0
history      0
thriller     0
sci          0
war          0
musical      0
crime        0
western      0
sport        0
comedy       0
mystery      0
action       0
family       0
animation    0
fantasy      0
biography    0
drama        1
adventure    0
Name: 1, dtype: int64
[u'comedy']
music        0
horror       0
romance      0
history      0
thriller     0
sci          0
war          0
musical      0
crime        0
western      0
sport        0
comedy       1
mystery      0
action       0
family       0
animation    0
fantasy      0
biography    0
drama        0
adventure    0
Name: 2, d

In [148]:
#data.dialogue = data.dialogue.apply(lambda x: ' '.join(x))

In [105]:
data = pd.read_csv('train.csv')
data.dialogue = data.dialogue.apply(lambda x: re.sub(r'\d+', '', x))
data.dialogue = data.dialogue.apply(lambda x: x.strip())
data.dialogue = data.dialogue.apply(lambda x: x.replace(r'<BR>', ''))
data.dialogue = data.dialogue.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data.dialogue = data.dialogue.apply(lambda x: normalise(word_tokenize(x), verbose=False))
data.dialogue = data.dialogue.apply(lambda x: drop_stop_words(x, stop_words))
data.dialogue = data.dialogue.apply(lambda x: stem(x, stemmer))
data.dialogue = data.dialogue.apply(lambda x: lem(x, wordnet_lemmatizer))

In [119]:
for genre in unique_genres:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x else 0)

In [120]:
def pipe (vectorizer, sampler, classifier):
    return(iPipeline([("vectorizer", vectorizer),
                      #("sampler", sampler),
                            ("classifier", classifier)]))

In [121]:
tv = TfidfVectorizer()
cv = CountVectorizer()
svc = LinearSVC()
lr = LogisticRegression()
sampler = RandomOverSampler()
pipe1 = pipe(tv, sampler, lr)

In [122]:
data_train, data_hold = train_test_split(data, test_size = 0.2)

In [123]:
data.shape, data_train.shape, data_hold.shape

((36991, 24), (29592, 24), (7399, 24))

In [218]:
def pipe (vectorizer, sampler, svd, classifier):
    return(iPipeline([("vectorizer", vectorizer),
                      #("svd", svd),
                      #("sampler", sampler),
                            ("classifier", classifier)]))

In [219]:
%%time
#sampler = RandomOverSampler()
sampler = ADASYN(n_neighbors=20)
cls = LogisticRegression()
tv = TfidfVectorizer(min_df=5)
svd = TruncatedSVD(n_components=400, n_iter=10)
for genre in unique_genres:
    pipe1 = pipe(tv, sampler, svd, cls)
    pipe1.fit(data_train.dialogue, data_train[genre])
    data_hold[f'pred_{genre}'] = pipe1.predict_proba(data_hold['dialogue'])[:, 1]
    print(genre)

music
horror
romance
history
thriller
sci
war
musical
crime
western
sport
comedy
mystery
action
family
animation
fantasy
biography
drama
adventure
Wall time: 48.7 s


In [None]:
thresholds = {}
for t in np.arange(0.15, 1, 0.05):
    max_f = 0
    for genre in unique_genres:
        data_hold[f'round_{genre}'] = data_hold[f'pred_{genre}'].apply(lambda x: 1 if x > t else 0)
    data_hold = get_f_score(data_hold, unique_genres)
    mean_f = data_hold['f_score'].mean()
    print(f'{t} {mean_f}')
    #print(f'{genre} {f1_score(data_hold[genre], data_hold[col])} {precision_score(data_hold[genre], data_hold[col])}')

0.15 0.5904589208751935
0.2 0.6220238899722614
0.25 0.637066141499173
0.30000000000000004 0.6380091046194574


In [183]:
def get_singe_score(tmp_list):
    tp = tmp_list['tp']
    if tp == 0:
        return 0
    fp = tmp_list['fp']
    fn = tmp_list['fn']
    p = tp / (tp + fp)
    r = tp / (tp + fn)
    return 2*p*r / (p+r)

In [184]:
def get_f_score(data_df, unique_genres):
    data = data_df.copy()
    data['tp'] = 0
    data['fp'] = 0
    data['fn'] = 0
    for genre in unique_genres:
        data['tp'] = data[[genre, f'round_{genre}', 'tp']].apply(lambda x: x[2] + 1 if (x[0] == x[1]) and (x[1]== 1) else x[2], axis=1)
        data['fp'] = data[[genre, f'round_{genre}', 'fp']].apply(lambda x: x[2] + 1 if (x[0] == 0) and (x[1] == 1) else x[2], axis=1)
        data['fn'] = data[[genre, f'round_{genre}', 'fn']].apply(lambda x: x[2] + 1 if (x[0] == 1) and (x[1] == 0) else x[2], axis=1)
    data['f_score'] = data[['tp', 'fp', 'fn']].apply(lambda x: get_singe_score(x), axis=1)
    return data

In [None]:
thresholds

In [None]:
#pred_genres = ['thriller', 'drama', 'crime', 'fantasy', 'romance', 'action', 'adventure', 'comedy', 'sci']

In [106]:
%%time
data_test = pd.read_csv('test.csv')
data_test.dialogue = data_test.dialogue.apply(lambda x: re.sub(r'\d+', '', x))
data_test.dialogue = data_test.dialogue.apply(lambda x: x.strip())
data_test.dialogue = data_test.dialogue.apply(lambda x: x.replace(r'<BR>', ''))
data_test.dialogue = data_test.dialogue.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data_test.dialogue = data_test.dialogue.apply(lambda x: normalise(word_tokenize(x), verbose=False))
data_test.dialogue = data_test.dialogue.apply(lambda x: drop_stop_words(x, stop_words))
data_test.dialogue = data_test.dialogue.apply(lambda x: stem(x, stemmer))
data_test.dialogue = data_test.dialogue.apply(lambda x: lem(x, wordnet_lemmatizer))

Wall time: 7min 23s


In [107]:
data_test

Unnamed: 0,id,dialogue
0,0,boy did see way mama whip deputi shoe knock gu...
1,1,gordon insur peopl balk log truck tell spinele...
2,2,veri fanci did design bottl who el i m one run...
3,3,it make mad steven schwimmer readi strike the ...
4,4,someth ought loosen come go el paso rudi i jus...
...,...,...
9398,9398,in moment he go tell could someon it juliet in...
9399,9399,better you like yes fine good
9400,9400,hello hi just want talk right listen
9401,9401,dont ever call stupid come didnt mean anyth


In [39]:
data.dialogue

0                            [i, think, meet, br, i, with]
1        [be, sure, okay, PRON, pale, br, i, feel, like...
2        [go, get, br, mom, look, say, anyth, first, i,...
3        [i, could, lost, fuck, hand, br, that, would, ...
4        [stick, gloria, i, need, br, and, tooth, suzi,...
                               ...                        
36986    [there, man, downstair, PRON, bring, PRON, egg...
36987    [hi, br, i, prefer, speak, br, i, br, know, no...
36988    [i, tri, call, i, run, littl, late, take, long...
36989    [what, crazi, br, i, think, talk, between, br,...
36990    [i, uh, kill, father, dominus, ominus, rememb,...
Name: dialogue, Length: 36991, dtype: object

In [215]:
%%time
#sampler = SMOTE()
sampler = ADASYN(n_neighbors=20)
cls = LogisticRegression()
tv = TfidfVectorizer(min_df=5)
svd = TruncatedSVD(n_components=400, n_iter=10)
for genre in unique_genres:
    pipe1 = pipe(tv, sampler, svd, cls)
    pipe1.fit(data['dialogue'], data[genre])
    data_test[f'pred_{genre}'] = pipe1.predict_proba(data_test['dialogue'])[:, 1]
    data_test[genre] = data_test[f'pred_{genre}'].apply(lambda x: 1 if x > 0.25 else 0)

Wall time: 1min 1s


In [216]:
def get_final_result(predicts, unique_genres):
    ans = ''
    for i in range(len(unique_genres)):
        if predicts[i] == 1:
            ans = ans + ' ' + unique_genres[i]
    if ans == '':
        print('hi')
        return 'drama thriller'
    else:
        return ans[1:]

In [217]:
data_test['genres'] = data_test[list(unique_genres)].apply(lambda x: get_final_result(x.tolist(), list(unique_genres)), axis=1)

hi
hi
hi
hi
hi
hi
hi


In [206]:
data_test[['id', 'genres']].to_csv('sub1_11_1.csv', index=False)

In [None]:
data_test[['dialogue', 'genres']]

In [None]:
import csv

In [None]:
data_test[possible_genres]

In [136]:
data_test.genres

0          romance drama
1                  drama
2                 comedy
3       thriller mystery
4         thriller crime
              ...       
9398               drama
9399      thriller crime
9400    thriller mystery
9401        comedy drama
9402       romance drama
Name: genres, Length: 9403, dtype: object

In [156]:
data_hold[unique_genres].head(10)

Unnamed: 0,music,horror,romance,history,thriller,sci,war,musical,crime,western,sport,comedy,mystery,action,family,animation,fantasy,biography,drama,adventure
33445,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
12312,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
24044,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
10353,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
19528,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
19953,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10582,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
32053,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
25282,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
34444,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [157]:
data_hold[[f'round_{genre}' for genre in unique_genres]].head(10)

Unnamed: 0,round_music,round_horror,round_romance,round_history,round_thriller,round_sci,round_war,round_musical,round_crime,round_western,round_sport,round_comedy,round_mystery,round_action,round_family,round_animation,round_fantasy,round_biography,round_drama,round_adventure
33445,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24044,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10353,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
19528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10582,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
32053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [207]:
data_test.to_pickle('data_test.pkl')

In [208]:
data.to_pickle('data.pkl')

In [213]:
h = pd.read_pickle('data.pkl')

In [214]:
h.head()

Unnamed: 0,id,movie,dialogue,genres,music,horror,romance,history,thriller,sci,...,sport,comedy,mystery,action,family,animation,fantasy,biography,drama,adventure
0,0,0,i thought meet i with,"[u'drama', u'romance']",0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,are sure okay your pale i feel like shit me ri...,[u'drama'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,2,go get m o m look dont say anyth first i m pre...,[u'comedy'],0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3,3,i could lost suck hand that would kept magic c...,"[u'mystery', u'thriller']",0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,4,4,stick gloria i need and teeth arent suzi youll...,"[u'crime', u'thriller']",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
