In [91]:
import pandas as pd
import numpy as np
import spacy
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as iPipeline
from nltk.tokenize import word_tokenize
from normalise import normalise
import warnings
import string
warnings.filterwarnings('ignore')

In [7]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [43]:
data = pd.read_csv('train.csv')

In [60]:
data_test = pd.read_csv('test.csv')

In [10]:
data_test.head()

Unnamed: 0,id,dialogue
0,0,Boy! Did you see the way Mama whopped that dep...
1,1,"Gordon, the insurance people are balking on th..."
2,2,Very fancy. Did you design the bottle? <BR> W...
3,3,It makes me so mad. Steven Schwimmer ready to ...
4,4,Something ought to loosen him up ... how comes...


In [11]:
unique_genres = set()
for genres in data.genres:
    genres_f = re.findall(r"\w{3,}", genres)
    for genre in genres_f:
        unique_genres.add(genre)

In [12]:
unique_genres

{'action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'drama',
 'family',
 'fantasy',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci',
 'sport',
 'thriller',
 'war',
 'western'}

In [13]:
data.head()

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"[u'drama', u'romance']"
1,1,1,Are you sure you're okay? You're pale. <BR> I...,[u'drama']
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,[u'comedy']
3,3,3,I could have lost my fucking hands. <BR> That ...,"[u'mystery', u'thriller']"
4,4,4,Stick with me on this Gloria. I need you... <...,"[u'crime', u'thriller']"


In [14]:
for genre in unique_genres:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x else 0)

In [33]:
data.dialogue = data.dialogue.apply(lambda x: re.sub(r'\d+', '', x))
data.dialogue = data.dialogue.apply(lambda x: x.strip())
data.dialogue = data.dialogue.apply(lambda x: x.replace('<BR>', ''))
#data.dialogue = data.dialogue.apply(lambda x: x.replace('-PRON-', ''))
data.dialogue = data.dialogue.apply(lambda x: re.findall(r'\w+', x))

In [94]:
stop_words = stopwords.words('english')
def drop_stop_words(tokenized_sentence: list, stop_words: list):
    return [t for t in tokenized_sentence if t not in stop_words]

def stem(tokenized_sentence: list, stemmer):
    return [stemmer.stem(word) for word in tokenized_sentence]
def lem(tokenized_sentence: list, wordnet_lemmatizer):
    lemmatize_words = np.vectorize(wordnet_lemmatizer.lemmatize)
    lemmatized_text = ' '.join(lemmatize_words(tokenized_sentence))
    return lemmatized_text

In [98]:
wordnet_lemmatizer = WordNetLemmatizer()

In [17]:
data.dialogue = data.dialogue.apply(lambda x: drop_stop_words(x, stop_words))

In [18]:
sp = spacy.load('en_core_web_sm')
stemmer = SnowballStemmer(language='english')

In [19]:
data.dialogue = data.dialogue.apply(lambda x: stem(x, stemmer))

In [20]:
data.dialogue = data.dialogue.apply(lambda x: lem(x, sp))
data_test.dialogue = data_test.dialogue.apply(lambda x: x.replace('-PRON-', ''))

In [21]:
data.dialogue

0                                   i think meet br i with
1        be sure okay -PRON- pale br i feel like shit -...
2            go get br mom look say anyth first i pregnant
3        i could lost fuck hand br that would kept magi...
4        stick gloria i need br and tooth suzi -PRON- g...
                               ...                        
36986    there man downstair -PRON- bring -PRON- egg -P...
36987    hi br i prefer speak br i br know noth could s...
36988    i tri call i run littl late take long i think ...
36989    what crazi br i think talk between br what tal...
36990    i uh kill father dominus ominus rememb bind -P...
Name: dialogue, Length: 36991, dtype: object

In [104]:
#data.dialogue = data.dialogue.apply(lambda x: ' '.join(x))

In [105]:
data = pd.read_csv('train.csv')
data.dialogue = data.dialogue.apply(lambda x: re.sub(r'\d+', '', x))
data.dialogue = data.dialogue.apply(lambda x: x.strip())
data.dialogue = data.dialogue.apply(lambda x: x.replace(r'<BR>', ''))
data.dialogue = data.dialogue.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data.dialogue = data.dialogue.apply(lambda x: normalise(word_tokenize(x), verbose=False))
data.dialogue = data.dialogue.apply(lambda x: drop_stop_words(x, stop_words))
data.dialogue = data.dialogue.apply(lambda x: stem(x, stemmer))
data.dialogue = data.dialogue.apply(lambda x: lem(x, wordnet_lemmatizer))

In [23]:
def pipe (vectorizer, sampler, classifier):
    return(iPipeline([("vectorizer", vectorizer),
                      #("sampler", sampler),
                            ("classifier", classifier)]))

In [24]:
tv = TfidfVectorizer()
cv = CountVectorizer()
svc = LinearSVC()
lr = LogisticRegression()
sampler = RandomOverSampler()
pipe1 = pipe(tv, sampler, lr)

In [25]:
data_train, data_hold = train_test_split(data, test_size = 0.2)

In [26]:
data.shape, data_train.shape, data_hold.shape

((36991, 24), (29592, 24), (7399, 24))

In [27]:
def pipe (vectorizer, sampler, svd, classifier):
    return(iPipeline([("vectorizer", vectorizer),
                      #("svd", svd),
                      ("sampler", sampler),
                            ("classifier", classifier)]))

In [28]:
%%time
#sampler = RandomOverSampler()
sampler = SMOTE()
cls = LogisticRegression()
tv = TfidfVectorizer(min_df=5)
svd = TruncatedSVD(n_components=400, n_iter=10)
for genre in unique_genres:
    pipe1 = pipe(tv, sampler, svd, cls)
    pipe1.fit(data_train.dialogue, data_train[genre])
    data_hold[f'pred_{genre}'] = pipe1.predict_proba(data_hold['dialogue'])[:, 1]
    print(genre)

music
horror
romance
history
thriller
sci
war
musical
crime
western
sport
comedy
mystery
action
family
animation
fantasy
biography
drama
adventure
Wall time: 58.6 s


In [None]:
thresholds = {}
for t in np.arange(0.65, 0.99, 0.03):
    max_f = 0
    for genre in unique_genres:
        data_hold[f'round_{genre}'] = data_hold[f'pred_{genre}'].apply(lambda x: 1 if x > t else 0)
    data_hold = get_f_score(data_hold, unique_genres)
    mean_f = data_hold['f_score'].mean()
    print(f'{t} {mean_f}')
    #print(f'{genre} {f1_score(data_hold[genre], data_hold[col])} {precision_score(data_hold[genre], data_hold[col])}')

In [None]:
def get_singe_score(tmp_list):
    tp = tmp_list['tp']
    fp = tmp_list['fp']
    fn = tmp_list['fn']
    p = tp / (tp + fp)
    r = tp / (tp + fn)
    return 2*p*r / (p+r)

In [None]:
def get_f_score(data_df, unique_genres):
    data = data_df.copy()
    data['tp'] = 0
    data['fp'] = 0
    data['fn'] = 0
    for genre in unique_genres:
        data['tp'] = data[[genre, f'round_{genre}', 'tp']].apply(lambda x: x[2] + 1 if (x[0] == x[1]) and (x[1]== 1) else x[2], axis=1)
        data['fp'] = data[[genre, f'round_{genre}', 'fp']].apply(lambda x: x[2] + 1 if (x[0] == 0) and (x[1] == 1) else x[2], axis=1)
        data['fn'] = data[[genre, f'round_{genre}', 'fn']].apply(lambda x: x[2] + 1 if (x[0] == 1) and (x[1] == 0) else x[2], axis=1)
    data['f_score'] = data[['tp', 'fp', 'fn']].apply(lambda x: get_singe_score(x), axis=1)
    return data

In [None]:
thresholds

In [None]:
#pred_genres = ['thriller', 'drama', 'crime', 'fantasy', 'romance', 'action', 'adventure', 'comedy', 'sci']

In [None]:
%%time
data_test = pd.read_csv('test.csv')
data_test.dialogue = data_test.dialogue.apply(lambda x: re.sub(r'\d+', '', x))
data_test.dialogue = data_test.dialogue.apply(lambda x: x.strip())
data_test.dialogue = data_test.dialogue.apply(lambda x: x.replace(r'<BR>', ''))
data_test.dialogue = data_test.dialogue.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data_test.dialogue = data_test.dialogue.apply(lambda x: normalise(word_tokenize(x), verbose=False))
data_test.dialogue = data_test.dialogue.apply(lambda x: drop_stop_words(x, stop_words))
data_test.dialogue = data_test.dialogue.apply(lambda x: stem(x, stemmer))
data_test.dialogue = data_test.dialogue.apply(lambda x: lem(x, wordnet_lemmatizer))

In [59]:
'text,123'.translate(str.maketrans('', '', string.punctuation))

'text123'

In [39]:
data.dialogue

0                            [i, think, meet, br, i, with]
1        [be, sure, okay, PRON, pale, br, i, feel, like...
2        [go, get, br, mom, look, say, anyth, first, i,...
3        [i, could, lost, fuck, hand, br, that, would, ...
4        [stick, gloria, i, need, br, and, tooth, suzi,...
                               ...                        
36986    [there, man, downstair, PRON, bring, PRON, egg...
36987    [hi, br, i, prefer, speak, br, i, br, know, no...
36988    [i, tri, call, i, run, littl, late, take, long...
36989    [what, crazi, br, i, think, talk, between, br,...
36990    [i, uh, kill, father, dominus, ominus, rememb,...
Name: dialogue, Length: 36991, dtype: object

In [None]:
%%time
sampler = SMOTE()
cls = LogisticRegression()
tv = TfidfVectorizer(min_df=5)
svd = TruncatedSVD(n_components=400, n_iter=10)
for genre in unique_genres:
    pipe1 = pipe(tv, sampler, svd, cls)
    pipe1.fit(data['dialogue'], data[genre])
    data_test[f'pred_{genre}'] = pipe1.predict_proba(data_test['dialogue'])[:, 1]
    data_test[genre] = data_test[f'pred_{genre}'].apply(lambda x: 1 if x > 0.8 else 0)

In [None]:
def get_final_result(predicts, unique_genres):
    ans = ''
    for i in range(len(unique_genres)):
        if predicts[i] == 1:
            ans = ans + ' ' + unique_genres[i]
    if ans == '':
        print('hi')
        return 'drama thriller'
    else:
        return ans[1:]

In [None]:
data_test['genres'] = data[list(unique_genres)].apply(lambda x: get_final_result(x.tolist(), list(unique_genres)), axis=1)

In [None]:
data_test[['id', 'genres']].to_csv('sub1_11_1.csv', index=False)

In [None]:
data_test[['dialogue', 'genres']]

In [None]:
import csv

In [None]:
data_test[possible_genres]