# TEXT CLASSIFICATION

*Angélica María Gómez Buitrago*

*Juan Camilo Castillo Castro*

In [333]:
import pandas as pd
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Read Data

In [334]:
path = ''

In [335]:
dataTraining = pd.read_csv(os.path.join(path, 'data', 'dataTraining.csv'), encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv(os.path.join(path, 'data', 'dataTesting.csv'), encoding='UTF-8', index_col=0)

In [336]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [337]:
dataTraining.shape

(7895, 5)

# Create y

In [338]:
dataTraining['genres'] = dataTraining['genres'].map(lambda x: eval(x))

le                     = MultiLabelBinarizer()
y_genres               = le.fit_transform(dataTraining['genres'])

In [339]:
y_genres.shape #Hay 24 categorías de películas

(7895, 24)

# Split train and test

Primero se debe hacer el split de datos y posteriormente sí aplicar el vectorizer solamente a los datos de X_train.

In [340]:
X_train, X_test, y_train_genres, y_test_genres = train_test_split(dataTraining['plot'],y_genres, test_size=0.33, random_state=42)

In [341]:
X_train.shape, X_test.shape, y_train_genres.shape, y_test_genres.shape

((5289,), (2606,), (5289, 24), (2606, 24))

# Create TfidVectorizer

In [414]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [460]:
vect = CountVectorizer(stop_words='english', max_features=1000,ngram_range=(1, 2))
X_train_dtm  = vect.fit_transform(X_train)
X_train_dtm.shape

(5289, 1000)

In [461]:
X_test_dtm = vect.transform(X_test)

In [462]:
X_test_dtm.shape

(2606, 1000)

# Train multi-class multi-label model

We tested different number of estimators and depht, selectin the following parameters in a Random Forest Classifier.

In [418]:
clf = OneVsRestClassifier(RandomForestClassifier(n_jobs=-5, n_estimators=220, max_depth=10, random_state=42))
clf 

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=220, n_jobs=-5,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          n_jobs=1)

In [419]:
clf.fit(X_train_dtm , y_train_genres)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=220, n_jobs=-5,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          n_jobs=1)

In [420]:
y_pred_genres = clf.predict_proba(X_test_dtm)
y_pred_genres

array([[0.14531685, 0.11088103, 0.02726866, ..., 0.25793315, 0.03846824,
        0.02567747],
       [0.14343473, 0.12798791, 0.02397985, ..., 0.22827102, 0.14170132,
        0.0254259 ],
       [0.18514528, 0.1343943 , 0.02396616, ..., 0.25190901, 0.03615788,
        0.0219488 ],
       ...,
       [0.17356591, 0.12667528, 0.02455989, ..., 0.25195057, 0.03483763,
        0.03004736],
       [0.14060639, 0.11217894, 0.02396616, ..., 0.24801523, 0.03406183,
        0.02115944],
       [0.14704403, 0.11406634, 0.02383524, ..., 0.26164256, 0.03406183,
        0.02115944]])

### ROC

In [421]:
roc_auc_score(y_test_genres, y_pred_genres, average='macro')

0.8238598812838025

In [422]:
y_pred_genres.shape

(2606, 24)

### Apply RF to kaggle test

In [423]:
X_test_k_dtm = vect.transform(dataTesting['plot'])

In [424]:
cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

y_pred_test_genres = clf.predict_proba(X_test_k_dtm)

pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols).to_csv('pred_genres_text_RF.csv', index_label='ID')

In [425]:
y_pred_test_genres.shape

(3383, 24)

# NEURAL NETWORK

Now we will try a basic Neural Network:

In [463]:
y_train_genres

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [464]:
y_pred_genres.shape

(2606, 24)

In [465]:
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import RMSprop
from keras.callbacks import History
from livelossplot import PlotLossesKeras

In [466]:
input_shape=X_train_dtm.shape[1:]
input_shape

(1000,)

In [467]:
model = Sequential()

model.add(Dense(units=256,input_shape=input_shape))
model.add(Activation('selu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(units=24))
model.add(Activation('sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_46 (Dense)             (None, 256)               256256    
_________________________________________________________________
activation_40 (Activation)   (None, 256)               0         
_________________________________________________________________
batch_normalization_21 (Batc (None, 256)               1024      
_________________________________________________________________
dropout_27 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 24)                6168      
_________________________________________________________________
activation_41 (Activation)   (None, 24)                0         
Total params: 263,448
Trainable params: 262,936
Non-trainable params: 512
_________________________________________________________________


In [468]:
from keras.optimizers import rmsprop
model.compile(loss = 'categorical_crossentropy',
              optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=None, decay=0.0),
              metrics = ['accuracy'])

In [469]:
model.fit(X_train_dtm, y_train_genres, epochs=50, verbose=2)

Epoch 1/50
 - 3s - loss: 9.3461 - acc: 0.0550
Epoch 2/50
 - 2s - loss: 8.8722 - acc: 0.0758
Epoch 3/50
 - 1s - loss: 8.5467 - acc: 0.1049
Epoch 4/50
 - 1s - loss: 8.2890 - acc: 0.1306
Epoch 5/50
 - 1s - loss: 8.0965 - acc: 0.1479
Epoch 6/50
 - 2s - loss: 7.9018 - acc: 0.1738
Epoch 7/50
 - 1s - loss: 7.7585 - acc: 0.1834
Epoch 8/50
 - 1s - loss: 7.6259 - acc: 0.1970
Epoch 9/50
 - 1s - loss: 7.4645 - acc: 0.2123
Epoch 10/50
 - 2s - loss: 7.3238 - acc: 0.2144
Epoch 11/50
 - 1s - loss: 7.2078 - acc: 0.2358
Epoch 12/50
 - 1s - loss: 7.0837 - acc: 0.2377
Epoch 13/50
 - 1s - loss: 6.9589 - acc: 0.2477
Epoch 14/50
 - 1s - loss: 6.8421 - acc: 0.2551
Epoch 15/50
 - 1s - loss: 6.7129 - acc: 0.2702
Epoch 16/50
 - 1s - loss: 6.5900 - acc: 0.2846
Epoch 17/50
 - 1s - loss: 6.4787 - acc: 0.2855
Epoch 18/50
 - 1s - loss: 6.3625 - acc: 0.2872
Epoch 19/50
 - 1s - loss: 6.2777 - acc: 0.2897
Epoch 20/50
 - 1s - loss: 6.1800 - acc: 0.2934
Epoch 21/50
 - 1s - loss: 6.1085 - acc: 0.2985
Epoch 22/50
 - 1s - lo

<keras.callbacks.History at 0x21d87f1fac8>

In [470]:
print(model.predict(X_test_dtm))

[[3.0628386e-01 2.4277601e-02 3.8436188e-03 ... 8.8317758e-01
  2.0321861e-02 2.5566021e-02]
 [2.2602795e-01 6.7917681e-01 5.5814948e-02 ... 1.2016610e-01
  9.2123103e-01 8.1038270e-03]
 [9.0923464e-01 5.4359949e-01 1.9043674e-03 ... 8.6974233e-01
  6.5683812e-04 3.0532244e-04]
 ...
 [4.2449674e-01 6.4177370e-01 2.2234070e-01 ... 6.6152476e-02
  2.6359099e-05 3.2629131e-03]
 [1.2384346e-03 2.0290844e-02 1.4278771e-02 ... 1.3498975e-01
  1.3781590e-03 3.2579515e-03]
 [5.5987889e-01 2.8047264e-01 1.0438242e-01 ... 9.1931540e-01
  8.7956684e-03 4.6066057e-02]]


In [471]:
score = model.evaluate(X_test_dtm,y_test_genres)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 6.726376350531647
Test accuracy: 0.3069838834147406


#### ROC

In [472]:
roc_auc_score(y_test_genres, model.predict(X_test_dtm), average='macro')

0.8094322701492113

#### Apply NN to kaggle test

In [437]:
cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

y_pred_test_genres_nn = model.predict_proba(X_test_k_dtm)

pd.DataFrame(y_pred_test_genres_nn, index=dataTesting.index, columns=cols).to_csv('pred_genres_text_NN.csv', index_label='ID')

# USING LSTM NN

Please go to this link:

https://colab.research.google.com/drive/1gE-_zTbGw6-Nzhif5Ozhp13aJh3s4D5m