# Subtitle Classification #

In [None]:
# Mount Google Drive at /content/drive; later cells read the checkpoint and data
# files through relative 'drive/MyDrive/...' paths.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# NOTE(review): the version is unpinned; the recorded run below resolved
# transformers 4.7.0 — consider pinning it for reproducibility.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/00/92/6153f4912b84ee1ab53ab45663d23e7cf3704161cb5ef18b0c07e207cef2/transformers-4.7.0-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 30.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 35.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 39.6MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b2

In [None]:
import ast
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch

from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from scipy.special import softmax

from tqdm import tqdm

# Prefer the first CUDA device when one is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [None]:
# Checkpoint file (on the mounted Drive) that is loaded into the subtitle classifier.
SUBTITLE_MODEL_PATH = 'drive/MyDrive/zt-models/subtitle_model.pt'

# Genre name -> integer class index. The size of this dict sets the classifier's
# number of output labels, and its inverse is used to name the model's output
# positions at prediction time.
# NOTE(review): presumably these indices match the encoding used when the
# checkpoint was trained — confirm against the training notebook.
labels = {'action': 15,
          'action-adventure': 13,
          'action-comedy': 9,
          'action-horror': 11,
          'action-martialarts': 20,
          'action-scifi': 4,
          'action-spy': 8,
          'adventure': 3,
          'adventure-comedy': 2,
          'comedy': 18,
          'comedy-horror': 17,
          'comedy-martialarts': 12,
          'comedy-romance': 10,
          'comedy-scifi': 7,
          'comedy-spy': 16,
          'disaster': 0,
          'disaster-scifi': 1,
          'horror': 21,
          'horror-scifi': 5,
          'romance': 6,
          'romance-scifi': 14,
          'scifi': 19
          }

class BERT(nn.Module):
    """Thin wrapper around ``BertForSequenceClassification`` ('bert-base-uncased').

    The classification head has one output per entry of the notebook-level
    ``labels`` mapping.
    """

    def __init__(self):
        super().__init__()
        self.encoder = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(labels),
        )

    def forward(self, text, label):
        """Run the encoder with ``label`` as target and return its first two outputs.

        Returns:
            ``(loss, text_feature)`` — the first two items of the encoder output.
        """
        encoder_outputs = self.encoder(text, labels=label)
        loss, text_feature = encoder_outputs[:2]
        return loss, text_feature

def load_checkpoint(load_path, model):
    """Restore model weights from a checkpoint file.

    Args:
        load_path: Path of a checkpoint readable by ``torch.load``; ``None``
            skips loading entirely.
        model: Module whose ``load_state_dict`` receives the checkpoint's
            ``'model_state_dict'`` entry.

    Returns:
        The checkpoint's ``'valid_loss'`` entry, or ``None`` when ``load_path``
        is ``None``.
    """
    # Identity check: `== None` dispatches to __eq__, which path-like or
    # array-like objects may override.
    if load_path is None:
        return None

    # map_location places the stored tensors on the notebook-level `device`.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

# Build the classifier on the selected device, then restore the saved weights.
# The value returned by load_checkpoint (the checkpoint's stored 'valid_loss')
# is this cell's displayed output.
subtitle_model = BERT().to(device)
load_checkpoint(SUBTITLE_MODEL_PATH, subtitle_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Model loaded from <== drive/MyDrive/zt-models/subtitle_model.pt


0.2525549968926325

In [None]:
SUBTITLE_CSV_PATH = 'drive/MyDrive/zt-data/subtitle_data.csv'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fixed sequence length handed to the text Field, plus the ids of the
# tokenizer's padding / unknown tokens.
MAX_SEQ_LEN = 512
PAD_INDEX, UNK_INDEX = (
    tokenizer.convert_tokens_to_ids(special_token)
    for special_token in (tokenizer.pad_token, tokenizer.unk_token)
)

# Defined for completeness; it is not attached to any CSV column below.
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)

# Text is converted to ids by the BERT tokenizer itself, so no torchtext vocab is built.
text_field = Field(
    use_vocab=False,
    tokenize=tokenizer.encode,
    lower=False,
    include_lengths=False,
    batch_first=True,
    fix_length=MAX_SEQ_LEN,
    pad_token=PAD_INDEX,
    unk_token=UNK_INDEX,
)

# Every CSV column is bound to the same text field, in file order.
fields = [(column, text_field) for column in ('idx', 'genre', 'movie', 'text')]

td = TabularDataset(path=SUBTITLE_CSV_PATH, format="CSV", fields=fields, skip_header=True)

# One example per batch, in file order (no shuffling, no sorting).
pred_iter = Iterator(td, batch_size=1, device=device, train=False, shuffle=False, sort=False)

In [None]:
# Preview the subtitle table without the index column that was saved into the CSV.
pd.read_csv(SUBTITLE_CSV_PATH).drop(columns="Unnamed: 0")

Unnamed: 0,genre,movie,text
0,['action'],thefastandthefurioustokyodrift,I got a feeling this is going to be a riot I d...
1,['action'],thefastandthefurioustokyodrift,Another day Tomorrow is another day Tomorrow i...
2,['action'],thefastandthefurioustokyodrift,"Shit! Yeah! I thought you loved me. Oh, well. ..."
3,['action'],thefastandthefurioustokyodrift,"get him off the streets. Well, there's just go..."
4,['action'],thefastandthefurioustokyodrift,yours. Make yourself comfortable. Lights out i...
...,...,...,...
798,['scifi'],womb,"you'd known, you obviously wouldn't have invit..."
799,['scifi'],womb,Sure. Come on. And you there when Father died?...
800,['scifi'],womb,"for couple of things, she'll be back soon. She..."
801,['scifi'],womb,and never repeat themselves. What's the matter...


In [None]:
# Inverse mapping: class index -> genre name.
switched_labels = {class_index: genre_name for genre_name, class_index in labels.items()}

# One probability list per genre (in `labels` order) plus the arg-max genre per example.
probabilities = {genre_name: [] for genre_name in labels}
predicted_output = []

subtitle_model.eval()
with torch.no_grad():
    for (idx, genre, movie, text), _ in pred_iter:
        text_ids = text.type(torch.LongTensor).to(device)
        # The wrapper's forward() requires a label; a dummy class 0 is passed and
        # the resulting loss is discarded.
        dummy_label = torch.zeros([1], dtype=torch.int64).to(device)
        _, output = subtitle_model(text_ids, dummy_label)

        # Batch size is 1, so the first row holds this example's logits.
        subtitle_output = softmax(output.tolist()[0])

        for class_index, probability in enumerate(subtitle_output):
            probabilities[switched_labels[class_index]].append(probability)
        predicted_output.append(switched_labels[np.argmax(subtitle_output)])

In [None]:
# Arg-max genre per subtitle chunk, as a one-column frame.
pd_pre = pd.DataFrame(predicted_output, columns=['predicted_output'])

# One probability column per genre.
pd_pro = pd.DataFrame(probabilities)

data = pd.read_csv(SUBTITLE_CSV_PATH)

# Place the original rows, the probabilities and the prediction side by side,
# then discard the index column that was saved into the CSV.
data_pred = pd.concat([data, pd_pro, pd_pre], axis=1)
data_pred = data_pred.drop(columns="Unnamed: 0")
data_pred

Unnamed: 0,genre,movie,text,action,action-adventure,action-comedy,action-horror,action-martialarts,action-scifi,action-spy,adventure,adventure-comedy,comedy,comedy-horror,comedy-martialarts,comedy-romance,comedy-scifi,comedy-spy,disaster,disaster-scifi,horror,horror-scifi,romance,romance-scifi,scifi,predicted_output
0,['action'],thefastandthefurioustokyodrift,I got a feeling this is going to be a riot I d...,0.000803,0.006290,0.000309,0.003490,0.000433,0.001580,0.002679,0.000742,0.003221,0.000141,0.001173,0.001390,0.000524,0.000589,0.957888,0.001347,0.013404,0.000645,0.000449,0.000272,0.000356,0.002275,comedy-spy
1,['action'],thefastandthefurioustokyodrift,Another day Tomorrow is another day Tomorrow i...,0.000499,0.000312,0.004062,0.000617,0.000570,0.000992,0.000466,0.000600,0.000467,0.002012,0.000396,0.000964,0.002333,0.000811,0.000389,0.000605,0.000115,0.002419,0.000989,0.978968,0.001127,0.000290,romance
2,['action'],thefastandthefurioustokyodrift,"Shit! Yeah! I thought you loved me. Oh, well. ...",0.017049,0.002316,0.059661,0.006764,0.009894,0.008172,0.006057,0.004158,0.001152,0.021590,0.005913,0.004668,0.007539,0.006493,0.002098,0.003549,0.001193,0.153383,0.005814,0.647873,0.002275,0.022390,romance
3,['action'],thefastandthefurioustokyodrift,"get him off the streets. Well, there's just go...",0.046872,0.002409,0.014656,0.080509,0.018898,0.015002,0.001282,0.005592,0.002854,0.297748,0.014249,0.035886,0.013865,0.067763,0.010476,0.004624,0.001667,0.251386,0.006716,0.092887,0.002567,0.012090,comedy
4,['action'],thefastandthefurioustokyodrift,yours. Make yourself comfortable. Lights out i...,0.069686,0.001805,0.050783,0.007125,0.003779,0.004698,0.001466,0.003060,0.011681,0.112155,0.005784,0.491691,0.011007,0.139368,0.020543,0.006084,0.002255,0.009924,0.009144,0.031590,0.005145,0.001226,comedy-martialarts
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798,['scifi'],womb,"you'd known, you obviously wouldn't have invit...",0.002466,0.202433,0.002963,0.003176,0.006072,0.002726,0.001235,0.007071,0.008107,0.000595,0.047322,0.001932,0.007175,0.094879,0.003254,0.003699,0.367350,0.101123,0.028942,0.001744,0.098584,0.007152,disaster-scifi
799,['scifi'],womb,Sure. Come on. And you there when Father died?...,0.002649,0.013355,0.002294,0.001242,0.009086,0.006258,0.000668,0.003633,0.006117,0.002760,0.009757,0.007314,0.111955,0.428926,0.002146,0.002286,0.052913,0.273912,0.002740,0.002905,0.007841,0.049243,comedy-scifi
800,['scifi'],womb,"for couple of things, she'll be back soon. She...",0.001718,0.003405,0.046537,0.007571,0.002128,0.002346,0.002380,0.002645,0.002371,0.041354,0.259077,0.003302,0.094766,0.040387,0.002164,0.007460,0.002944,0.201964,0.005881,0.229168,0.037914,0.002520,comedy-horror
801,['scifi'],womb,and never repeat themselves. What's the matter...,0.003084,0.004104,0.004214,0.001147,0.006123,0.005355,0.001002,0.003712,0.001123,0.005882,0.007289,0.004440,0.151546,0.059407,0.000904,0.000853,0.004978,0.686125,0.002739,0.007979,0.004713,0.033279,horror


In [None]:
SUBTITLE_CSV_OUTPUT_PATH = 'drive/MyDrive/zt-data/subtitle_output.csv'

# Average the per-chunk genre probabilities of each movie.
# numeric_only=True: data_pred also carries string columns (genre, text,
# predicted_output); pandas >= 2.0 raises a TypeError on them instead of
# silently dropping them as older versions did.
avg_pred = data_pred.groupby("movie").mean(numeric_only=True).reset_index()
avg_pred.to_csv(SUBTITLE_CSV_OUTPUT_PATH)
avg_pred

Unnamed: 0,movie,action,action-adventure,action-comedy,action-horror,action-martialarts,action-scifi,action-spy,adventure,adventure-comedy,comedy,comedy-horror,comedy-martialarts,comedy-romance,comedy-scifi,comedy-spy,disaster,disaster-scifi,horror,horror-scifi,romance,romance-scifi,scifi
0,jumanjiwelcometothejungle,0.004471,0.018841,0.023145,0.00903,0.010866,0.009701,0.0025,0.046642,0.567359,0.008584,0.002516,0.00292,0.052982,0.019481,0.020226,0.001894,0.094284,0.005825,0.01435,0.003476,0.04738,0.033527
1,paul,0.015709,0.0109,0.037626,0.01785,0.002715,0.001885,0.008927,0.002839,0.024975,0.003016,0.072353,0.002711,0.09865,0.343654,0.091958,0.016759,0.071771,0.009092,0.008711,0.00399,0.150927,0.002984
2,shaolinsoccer,0.004325,0.000904,0.001832,0.000671,0.737919,0.002934,0.000778,0.000809,0.023492,0.005563,0.001786,0.197503,0.009391,0.000973,0.001299,0.000624,0.001013,0.001214,0.00077,0.003476,0.001146,0.001579
3,spectre,0.009993,0.003009,0.003736,0.013784,0.00092,0.001305,0.760122,0.000998,0.000777,0.000525,0.002854,0.000943,0.000708,0.000718,0.151739,0.002222,0.001336,0.000825,0.001546,0.001182,0.037736,0.003022
4,sputnik,0.021607,0.012782,0.005983,0.096118,0.013483,0.040491,0.020508,0.009909,0.030388,0.01115,0.024992,0.003112,0.015266,0.032092,0.018841,0.02547,0.247917,0.005628,0.135307,0.002965,0.178293,0.047698
5,thecore,0.073664,0.00446,0.071777,0.072433,0.002319,0.003003,0.023306,0.002321,0.00311,0.00202,0.017105,0.001865,0.002622,0.015043,0.062144,0.177318,0.169892,0.003347,0.065631,0.003942,0.219069,0.003608
6,thediscovery,0.006893,0.013087,0.006468,0.007908,0.019114,0.00296,0.012321,0.033751,0.01167,0.003776,0.02342,0.001581,0.003782,0.027867,0.003454,0.194864,0.43347,0.011818,0.037746,0.023828,0.016742,0.103478
7,theedgeofseventeen,0.005848,0.005383,0.063699,0.038999,0.003388,0.008555,0.014206,0.00281,0.025613,0.0664,0.022122,0.003525,0.252197,0.039434,0.033649,0.011063,0.002979,0.043475,0.046586,0.171195,0.116755,0.022118
8,thefastandthefurioustokyodrift,0.188028,0.002104,0.045129,0.013408,0.029566,0.05154,0.00457,0.002702,0.063673,0.104576,0.00599,0.199198,0.046954,0.048879,0.058845,0.002677,0.004897,0.022324,0.004132,0.087513,0.002552,0.010742
9,thefinalmaster,0.002652,0.001341,0.000339,0.000623,0.951174,0.003302,0.000617,0.00134,0.011637,0.00101,0.000511,0.013079,0.001534,0.002724,0.000502,0.000282,0.001193,0.000755,0.000475,0.001053,0.000786,0.003073


# Movie Poster Classification

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.preprocessing import image
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from PIL import Image
import sklearn
from sklearn.model_selection import train_test_split
from statistics import mean
from skimage import color
from skimage import io
from keras import models

In [None]:
# Same Drive mount as at the top of the notebook, repeated for the poster section.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Poster test table: column 1 is used below as the movie title and columns 3..11 as the labels.
test_df = pd.read_csv("drive/MyDrive/poster-dataset/finegrained_poster_test_data_multihotencoded.csv")

In [None]:
# Movie titles (column 1 of the poster test table) as a NumPy array.
title = test_df.iloc[:, 1].to_numpy()
print(title)

['jumanjiwelcometothejungle' 'paul' 'shaolinsoccer' 'spectre' 'sputnik'
 'thecore' 'thediscovery' 'theedgeofseventeen'
 'thefastandthefurioustokyodrift' 'thefinalmaster' 'thekissingbooth'
 'theperfectstorm' 'thephotograph' 'thering'
 'theseventhadventuresofsinbad' 'thismeanswar' 'triplefrontier'
 'tronlegacy' 'vampiresvsthebronx' 'womb' 'worldwarz' 'zoolander2']


In [None]:
def arrange_data(df):
    """Load the poster images listed in ``df`` and pair them with its label columns.

    Args:
        df: Table whose column 1 holds the movie title (the poster is read from
            ``drive/MyDrive/poster-dataset/<title>.jpg``) and whose columns
            3..11 hold the label values.

    Returns:
        ``(X, Y)``: ``X`` is the array of images resized to 200x150 and divided
        by 255; ``Y`` is ``df.iloc[:, 3:12]`` as an array.
    """
    # Take the titles from `df` itself instead of the notebook-level `title`
    # variable, so the images always line up with the rows of Y.
    titles = np.asarray(df.iloc[:, 1])
    print(titles)
    img_paths = ["drive/MyDrive/poster-dataset/" + str(name) + ".jpg" for name in titles]

    image_data = []
    for img_path in tqdm(img_paths):
        # target_size is (height, width); the channel count comes from the color mode.
        img = image.load_img(img_path, target_size=(200, 150))
        img = image.img_to_array(img)
        image_data.append(img / 255)

    X = np.array(image_data)
    Y = np.array(df.iloc[:, 3:12])

    print("Shape of images:", X.shape)
    print("Shape of labels:", Y.shape)

    return X, Y

In [None]:
# Image tensor and label matrix for the poster test set.
X_test, Y_test = arrange_data(test_df)

  0%|          | 0/22 [00:00<?, ?it/s]

['jumanjiwelcometothejungle' 'paul' 'shaolinsoccer' 'spectre' 'sputnik'
 'thecore' 'thediscovery' 'theedgeofseventeen'
 'thefastandthefurioustokyodrift' 'thefinalmaster' 'thekissingbooth'
 'theperfectstorm' 'thephotograph' 'thering'
 'theseventhadventuresofsinbad' 'thismeanswar' 'triplefrontier'
 'tronlegacy' 'vampiresvsthebronx' 'womb' 'worldwarz' 'zoolander2']


100%|██████████| 22/22 [00:10<00:00,  2.14it/s]

Shape of images: (22, 200, 150, 3)
Shape of labels: (22, 9)





In [None]:
# Load the saved poster classifier (.h5 file) from Drive.
model = models.load_model("drive/MyDrive/poster-dataset/poster-model.h5")

In [None]:
# One score vector per poster in X_test (X_test is already an ndarray; np.array only copies it).
pred = model.predict(np.array(X_test))

In [None]:
# Names of the fine-grained (multi-class) genres; only the key view is kept.
mc_labels = {
    'action': 15,
    'action-adventure': 13,
    'action-comedy': 9,
    'action-horror': 11,
    'action-martialarts': 20,
    'action-scifi': 4,
    'action-spy': 8,
    'adventure': 3,
    'adventure-comedy': 2,
    'comedy': 18,
    'comedy-horror': 17,
    'comedy-martialarts': 12,
    'comedy-romance': 10,
    'comedy-scifi': 7,
    'comedy-spy': 16,
    'disaster': 0,
    'disaster-scifi': 1,
    'horror': 21,
    'horror-scifi': 5,
    'romance': 6,
    'romance-scifi': 14,
    'scifi': 19,
}.keys()

# Hi! I changed this part, so please re-run it, thanks.
# NOTE(review): assumes the model's 9 outputs follow exactly this class order —
# confirm against the label columns of the poster CSV.
classes = ['disaster', 'scifi', 'adventure', 'comedy', 'action', 'horror', 'romance', 'spy', 'martialarts']
probs = {cs: [] for cs in classes}
probs['movie'] = []

# Print and collect every class score, movie by movie.
for i, movie_title in enumerate(title):
    print(movie_title)
    probs['movie'].append(movie_title)
    for j, class_name in enumerate(classes):
        score = pred[i][j]
        print(class_name, score)
        probs[class_name].append(score)
    print()

jumanjiwelcometothejungle
disaster 0.019241903
scifi 0.29244015
adventure 0.10831066
comedy 0.29614612
action 0.3243328
horror 0.14311662
romance 0.04424445
spy 0.022535944
martialarts 0.035377007

paul
disaster 0.01213759
scifi 0.08194498
adventure 0.06865213
comedy 0.7675262
action 0.24918447
horror 0.093895555
romance 0.026543772
spy 0.080883436
martialarts 0.029356677

shaolinsoccer
disaster 0.013690669
scifi 0.017455721
adventure 0.08635515
comedy 0.8678256
action 0.43370375
horror 0.036318015
romance 0.10425622
spy 0.05962658
martialarts 0.15626884

spectre
disaster 0.010919392
scifi 0.085278705
adventure 0.17778946
comedy 0.06260592
action 0.6206375
horror 0.09608509
romance 0.15865663
spy 0.03608897
martialarts 0.09026866

sputnik
disaster 0.04447958
scifi 0.58412486
adventure 0.123710714
comedy 0.031850714
action 0.17313114
horror 0.45113605
romance 0.10698483
spy 0.019691408
martialarts 0.04145479

thecore
disaster 0.029168252
scifi 0.43484667
adventure 0.18471095
comedy 0.16

In [None]:
# (Changed here as well.) Collect the per-class poster scores into one table, one row per movie.

result = pd.DataFrame(probs)
result

Unnamed: 0,disaster,scifi,adventure,comedy,action,horror,romance,spy,martialarts,movie
0,0.019242,0.29244,0.108311,0.296146,0.324333,0.143117,0.044244,0.022536,0.035377,jumanjiwelcometothejungle
1,0.012138,0.081945,0.068652,0.767526,0.249184,0.093896,0.026544,0.080883,0.029357,paul
2,0.013691,0.017456,0.086355,0.867826,0.433704,0.036318,0.104256,0.059627,0.156269,shaolinsoccer
3,0.010919,0.085279,0.177789,0.062606,0.620637,0.096085,0.158657,0.036089,0.090269,spectre
4,0.04448,0.584125,0.123711,0.031851,0.173131,0.451136,0.106985,0.019691,0.041455,sputnik
5,0.029168,0.434847,0.184711,0.166416,0.05349,0.44288,0.190061,0.044522,0.070855,thecore
6,0.025035,0.32188,0.066423,0.186141,0.30858,0.214327,0.128466,0.032866,0.039207,thediscovery
7,0.011002,0.055283,0.028363,0.917928,0.196257,0.041847,0.065446,0.128566,0.22002,theedgeofseventeen
8,0.018439,0.061781,0.028189,0.887742,0.382736,0.022728,0.055999,0.078773,0.103396,thefastandthefurioustokyodrift
9,0.006348,0.022357,0.026144,0.781362,0.579691,0.027352,0.013903,0.244544,0.374748,thefinalmaster


In [None]:
# Put the 'movie' column first. Selecting it by name (rather than rotating
# whatever column happens to be last) keeps this cell idempotent when re-run.
cols = ['movie'] + [col for col in result.columns if col != 'movie']
result = result[cols]
result

Unnamed: 0,movie,disaster,scifi,adventure,comedy,action,horror,romance,spy,martialarts
0,jumanjiwelcometothejungle,0.019242,0.29244,0.108311,0.296146,0.324333,0.143117,0.044244,0.022536,0.035377
1,paul,0.012138,0.081945,0.068652,0.767526,0.249184,0.093896,0.026544,0.080883,0.029357
2,shaolinsoccer,0.013691,0.017456,0.086355,0.867826,0.433704,0.036318,0.104256,0.059627,0.156269
3,spectre,0.010919,0.085279,0.177789,0.062606,0.620637,0.096085,0.158657,0.036089,0.090269
4,sputnik,0.04448,0.584125,0.123711,0.031851,0.173131,0.451136,0.106985,0.019691,0.041455
5,thecore,0.029168,0.434847,0.184711,0.166416,0.05349,0.44288,0.190061,0.044522,0.070855
6,thediscovery,0.025035,0.32188,0.066423,0.186141,0.30858,0.214327,0.128466,0.032866,0.039207
7,theedgeofseventeen,0.011002,0.055283,0.028363,0.917928,0.196257,0.041847,0.065446,0.128566,0.22002
8,thefastandthefurioustokyodrift,0.018439,0.061781,0.028189,0.887742,0.382736,0.022728,0.055999,0.078773,0.103396
9,thefinalmaster,0.006348,0.022357,0.026144,0.781362,0.579691,0.027352,0.013903,0.244544,0.374748


In [None]:
# Save the poster scores without the index column.
# NOTE(review): this writes to the current working directory, not to Drive like
# the subtitle output — confirm that is intended.
result.to_csv('poster_output.csv', index=False)

# ZtFeat3

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing import image
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
from keras import models
from keras import layers
from keras import optimizers
from PIL import Image
import sklearn
from sklearn.model_selection import train_test_split
# from statistics import mean

In [None]:
# Per-movie training table (subtitle statistics, emotion counts, genre flags);
# the index column saved into the CSV is dropped.
train_df = pd.read_csv('train_feat43.csv').drop("Unnamed: 0", axis=1)
train_df.head()

Unnamed: 0,movie,total_sent_num,avg_dialogue_speed,avg_word_per_sent,ANG,DIS,FEA,HAP,ND,NEU,SAD,SUR,disaster,scifi,action,horror,romance,comedy,spy,martialarts,adventure
0,100earthquake,1610,2.947354,26.067081,3,0,8,7,134,8,19,0,1,0,0,0,0,0,0,0,0
1,2012,1963,2.71789,29.704534,1,0,3,3,320,3,15,0,1,1,0,0,0,0,0,0,0
2,alitabattleangel,1392,2.713729,24.474856,0,0,1,6,322,1,0,3,0,1,1,0,0,0,0,0,0
3,annihilation,1087,2.723397,24.952162,0,0,0,1,207,0,0,0,0,1,0,1,0,0,0,0,0
4,awalktoremember,1114,1.990317,23.29623,4,0,12,7,173,11,7,1,0,0,0,0,1,0,0,0,0


In [None]:
# Summary statistics of the numeric training columns.
train_df.describe()

Unnamed: 0,total_sent_num,avg_dialogue_speed,avg_word_per_sent,ANG,DIS,FEA,HAP,ND,NEU,SAD,SUR,disaster,scifi,action,horror,romance,comedy,spy,martialarts,adventure
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0
mean,1462.090909,2.597075,26.56636,10.511364,0.170455,15.738636,14.681818,202.965909,10.704545,22.920455,2.045455,0.090909,0.272727,0.318182,0.181818,0.136364,0.363636,0.090909,0.090909,0.136364
std,509.0656,0.371536,6.169499,7.94376,0.434744,10.965457,15.854616,54.074331,8.645189,13.336685,2.782991,0.289127,0.447914,0.46844,0.387905,0.345141,0.483802,0.289127,0.289127,0.345141
min,466.0,1.646192,18.181556,0.0,0.0,0.0,1.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1085.0,2.373417,23.532443,5.0,0.0,7.0,5.0,157.0,4.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1434.0,2.577082,25.359333,9.0,0.0,14.0,9.0,210.5,9.5,22.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1718.25,2.80771,27.867764,13.25,0.0,24.0,17.25,242.0,15.25,32.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2956.0,3.476014,61.569643,38.0,2.0,50.0,66.0,322.0,45.0,52.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# train_df['total_sent_num'] = (train_df['total_sent_num'] - train_df['total_sent_num'].min()) / (train_df['total_sent_num'].max() - train_df['total_sent_num'].min())
# train_df['avg_dialogue_speed'] = (train_df['avg_dialogue_speed'] - train_df['avg_dialogue_speed'].min()) / (train_df['avg_dialogue_speed'].max() - train_df['avg_dialogue_speed'].min())
# train_df['avg_word_per_sent'] = (train_df['avg_word_per_sent'] - train_df['avg_word_per_sent'].min()) / (train_df['avg_word_per_sent'].max() - train_df['avg_word_per_sent'].min())

# train_df

In [None]:
# Multi-hot genre targets: every column from position 12 onward.
# NOTE(review): this rebinds `labels`, which earlier held the genre -> index dict
# read by the BERT class (len(labels)); re-running that cell after this one
# would pick up this array instead.
labels = np.array(train_df.iloc[:,12:])
labels[:5]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0]])

In [None]:
# Feature columns are positions 1..10 (total_sent_num through SAD).
# NOTE(review): column 11 (SUR) is used neither as a feature here nor as a target
# (targets start at column 12) — confirm whether excluding it is intentional.
features = train_df.iloc[:, 1:11]

In [None]:
# feature_list = list(features.columns)
# Convert the feature frame to a plain ndarray and preview the first rows.
features = np.array(features)

features[:5]

array([[1.61000000e+03, 2.94735402e+00, 2.60670807e+01, 3.00000000e+00,
        0.00000000e+00, 8.00000000e+00, 7.00000000e+00, 1.34000000e+02,
        8.00000000e+00, 1.90000000e+01],
       [1.96300000e+03, 2.71789000e+00, 2.97045339e+01, 1.00000000e+00,
        0.00000000e+00, 3.00000000e+00, 3.00000000e+00, 3.20000000e+02,
        3.00000000e+00, 1.50000000e+01],
       [1.39200000e+03, 2.71372895e+00, 2.44748563e+01, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 6.00000000e+00, 3.22000000e+02,
        1.00000000e+00, 0.00000000e+00],
       [1.08700000e+03, 2.72339725e+00, 2.49521619e+01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.07000000e+02,
        0.00000000e+00, 0.00000000e+00],
       [1.11400000e+03, 1.99031669e+00, 2.32962298e+01, 4.00000000e+00,
        0.00000000e+00, 1.20000000e+01, 7.00000000e+00, 1.73000000e+02,
        1.10000000e+01, 7.00000000e+00]])

In [None]:
# # feature_list = list(features.columns)
# features['total_sent_num'] = features['total_sent_num'].apply(lambda x:x/1000)
# features['avg_word_per_sent'] = features['avg_word_per_sent'].apply(lambda x:x/10)
# features = np.array(features)

# features[:5]

In [None]:
# Sanity check: both arrays should have the same number of rows.
print(features.shape)
print(labels.shape)

(88, 10)
(88, 9)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Create the parameter grid based on the results of random search
# (2 * 5 * 3 * 3 * 4 = 360 candidate combinations).
param_grid = {
    'max_depth': [1, 2],
    'max_features': [2, 3, 5, 7, 9],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a base model. A fixed random_state makes the forests — and therefore
# the selected parameters and the accuracy reported later — reproducible.
rf = RandomForestRegressor(random_state=42)
# Instantiate the grid search model: 3-fold cross-validation on all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)
grid_search.fit(features, labels)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  6.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
# Baseline forest of depth-1 trees fitted on the full training set.
# A fixed random_state makes the fit (and the accuracy computed from it) reproducible.
regr = RandomForestRegressor(max_depth=1, random_state=42)
# features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.25, stratify=labels)
# print(features_train.shape)
# print(labels_train.shape)
# print(features_test.shape)
# print(labels_test.shape)
regr.fit(features, labels)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=1, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
# NOTE(review): `features_test` is only created in the "Test Data" section further
# down, so this cell raises NameError on a fresh top-to-bottom run. It also
# rebinds `pred`, which previously held the poster model's predictions.
pred = regr.predict(features_test)

In [None]:
def accuracy(pred, true_labels=None, top_k=2):
    """Top-k hit rate for multi-label predictions.

    A sample counts as a hit when at least one of its ``top_k`` highest-scoring
    classes is marked 1 in the ground truth.

    Args:
        pred: Sequence of per-class score vectors, one per sample.
        true_labels: Multi-hot ground truth aligned with ``pred``. Defaults to
            the notebook-level ``labels_test``.
        top_k: Number of top-scoring classes to consider (default 2).

    Returns:
        The hit rate as a float (it is also printed); ``0.0`` for empty ``pred``.
    """
    if true_labels is None:
        # Backward-compatible default: the notebook-level test targets.
        true_labels = labels_test

    n_samples = len(pred)
    if n_samples == 0:
        # Avoid ZeroDivisionError on an empty prediction set.
        print("Accuracy = ", 0.0)
        return 0.0

    hits = 0
    for i in range(n_samples):
        # Indices of the top_k largest scores (order among them is irrelevant).
        top_indices = np.argsort(pred[i])[::-1][:top_k]
        positives = np.where(np.asarray(true_labels[i]) == 1)[0]
        if np.isin(top_indices, positives).any():
            hits += 1

    score = hits / n_samples
    print("Accuracy = ", score)
    return score

In [None]:
# accuracy() reports the score itself, so no print() wrapper is needed.
accuracy(pred)

Accuracy =  0.7272727272727273
None


## Test Data

In [None]:
# Per-movie test table with the same columns as the training table.
# NOTE(review): this rebinds `test_df`, which previously held the poster test table.
test_df = pd.read_csv('test_feat43.csv').drop("Unnamed: 0", axis=1)
test_df

Unnamed: 0,movie,total_sent_num,avg_dialogue_speed,avg_word_per_sent,ANG,DIS,FEA,HAP,ND,NEU,SAD,SUR,disaster,scifi,action,horror,romance,comedy,spy,martialarts,adventure
0,jumanjiwelcometothejungle,1861,2.316457,23.372918,17,1,26,13,220,19,35,1,0,0,0,0,0,1,0,0,1
1,paul,2282,2.831326,22.14461,20,0,24,23,240,10,39,6,0,1,0,0,0,1,0,0,0
2,shaolinsoccer,1251,2.318613,22.814548,21,0,16,13,152,10,33,2,0,0,0,0,0,1,0,1,0
3,spectre,1228,2.26247,27.491042,12,0,25,13,237,9,23,2,0,0,1,0,0,0,1,0,0
4,sputnik,993,3.098275,31.767372,4,0,3,5,235,2,23,0,0,1,0,1,0,0,0,0,0
5,thecore,2110,2.913491,26.554028,1,0,3,2,239,4,10,0,1,1,0,0,0,0,0,0,0
6,thediscovery,1396,2.988863,31.919771,6,0,8,16,127,4,14,2,0,1,0,0,1,0,0,0,0
7,theedgeofseventeen,1705,3.05623,23.822874,13,0,31,35,109,28,55,26,0,0,0,0,0,1,0,0,0
8,thefastandthefurioustokyodrift,705,2.715657,31.390071,12,0,12,13,190,7,20,0,0,0,1,0,0,0,0,0,0
9,thefinalmaster,719,1.952446,33.002782,6,0,8,9,138,7,29,0,0,0,1,0,0,0,0,1,0


In [None]:
# test_df['total_sent_num'] = (test_df['total_sent_num'] - train_df['total_sent_num'].min()) / (train_df['total_sent_num'].max() - train_df['total_sent_num'].min())
# test_df['avg_dialogue_speed'] = (test_df['avg_dialogue_speed'] - train_df['avg_dialogue_speed'].min()) / (train_df['avg_dialogue_speed'].max() - train_df['avg_dialogue_speed'].min())
# test_df['avg_word_per_sent'] = (test_df['avg_word_per_sent'] - train_df['avg_word_per_sent'].min()) / (train_df['avg_word_per_sent'].max() - train_df['avg_word_per_sent'].min())

# test_df

In [None]:
# Summary statistics of the numeric test columns.
test_df.describe()

Unnamed: 0,total_sent_num,avg_dialogue_speed,avg_word_per_sent,ANG,DIS,FEA,HAP,ND,NEU,SAD,SUR,disaster,scifi,action,horror,romance,comedy,spy,martialarts,adventure
count,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
mean,1433.136364,2.684366,26.143137,9.818182,0.090909,15.136364,16.136364,184.545455,10.454545,25.181818,2.545455,0.090909,0.272727,0.318182,0.181818,0.136364,0.363636,0.090909,0.090909,0.136364
std,522.003858,0.399835,3.47637,8.781129,0.294245,8.37397,14.337083,44.36078,9.038159,12.408278,5.535404,0.294245,0.455842,0.476731,0.394771,0.35125,0.492366,0.294245,0.294245,0.35125
min,533.0,1.842064,21.508006,1.0,0.0,3.0,2.0,109.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1139.5,2.316996,23.43152,4.0,0.0,10.0,7.5,148.25,3.25,18.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1437.0,2.823717,25.320544,6.5,0.0,13.5,12.5,177.0,9.5,27.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1736.5,2.975908,27.469319,12.75,0.0,22.75,15.5,231.25,13.75,31.75,2.0,0.0,0.75,1.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2498.0,3.243094,33.002782,38.0,1.0,31.0,62.0,244.0,35.0,55.0,26.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Multi-hot genre targets of the test movies (columns 12 onward); accuracy() reads
# this notebook-level variable.
labels_test = np.array(test_df.iloc[:,12:])
labels_test[:5]

array([[0, 0, 0, 0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0]])

In [None]:
# test_features = test_df[['total_sent_num','avg_dialogue_speed','avg_word_per_sent']]
# test_features['total_sent_num'] = test_features['total_sent_num'].apply(lambda x:x/1000)
# test_features['avg_word_per_sent'] = test_features['avg_word_per_sent'].apply(lambda x:x/10)
# test_features = np.array(test_features)
# print(test_features.shape)
# test_features

In [None]:
# Test features: the same column positions (1..10, total_sent_num through SAD) as
# used for training.
# NOTE(review): column 11 (SUR) is excluded here as well — confirm it is intentional.
features_test = test_df.columns[1:11]
features_test = test_df[features_test]

# feature_list = list(features.columns)
# Convert to a plain ndarray and preview the first rows.
features_test = np.array(features_test)

features_test[:5]

array([[1.86100000e+03, 2.31645701e+00, 2.33729178e+01, 1.70000000e+01,
        1.00000000e+00, 2.60000000e+01, 1.30000000e+01, 2.20000000e+02,
        1.90000000e+01, 3.50000000e+01],
       [2.28200000e+03, 2.83132649e+00, 2.21446100e+01, 2.00000000e+01,
        0.00000000e+00, 2.40000000e+01, 2.30000000e+01, 2.40000000e+02,
        1.00000000e+01, 3.90000000e+01],
       [1.25100000e+03, 2.31861306e+00, 2.28145484e+01, 2.10000000e+01,
        0.00000000e+00, 1.60000000e+01, 1.30000000e+01, 1.52000000e+02,
        1.00000000e+01, 3.30000000e+01],
       [1.22800000e+03, 2.26246956e+00, 2.74910424e+01, 1.20000000e+01,
        0.00000000e+00, 2.50000000e+01, 1.30000000e+01, 2.37000000e+02,
        9.00000000e+00, 2.30000000e+01],
       [9.93000000e+02, 3.09827550e+00, 3.17673716e+01, 4.00000000e+00,
        0.00000000e+00, 3.00000000e+00, 5.00000000e+00, 2.35000000e+02,
        2.00000000e+00, 2.30000000e+01]])

In [None]:
# Predictions of the fitted grid search (GridSearchCV.predict delegates to its refit best estimator).
predict = grid_search.predict(features_test)

In [None]:
def accuracy(pred, labels=None, top_k=2):
  """Compute top-k hit accuracy for multi-label predictions.

  A sample counts as a hit when at least one of its `top_k` highest-scoring
  classes is marked with 1 in the matching row of `labels`.

  Args:
    pred: per-sample sequence of class scores (indexable by sample).
    labels: per-sample 0/1 indicator rows aligned with `pred`. Defaults to
      the notebook-level `labels_test`, so existing `accuracy(pred)` calls
      keep working unchanged.
    top_k: number of highest-scoring classes inspected per sample.

  Returns:
    The fraction of samples with at least one hit, as a float, or NaN when
    `pred` is empty. The value is also printed.
  """
  if labels is None:
    labels = labels_test

  total = len(pred)
  if total == 0:
    # No samples: report NaN instead of dividing by zero.
    score = float("nan")
    print("Accuracy = ", score)
    return score

  hits = 0
  for i in range(total):
    # argsort is ascending, so the reversed order lists the best classes first.
    top_indices = np.argsort(pred[i])[::-1][:top_k]
    correct = np.where(np.asarray(labels[i]) == 1)[0]
    if any(j in correct for j in top_indices):
      hits += 1

  score = hits / total
  print("Accuracy = ", score)
  return score

In [None]:
# accuracy() already prints its own report, so the call is not wrapped in print().
accuracy(predict)

Accuracy =  0.6363636363636364
None


In [None]:
# Movie identifiers for the test set, taken from the 'movie' column.
title = test_df.loc[:, 'movie']

In [None]:
# This cell was changed — please re-run it.
# NOTE(review): this list presumably has to follow the same order as the label
# columns (test_df.columns[12:]) — confirm before trusting the column names.
classes = ['disaster','scifi','adventure','comedy','action','horror','romance','spy','martialarts']

# Column-oriented store: one list of scores per class, plus the movie titles.
probs = {cs: [] for cs in classes}
probs['movie'] = []

# Walk titles and prediction rows positionally so the loop depends neither on
# the Series index labels nor on a hard-coded class count.
for movie, scores in zip(title, predict):
  print(movie)
  probs['movie'].append(movie)
  for name, score in zip(classes, scores):
    print(name, score)
    probs[name].append(score)
  print()

jumanjiwelcometothejungle
disaster 0.13270950108340565
scifi 0.19157647283809834
adventure 0.15732143978161045
comedy 0.1181867825052677
action 0.22012883220266916
horror 0.702290660422825
romance 0.0698398540567482
spy 0.03805370468357684
martialarts 0.06973573695750018

paul
disaster 0.13156366775007233
scifi 0.18579522283809832
adventure 0.1594047731149438
comedy 0.11849928250526771
action 0.21908716553600247
horror 0.706196910422825
romance 0.07077735405674819
spy 0.03826203801691017
martialarts 0.06937115362416683

shaolinsoccer
disaster 0.07412666598073049
scifi 0.3006855606252211
adventure 0.4153070949359222
comedy 0.2224929089017667
action 0.09895627270076751
horror 0.17578688334963097
romance 0.1004846488395698
spy 0.11280657899758266
martialarts 0.16676061975254558

spectre
disaster 0.07412666598073049
scifi 0.3006855606252211
adventure 0.4153070949359222
comedy 0.2224929089017667
action 0.09895627270076751
horror 0.17578688334963097
romance 0.1004846488395698
spy 0.112806578

In [None]:
# Collect the per-class score lists and the movie titles into one table.
result = pd.DataFrame.from_dict(probs)
result

Unnamed: 0,disaster,scifi,adventure,comedy,action,horror,romance,spy,martialarts,movie
0,0.13271,0.191576,0.157321,0.118187,0.220129,0.702291,0.06984,0.038054,0.069736,jumanjiwelcometothejungle
1,0.131564,0.185795,0.159405,0.118499,0.219087,0.706197,0.070777,0.038262,0.069371,paul
2,0.074127,0.300686,0.415307,0.222493,0.098956,0.175787,0.100485,0.112807,0.166761,shaolinsoccer
3,0.074127,0.300686,0.415307,0.222493,0.098956,0.175787,0.100485,0.112807,0.166761,spectre
4,0.069731,0.298405,0.413916,0.214985,0.094681,0.194762,0.114903,0.113625,0.162069,sputnik
5,0.131564,0.185795,0.159405,0.118499,0.219087,0.706197,0.070777,0.038262,0.069371,thecore
6,0.069731,0.298405,0.413916,0.214985,0.094681,0.194762,0.114903,0.113625,0.162069,thediscovery
7,0.132034,0.186311,0.16788,0.116111,0.21161,0.695016,0.075147,0.040853,0.071334,theedgeofseventeen
8,0.067003,0.297695,0.399676,0.211618,0.091772,0.20907,0.105999,0.149374,0.158125,thefastandthefurioustokyodrift
9,0.067003,0.297695,0.399676,0.211618,0.091772,0.20907,0.105999,0.149374,0.158125,thefinalmaster


In [None]:
# Put 'movie' first and keep the remaining columns in their current order.
# Selecting by name (rather than rotating the last column to the front) keeps
# this cell idempotent: re-running it leaves the column order unchanged.
cols = ['movie'] + [col for col in result.columns if col != 'movie']
result = result[cols]
result

Unnamed: 0,movie,disaster,scifi,adventure,comedy,action,horror,romance,spy,martialarts
0,jumanjiwelcometothejungle,0.13271,0.191576,0.157321,0.118187,0.220129,0.702291,0.06984,0.038054,0.069736
1,paul,0.131564,0.185795,0.159405,0.118499,0.219087,0.706197,0.070777,0.038262,0.069371
2,shaolinsoccer,0.074127,0.300686,0.415307,0.222493,0.098956,0.175787,0.100485,0.112807,0.166761
3,spectre,0.074127,0.300686,0.415307,0.222493,0.098956,0.175787,0.100485,0.112807,0.166761
4,sputnik,0.069731,0.298405,0.413916,0.214985,0.094681,0.194762,0.114903,0.113625,0.162069
5,thecore,0.131564,0.185795,0.159405,0.118499,0.219087,0.706197,0.070777,0.038262,0.069371
6,thediscovery,0.069731,0.298405,0.413916,0.214985,0.094681,0.194762,0.114903,0.113625,0.162069
7,theedgeofseventeen,0.132034,0.186311,0.16788,0.116111,0.21161,0.695016,0.075147,0.040853,0.071334
8,thefastandthefurioustokyodrift,0.067003,0.297695,0.399676,0.211618,0.091772,0.20907,0.105999,0.149374,0.158125
9,thefinalmaster,0.067003,0.297695,0.399676,0.211618,0.091772,0.20907,0.105999,0.149374,0.158125


In [None]:
# Save the score table to CSV without writing the DataFrame index as a column.
result.to_csv(path_or_buf='word_statistics_output.csv', index=False)