In [1]:
# Basic import
import os
import sys
import json
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import cv2

In [2]:
# Helper function
def writeProgress(msg, count, total):
    sys.stdout.write(msg + "{:.2%}\r".format(count/total))
    sys.stdout.flush()
    
def newPath(path):
    if not os.path.isdir(path):
        os.mkdir(path)

def read_json(src_path):
    with open(src_path, 'r') as json_file:
        data = json.load(json_file)
    return data

def write_json(data,dst_path):
    with open(dst_path, 'w') as outfile:
        json.dump(data, outfile)

## 2019 movies

In [6]:
PATH = './SplitTrailers_crop/'
trailers = os.listdir(PATH)
trailers.sort()
trailers = trailers[1:]
print(len(trailers), trailers[:10])

97 ['47metersdown', 'adogsjourneymovie', 'aftermathmovie', 'aftermovie', 'alitamovie', 'angelhasfallen', 'angrybirdsmovie', 'annabellemovie', 'annamovie', 'apollo11movie']


## 2018 movies

In [16]:
PATH = './2018SplitTrailers_crop/'
df_onehot = pd.read_csv('./csv/filename2genreMat_2018.csv')
df_onehot

Unnamed: 0,id,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,History,Horror,Music,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt0328810,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,tt10005184,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt10017502,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,tt10043732,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,tt10048096,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,tt9866700,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1401,tt9879080,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1402,tt9891764,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1403,tt9904014,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
trailers = df_onehot['id'].tolist()
print(len(trailers), trailers[:10])

1405 ['tt0328810', 'tt10005184', 'tt10017502', 'tt10043732', 'tt10048096', 'tt10059624', 'tt10077620', 'tt10116528', 'tt10160782', 'tt10178206']


# A_AverageShotLength

In [None]:
A_li = []
for movie in trailers:
    frameCount = []
    print('Running:', movie)
    
    for scene in os.listdir(PATH + movie):
        framePath = PATH + movie + '/' +scene + '/frames/'
        frameCount.append(len(os.listdir(framePath)))
    
#     print(frameCount)
    avg = np.mean(frameCount)
    std = np.std(frameCount)
    print(avg, std)
    
    A_li.append([avg, std])

In [None]:
A_arr = np.asarray(A_li)
np.save('./npy/2018A_arr.npy', A_arr)
len(A_arr)

# B_ColorVariance
* https://docs.opencv.org/3.4/de/d25/imgproc_color_conversions.html#color_convert_rgb_luv
* https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.det.html

In [None]:
B_li = []
for movie in trailers: #order:
    rho = []
    print('Running:', movie)
    
    for scene in os.listdir(PATH + movie):
        framePath = PATH + movie + '/' + scene + '/frames/'
        frameList = [int(name.split('.')[0]) for name in os.listdir(framePath) if not name.startswith('.')]
        frameList.sort()
        keyframe = framePath + '/' + str(frameList[len(frameList)//2]) + '.jpg'
        img = cv2.imread(keyframe)
        luv = cv2.cvtColor(img, cv2.COLOR_BGR2Luv)
        l = luv[:,:,0].ravel()
        u = luv[:,:,1].ravel()
        v = luv[:,:,2].ravel()
        x = np.asarray((l, u, v))
        cov = np.cov(x)
        det = np.linalg.det(cov)
        rho.append(det)
    
    avg = np.mean(rho)
    std = np.std(rho)
    print(avg, std)
    
    B_li.append([avg, std])

In [None]:
B_arr = np.asarray(B_li)
np.save('./npy/2018B_arr.npy', B_arr)
len(B_arr)

# C_MotionContent
* https://blog.gtwang.org/programming/opencv-motion-detection-and-tracking-tutorial/

In [None]:
C_li = []
for movie in trailers: # [900:]:
    moving_li = []
    print('Running:', movie)
    
    for scene in os.listdir(PATH + movie):
    
        video  = PATH + movie + '/' + scene + '/clip.avi'

        cap = cv2.VideoCapture(video)

        w = int(cap.get(3))
        h = int(cap.get(4))

        # 計算畫面面積
        area = w * h

        # 初始化平均影像
        ret, frame = cap.read()

        avg = cv2.blur(frame,(4, 4))
        avg_float = np.float32(avg)
#         plt.imshow(frame)
#         plt.show()

        while(cap.isOpened()):
            # 讀取一幅影格
            ret, frame = cap.read()

            # 若讀取至影片結尾，則跳出
            if ret == False:
                break

            # 模糊處理
            blur = cv2.blur(frame, (4, 4))

            # 計算目前影格與平均影像的差異值
            diff = cv2.absdiff(frame, avg)

            # 將圖片轉為灰階
            gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

            # 篩選出變動程度大於門檻值的區域
            ret, thresh = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY)

            # 使用型態轉換函數去除雜訊
            kernel = np.ones((5, 5), np.uint8)
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=2)

            # 產生等高線
            cnts, hierarchy = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            moving = 0
            for c in cnts:
                moving += cv2.contourArea(c)
                
            moving_li.append(moving/area)
            
#             for c in cnts:
#                 # 忽略太小的區域
#                 if cv2.contourArea(c) < 1000:
#                     continue


#             # 畫出等高線（除錯用）
#             cv2.drawContours(frame, cnts, -1, (0, 255, 255), 2)
#             plt.imshow(frame)
#             plt.show()

            # 更新平均影像
            cv2.accumulateWeighted(blur, avg_float, 0.01)
            avg = cv2.convertScaleAbs(avg_float)

        cap.release()
        cv2.destroyAllWindows()
    
    avg = np.mean(moving_li)
    std = np.std(moving_li)
    print(avg, std)
    
    C_li.append([avg, std])

In [None]:
C_arr = np.asarray(C_li)
np.save('./npy/2018C_arr.npy', C_arr)
len(C_arr)

# D_LightingKey

In [None]:
D_li = []
for movie in trailers:
    lk = []
    print('Running:', movie)
    
    for scene in os.listdir(PATH + movie):
        framePath = PATH + movie + '/' + scene + '/frames/'
        frameList = [int(name.split('.')[0]) for name in os.listdir(framePath) if not name.startswith('.')]
        frameList.sort()
        keyframe = framePath + '/' + str(frameList[len(frameList)//2]) + '.jpg'
        img = cv2.imread(keyframe)
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        value = hsv[:,:,2].ravel()
        mean = np.mean(value)
        std = np.std(value)
        lk.append(mean*std)

    avg = np.mean(lk)
    std = np.std(lk)
    print(avg, std)
    
    D_li.append([avg, std])

In [None]:
D_arr = np.asarray(D_li)
np.save('./npy/2018D_arr.npy', D_arr)
len(D_arr)

# Results

In [18]:
# reload computed values
A_arr = np.load('./npy/2018A_arr.npy')
B_arr = np.load('./npy/2018B_arr.npy')
C_arr = np.load('./npy/2018C_arr.npy')
D_arr = np.load('./npy/2018D_arr.npy')

In [19]:
print(A_arr.shape, B_arr.shape, C_arr.shape, D_arr.shape)

(1405, 2) (1405, 2) (1405, 2) (1405, 2)


In [20]:
# First time succeed!
results = pd.DataFrame({
    'Movie': trailers,
    'A_avg': np.log(A_arr[:,0]),
    'A_std': np.log(A_arr[:,1]),
    'B_avg': np.log(B_arr[:,0]),
    'B_std': np.log(B_arr[:,1]),
    'C_avg': C_arr[:,0],
    'C_std': C_arr[:,1],
    'D_avg': np.log(D_arr[:,0]),
    'D_std': np.log(D_arr[:,1])
})
results

  """
  
  import sys
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std
0,tt0328810,4.308438,4.351987,15.820982,15.725770,0.336424,0.335943,9.025204,8.446634
1,tt10005184,4.852898,5.128792,13.870248,14.762869,0.099732,0.170803,8.552695,7.614880
2,tt10017502,3.464816,2.680851,14.985025,15.618921,0.268349,0.325218,8.206244,7.652673
3,tt10043732,3.729799,3.344649,15.602536,16.654683,0.315923,0.330436,8.339235,8.113956
4,tt10048096,3.349202,3.322474,16.273237,16.918459,0.296425,0.314294,8.349771,7.793370
...,...,...,...,...,...,...,...,...,...
1400,tt9866700,4.941889,4.402468,13.849033,14.518005,0.115961,0.170274,8.603843,8.232477
1401,tt9879080,4.426478,3.947812,14.387161,14.694599,0.058649,0.080815,7.524463,7.256423
1402,tt9891764,3.487894,2.994688,15.839656,16.387069,0.243142,0.287351,8.387383,7.987073
1403,tt9904014,4.687189,4.298575,12.841054,13.330432,0.339983,0.243896,7.993096,7.750461


In [21]:
results.to_csv('./csv/computed_2018_log.csv', index = 0)

# Remove invalid movies

In [22]:
with open('./Check2018_id.txt', 'r') as f:
    lines = [line.rstrip() for line in f]

In [23]:
lines

['tt9446774',
 'tt4883336',
 'tt8152842',
 'tt2709692',
 'tt2396557',
 'tt9417976',
 'tt9183176',
 'tt6774588',
 'tt8973954',
 'tt8824506',
 'tt4660378',
 'tt8997108',
 'tt8253812',
 'tt7896034',
 'tt6693892',
 'tt9104922',
 'tt2396557',
 'tt8906732',
 'tt9417976',
 'tt8493136',
 'tt9183176',
 'tt6774588',
 'tt2051958',
 'tt8973954',
 'tt6098808',
 'tt9318706',
 'tt4660378',
 'tt9635722',
 'tt9289116',
 'tt7466770',
 'tt8336974',
 'tt8819182',
 'tt8564100',
 'tt7581080',
 'tt4581774',
 'tt9037262',
 'tt8137788',
 'tt9386390',
 'tt6151592',
 'tt4173184',
 'tt1828172',
 'tt8887736',
 'tt7842870',
 'tt8493136',
 'tt6774588',
 'tt9417976',
 'tt5824110',
 'tt9627094',
 'tt9455514',
 'tt6433034',
 'tt8947034',
 'tt8288798',
 'tt9318706',
 'tt9747894',
 'tt9475908',
 'tt6181262',
 'tt9367778',
 'tt8922582',
 'tt9604418',
 'tt8368346',
 'tt9138208',
 'tt6098808',
 'tt8386692',
 'tt7165654',
 'tt7999860',
 'tt8364460',
 'tt7533756',
 'tt6580062',
 'tt8973954',
 'tt9183176',
 'tt5775536',
 'tt71

In [24]:
filtered = results[~results.Movie.isin(lines)]
filtered

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std
0,tt0328810,4.308438,4.351987,15.820982,15.725770,0.336424,0.335943,9.025204,8.446634
1,tt10005184,4.852898,5.128792,13.870248,14.762869,0.099732,0.170803,8.552695,7.614880
2,tt10017502,3.464816,2.680851,14.985025,15.618921,0.268349,0.325218,8.206244,7.652673
3,tt10043732,3.729799,3.344649,15.602536,16.654683,0.315923,0.330436,8.339235,8.113956
4,tt10048096,3.349202,3.322474,16.273237,16.918459,0.296425,0.314294,8.349771,7.793370
...,...,...,...,...,...,...,...,...,...
1400,tt9866700,4.941889,4.402468,13.849033,14.518005,0.115961,0.170274,8.603843,8.232477
1401,tt9879080,4.426478,3.947812,14.387161,14.694599,0.058649,0.080815,7.524463,7.256423
1402,tt9891764,3.487894,2.994688,15.839656,16.387069,0.243142,0.287351,8.387383,7.987073
1403,tt9904014,4.687189,4.298575,12.841054,13.330432,0.339983,0.243896,7.993096,7.750461


In [25]:
filtered.to_csv('./csv/computed_2018_filtered_log.csv', index = 0)

## Normalize
* https://scikit-learn.org/stable/modules/preprocessing.html

In [26]:
from sklearn import preprocessing

### MinMaxScaler

In [27]:
def normalizing(data):
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(data)
    print('Data range:', scaler.data_range_)
    return scaler.transform(data)

In [28]:
results = pd.read_csv('./csv/computed_2018_filtered_log.csv')
results

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std
0,tt0328810,4.308438,4.351987,15.820982,15.725770,0.336424,0.335943,9.025204,8.446634
1,tt10005184,4.852898,5.128792,13.870248,14.762869,0.099732,0.170803,8.552695,7.614880
2,tt10017502,3.464816,2.680851,14.985025,15.618921,0.268349,0.325218,8.206244,7.652673
3,tt10043732,3.729799,3.344649,15.602536,16.654683,0.315923,0.330436,8.339235,8.113956
4,tt10048096,3.349202,3.322474,16.273237,16.918459,0.296425,0.314294,8.349771,7.793370
...,...,...,...,...,...,...,...,...,...
1315,tt9866700,4.941889,4.402468,13.849033,14.518005,0.115961,0.170274,8.603843,8.232477
1316,tt9879080,4.426478,3.947812,14.387161,14.694599,0.058649,0.080815,7.524463,7.256423
1317,tt9891764,3.487894,2.994688,15.839656,16.387069,0.243142,0.287351,8.387383,7.987073
1318,tt9904014,4.687189,4.298575,12.841054,13.330432,0.339983,0.243896,7.993096,7.750461


In [29]:
cols = results.columns[1:].tolist()
cols

['A_avg', 'A_std', 'B_avg', 'B_std', 'C_avg', 'C_std', 'D_avg', 'D_std']

In [30]:
for col in cols:
    arr = np.asarray(results[col].tolist()).reshape(-1, 1)
    results[col+'_minmaxnorm'] = normalizing(arr)

Data range: [3.89510653]
Data range: [5.11494908]
Data range: [29.98277174]
Data range: [31.97425115]
Data range: [0.72767469]
Data range: [0.37064379]
Data range: [3.8498195]
Data range: [2.83693654]


In [31]:
results.to_csv('./csv/computed_minmaxnorm_2018_log.csv', index = 0)
results

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std,A_avg_minmaxnorm,A_std_minmaxnorm,B_avg_minmaxnorm,B_std_minmaxnorm,C_avg_minmaxnorm,C_std_minmaxnorm,D_avg_minmaxnorm,D_std_minmaxnorm
0,tt0328810,4.308438,4.351987,15.820982,15.725770,0.336424,0.335943,9.025204,8.446634,0.304127,0.385545,0.836973,0.781866,0.424876,0.817968,0.941671,0.904347
1,tt10005184,4.852898,5.128792,13.870248,14.762869,0.099732,0.170803,8.552695,7.614880,0.443907,0.537414,0.771912,0.751751,0.099603,0.372418,0.818936,0.611160
2,tt10017502,3.464816,2.680851,14.985025,15.618921,0.268349,0.325218,8.206244,7.652673,0.087542,0.058829,0.809092,0.778524,0.331324,0.789032,0.728944,0.624481
3,tt10043732,3.729799,3.344649,15.602536,16.654683,0.315923,0.330436,8.339235,8.113956,0.155571,0.188605,0.829688,0.810918,0.396702,0.803111,0.763489,0.787081
4,tt10048096,3.349202,3.322474,16.273237,16.918459,0.296425,0.314294,8.349771,7.793370,0.057860,0.184269,0.852057,0.819167,0.369907,0.759558,0.766226,0.674076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,tt9866700,4.941889,4.402468,13.849033,14.518005,0.115961,0.170274,8.603843,8.232477,0.466754,0.395414,0.771204,0.744093,0.121906,0.370990,0.832222,0.828858
1316,tt9879080,4.426478,3.947812,14.387161,14.694599,0.058649,0.080815,7.524463,7.256423,0.334431,0.306526,0.789152,0.749616,0.043146,0.129630,0.551850,0.484806
1317,tt9891764,3.487894,2.994688,15.839656,16.387069,0.243142,0.287351,8.387383,7.987073,0.093466,0.120185,0.837596,0.802548,0.296684,0.686867,0.775996,0.742355
1318,tt9904014,4.687189,4.298575,12.841054,13.330432,0.339983,0.243896,7.993096,7.750461,0.401364,0.375102,0.737585,0.706951,0.429766,0.569623,0.673579,0.658951


### StandardScaler

In [None]:
def normalizing(data):
    scaler = preprocessing.StandardScaler()
    scaler.fit(data)
    print('Mean:', scaler.mean_)
    print('Scale:', scaler.scale_)
    return scaler.transform(data)

In [None]:
results = pd.read_csv('./csv/computed_2018.csv')
results

In [None]:
cols = results.columns[1:].tolist()
cols

In [None]:
for col in cols:
    arr = np.asarray(results[col].tolist()).reshape(-1, 1)
    results[col+'_stdnorm'] = normalizing(arr)

In [None]:
results.to_csv('./csv/computed_stdnorm_2018.csv', index = 0)
results

### MaxAbsScaler

In [None]:
def normalizing(data):
    scaler = preprocessing.MaxAbsScaler()
    scaler.fit(data)
    print('Scale:', scaler.scale_)
    return scaler.transform(data)

In [None]:
results = pd.read_csv('./csv/computed_2018.csv')
results

In [None]:
cols = results.columns[1:].tolist()
cols

In [None]:
for col in cols:
    arr = np.asarray(results[col].tolist()).reshape(-1, 1)
    results[col+'_maxabsnorm'] = normalizing(arr)

In [None]:
results.to_csv('./csv/computed_maxabsnorm_2018.csv', index = 0)
results

### RobustScaler

In [None]:
def normalizing(data):
    scaler = preprocessing.RobustScaler()
    scaler.fit(data)
    print('Scale:', scaler.scale_)
    return scaler.transform(data)

In [None]:
results = pd.read_csv('./csv/computed_2018.csv')
results

In [None]:
cols = results.columns[1:].tolist()
cols

In [None]:
for col in cols:
    arr = np.asarray(results[col].tolist()).reshape(-1, 1)
    results[col+'_robnorm'] = normalizing(arr)

In [None]:
results.to_csv('./csv/computed_robnorm_2018.csv', index = 0)
results