In [2]:
# Basic import
import os
import sys
import json
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import cv2

In [3]:
# Helper function
def writeProgress(msg, count, total):
    """Write `msg` followed by count/total as a percentage, ending with a carriage return.

    The trailing '\\r' (instead of a newline) lets the next call overwrite the
    same console line. Output is flushed immediately.
    """
    percentage = format(count / total, ".2%")
    sys.stdout.write(msg + percentage + "\r")
    sys.stdout.flush()
    
def newPath(path):
    """Create directory `path` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.
    """
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it handles nested
    # paths and avoids failing if the directory appears between check and creation.
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored in the file at `src_path`."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data, dst_path):
    """Serialize `data` as JSON into the file at `dst_path`, replacing existing content."""
    with open(dst_path, mode='w') as sink:
        json.dump(data, fp=sink)

## 2019 movies

In [20]:
# Root folder of the 2019 trailers; each sub-folder is treated as one movie.
PATH = './SplitTrailers_crop/'
# Select movie folders by name/type instead of dropping the first sorted entry
# by position, so the list is right whether or not a stray hidden entry exists.
trailers = sorted(
    name for name in os.listdir(PATH)
    if not name.startswith('.') and os.path.isdir(os.path.join(PATH, name))
)
print(len(trailers), trailers[:10])

97 ['47metersdown', 'adogsjourneymovie', 'aftermathmovie', 'aftermovie', 'alitamovie', 'angelhasfallen', 'angrybirdsmovie', 'annabellemovie', 'annamovie', 'apollo11movie']


## 2018 movies

In [6]:
# Root folder of the 2018 trailer clips. This rebinds PATH, so every later cell
# that reads PATH works on the 2018 folder once this cell has run.
PATH = './2018SplitTrailers_crop/'
# Table with one row per trailer id and 0/1 genre columns.
df_onehot = pd.read_csv('./csv/filename2genreMat_2018.csv')
df_onehot

Unnamed: 0,id,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,History,Horror,Music,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt0328810,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,tt10005184,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt10017502,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,tt10043732,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,tt10048096,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,tt9866700,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1401,tt9879080,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1402,tt9891764,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1403,tt9904014,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# The ids in the genre table become the list of trailers to process
# (this rebinds any `trailers` list built earlier in the notebook).
trailers = df_onehot.loc[:, 'id'].tolist()
print(len(trailers), trailers[:10])

1405 ['tt0328810', 'tt10005184', 'tt10017502', 'tt10043732', 'tt10048096', 'tt10059624', 'tt10077620', 'tt10116528', 'tt10160782', 'tt10178206']


# A_AverageShotLength

In [None]:
# Shot-length statistics: for every movie, the mean and standard deviation of
# the number of frame files found in each scene's 'frames' folder.
A_li = []
for movie in trailers:
    print('Running:', movie)
    movie_dir = os.path.join(PATH, movie)

    frameCount = []
    for scene in os.listdir(movie_dir):
        # Skip hidden entries; they are not scene folders.
        if scene.startswith('.'):
            continue
        framePath = os.path.join(movie_dir, scene, 'frames')
        # Count only real frame files, using the same dot-file filter that the
        # keyframe cells apply when they list this folder.
        frameCount.append(
            sum(1 for name in os.listdir(framePath) if not name.startswith('.'))
        )

    # For a movie without scenes numpy returns nan (with a RuntimeWarning).
    avg = np.mean(frameCount)
    std = np.std(frameCount)
    print(avg, std)

    A_li.append([avg, std])

In [None]:
# Persist the per-movie [mean, std] pairs as an (n_movies, 2) array.
A_arr = np.asarray(A_li)
# Make sure the target folder exists so the save cannot fail on a missing
# directory after the long computation above.
os.makedirs('./npy', exist_ok=True)
np.save('./npy/2018A_arr.npy', A_arr)
len(A_arr)

# B_ColorVariance
* https://docs.opencv.org/3.4/de/d25/imgproc_color_conversions.html#color_convert_rgb_luv
* https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.det.html

In [None]:
# Color variance: for every scene take the middle frame, convert it to Luv and
# use the determinant of the 3x3 channel covariance matrix as the scene value;
# store mean and standard deviation of those values per movie.
B_li = []
for movie in trailers:
    rho = []
    print('Running:', movie)
    movie_dir = os.path.join(PATH, movie)

    for scene in os.listdir(movie_dir):
        # Skip hidden entries; they are not scene folders.
        if scene.startswith('.'):
            continue
        framePath = os.path.join(movie_dir, scene, 'frames')
        # Frame files are named '<number>.jpg'; sort numerically to find the middle one.
        frameList = sorted(
            int(name.split('.')[0])
            for name in os.listdir(framePath)
            if not name.startswith('.')
        )
        if not frameList:
            print('  skipped (no frames):', framePath)
            continue
        keyframe = os.path.join(framePath, str(frameList[len(frameList)//2]) + '.jpg')
        img = cv2.imread(keyframe)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            print('  skipped (unreadable keyframe):', keyframe)
            continue
        luv = cv2.cvtColor(img, cv2.COLOR_BGR2Luv)
        # One row per channel (L, u, v), one column per pixel.
        x = luv.reshape(-1, 3).T
        cov = np.cov(x)
        rho.append(np.linalg.det(cov))

    avg = np.mean(rho)
    std = np.std(rho)
    print(avg, std)

    B_li.append([avg, std])

In [None]:
# Persist the per-movie [mean, std] pairs as an (n_movies, 2) array.
B_arr = np.asarray(B_li)
# Make sure the target folder exists so the save cannot fail on a missing
# directory after the long computation above.
os.makedirs('./npy', exist_ok=True)
np.save('./npy/2018B_arr.npy', B_arr)
len(B_arr)

# C_MotionContent
* https://blog.gtwang.org/programming/opencv-motion-detection-and-tracking-tutorial/

In [None]:
def _motion_ratios(video_path):
    """Return, for every frame after the first, the moving fraction of the frame area.

    A running average of blurred frames serves as background. Each new blurred
    frame is compared against it; pixels differing by more than the threshold
    are cleaned up morphologically and the summed area of their outer contours
    is divided by the frame area. Unreadable clips yield an empty list.
    """
    ratios = []
    cap = cv2.VideoCapture(video_path)
    try:
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Frame area, used to normalise the moving area.
        area = w * h

        # Initialise the running average with the first frame.
        ok, frame = cap.read()
        if not ok or area == 0:
            print('  skipped (unreadable clip):', video_path)
            return ratios

        avg = cv2.blur(frame, (4, 4))
        avg_float = np.float32(avg)

        # Structuring element for noise removal; identical for every frame.
        kernel = np.ones((5, 5), np.uint8)

        while True:
            # Read one frame; stop at the end of the clip.
            ok, frame = cap.read()
            if not ok:
                break

            # Blur the frame.
            blur = cv2.blur(frame, (4, 4))

            # Difference between the current frame and the running average.
            # Both operands are blurred; comparing the raw frame here would
            # count fine detail and sensor noise as motion.
            diff = cv2.absdiff(blur, avg)

            # Convert the difference to grayscale.
            gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

            # Keep only regions whose change exceeds the threshold.
            _, thresh = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY)

            # Remove noise with morphological opening and closing.
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=2)

            # Find contours. [-2] selects the contour list regardless of whether
            # this OpenCV version returns two or three values.
            cnts = cv2.findContours(
                thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )[-2]
            moving = sum(cv2.contourArea(c) for c in cnts)
            ratios.append(moving / area)

            # Update the running average.
            cv2.accumulateWeighted(blur, avg_float, 0.01)
            avg = cv2.convertScaleAbs(avg_float)
    finally:
        # Always free the capture, also when a frame fails to process.
        cap.release()
    return ratios


# Motion content: per movie, mean and standard deviation of the per-frame
# moving-area ratio collected over all scene clips.
C_li = []
for movie in trailers:
    moving_li = []
    print('Running:', movie)
    movie_dir = os.path.join(PATH, movie)

    for scene in os.listdir(movie_dir):
        # Skip hidden entries; they are not scene folders.
        if scene.startswith('.'):
            continue
        video = os.path.join(movie_dir, scene, 'clip.avi')
        moving_li.extend(_motion_ratios(video))

    avg = np.mean(moving_li)
    std = np.std(moving_li)
    print(avg, std)

    C_li.append([avg, std])

In [None]:
# Persist the per-movie [mean, std] pairs as an (n_movies, 2) array.
C_arr = np.asarray(C_li)
# Make sure the target folder exists so the save cannot fail on a missing
# directory after the long computation above.
os.makedirs('./npy', exist_ok=True)
np.save('./npy/2018C_arr.npy', C_arr)
len(C_arr)

# D_LightingKey

In [None]:
# Lighting key: for every scene take the middle frame, convert it to HSV and
# use mean * standard deviation of the V channel as the scene value; store
# mean and standard deviation of those values per movie.
D_li = []
for movie in trailers:
    lk = []
    print('Running:', movie)
    movie_dir = os.path.join(PATH, movie)

    for scene in os.listdir(movie_dir):
        # Skip hidden entries; they are not scene folders.
        if scene.startswith('.'):
            continue
        framePath = os.path.join(movie_dir, scene, 'frames')
        # Frame files are named '<number>.jpg'; sort numerically to find the middle one.
        frameList = sorted(
            int(name.split('.')[0])
            for name in os.listdir(framePath)
            if not name.startswith('.')
        )
        if not frameList:
            print('  skipped (no frames):', framePath)
            continue
        keyframe = os.path.join(framePath, str(frameList[len(frameList)//2]) + '.jpg')
        img = cv2.imread(keyframe)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            print('  skipped (unreadable keyframe):', keyframe)
            continue
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        value = hsv[:, :, 2].ravel()
        lk.append(np.mean(value) * np.std(value))

    avg = np.mean(lk)
    std = np.std(lk)
    print(avg, std)

    D_li.append([avg, std])

In [None]:
# Persist the per-movie [mean, std] pairs as an (n_movies, 2) array.
D_arr = np.asarray(D_li)
# Make sure the target folder exists so the save cannot fail on a missing
# directory after the long computation above.
os.makedirs('./npy', exist_ok=True)
np.save('./npy/2018D_arr.npy', D_arr)
len(D_arr)

# Results

In [21]:
# reload computed values
# NOTE(review): these un-prefixed file names differ from the ones the save cells
# in this notebook write ('./npy/2018A_arr.npy', ...). Confirm which year's
# arrays are meant here, and that they match the current `trailers` list.
A_arr = np.load('./npy/A_arr.npy')
B_arr = np.load('./npy/B_arr.npy')
C_arr = np.load('./npy/C_arr.npy')
D_arr = np.load('./npy/D_arr.npy')

In [22]:
# Sanity check: all four feature arrays should have one row per movie.
print(*(features.shape for features in (A_arr, B_arr, C_arr, D_arr)))

(97, 2) (97, 2) (97, 2) (97, 2)


In [23]:
# Assemble one table: movie name plus the mean/std pair of each feature (A-D).
feature_columns = {'Movie': trailers}
for prefix, stats in (('A', A_arr), ('B', B_arr), ('C', C_arr), ('D', D_arr)):
    feature_columns[prefix + '_avg'] = stats[:, 0]
    feature_columns[prefix + '_std'] = stats[:, 1]
results = pd.DataFrame(feature_columns)
results

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std
0,47metersdown,34.804878,23.842184,1.193778e+07,3.062201e+07,0.245609,0.249743,3832.149222,3285.912168
1,adogsjourneymovie,34.632479,29.109753,2.688809e+06,4.606428e+06,0.335994,0.349169,3225.348525,2086.338316
2,aftermathmovie,35.692308,26.611911,7.378430e+05,1.447868e+06,0.204298,0.259951,3212.997036,2613.398542
3,aftermovie,58.276596,54.904056,5.730162e+06,1.429929e+07,0.212974,0.231655,3180.640515,2106.905407
4,alitamovie,35.784314,29.192272,2.531455e+06,6.295080e+06,0.267326,0.269637,2845.366841,2350.313592
...,...,...,...,...,...,...,...,...,...
92,usmovie,43.010204,52.851833,2.255390e+06,6.467894e+06,0.175808,0.220674,3444.948763,3236.149083
93,whatmenwant,32.086957,19.626320,2.003662e+07,5.276056e+07,0.241500,0.260970,6006.485656,2582.258884
94,wonderparkmovie,27.282443,13.193380,1.579958e+08,4.421705e+08,0.431886,0.331712,5554.107410,2757.092203
95,xmenmovies,41.549296,33.954729,4.791049e+06,1.487502e+07,0.319549,0.279627,2953.689532,2301.564008


In [24]:
# Write the feature table without the row index column.
results.to_csv('./csv/computed_2019.csv', index=False)

# Remove invalid movies

In [10]:
# Read the id list, one id per line, with trailing whitespace/newlines removed.
with open('./Check2018_id.txt', 'r') as id_file:
    lines = list(map(str.rstrip, id_file))

In [11]:
# Display the ids read from Check2018_id.txt (the same id can appear more than once).
lines

['tt9446774',
 'tt4883336',
 'tt8152842',
 'tt2709692',
 'tt2396557',
 'tt9417976',
 'tt9183176',
 'tt6774588',
 'tt8973954',
 'tt8824506',
 'tt4660378',
 'tt8997108',
 'tt8253812',
 'tt7896034',
 'tt6693892',
 'tt9104922',
 'tt2396557',
 'tt8906732',
 'tt9417976',
 'tt8493136',
 'tt9183176',
 'tt6774588',
 'tt2051958',
 'tt8973954',
 'tt6098808',
 'tt9318706',
 'tt4660378',
 'tt9635722',
 'tt9289116',
 'tt7466770',
 'tt8336974',
 'tt8819182',
 'tt8564100',
 'tt7581080',
 'tt4581774',
 'tt9037262',
 'tt8137788',
 'tt9386390',
 'tt6151592',
 'tt4173184',
 'tt1828172',
 'tt8887736',
 'tt7842870',
 'tt8493136',
 'tt6774588',
 'tt9417976',
 'tt5824110',
 'tt9627094',
 'tt9455514',
 'tt6433034',
 'tt8947034',
 'tt8288798',
 'tt9318706',
 'tt9747894',
 'tt9475908',
 'tt6181262',
 'tt9367778',
 'tt8922582',
 'tt9604418',
 'tt8368346',
 'tt9138208',
 'tt6098808',
 'tt8386692',
 'tt7165654',
 'tt7999860',
 'tt8364460',
 'tt7533756',
 'tt6580062',
 'tt8973954',
 'tt9183176',
 'tt5775536',
 'tt71

In [12]:
# Keep only the rows whose Movie id is absent from the id list read above.
is_listed = results['Movie'].isin(lines)
filtered = results.loc[~is_listed]
filtered

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std
0,tt0328810,74.324324,77.632556,7.429596e+06,6.754840e+06,0.336424,0.335943,8309.913009,4659.362101
1,tt10005184,128.111111,168.813141,1.056263e+06,2.578889e+06,0.099732,0.170803,5180.699115,2028.150559
2,tt10017502,31.970588,14.597512,3.220428e+06,6.070315e+06,0.268349,0.325218,3663.755048,2106.268326
3,tt10043732,41.670732,28.350616,5.971663e+06,1.710160e+07,0.315923,0.330436,4184.885640,3340.768941
4,tt10048096,28.480000,27.728859,1.167823e+07,2.226350e+07,0.296425,0.314294,4229.213104,2424.473662
...,...,...,...,...,...,...,...,...,...
1400,tt9866700,140.034483,81.652114,1.034090e+06,2.018781e+06,0.115961,0.170274,5452.573097,3761.139541
1401,tt9879080,83.636364,51.821850,1.771188e+06,2.408702e+06,0.058649,0.080815,1852.817936,1417.178866
1402,tt9891764,32.716981,19.979119,7.569638e+06,1.308621e+07,0.243142,0.287351,4391.310240,2942.670602
1403,tt9904014,108.547619,73.594838,3.773972e+05,6.156485e+05,0.339983,0.243896,2960.448579,2322.644041


In [13]:
# Write the filtered table without the row index column.
filtered.to_csv('./csv/computed_2018_filtered.csv', index=False)

## Normalize
* https://scikit-learn.org/stable/modules/preprocessing.html

In [14]:
from sklearn import preprocessing

### MinMaxScaler

In [15]:
def normalizing(data):
    """Fit a MinMaxScaler on `data`, print its data range and return the scaled data.

    Note: this notebook defines `normalizing` once per scaler section, so the
    definition executed last is the one that later calls use.
    """
    fitted = preprocessing.MinMaxScaler().fit(data)
    print('Data range:', fitted.data_range_)
    return fitted.transform(data)

In [25]:
# Reload the 2019 feature table from the CSV written by the earlier
# `results.to_csv('./csv/computed_2019.csv', ...)` cell; this rebinds `results`.
results = pd.read_csv('./csv/computed_2019.csv')
results

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std
0,47metersdown,34.804878,23.842184,1.193778e+07,3.062201e+07,0.245609,0.249743,3832.149222,3285.912168
1,adogsjourneymovie,34.632479,29.109753,2.688809e+06,4.606428e+06,0.335994,0.349169,3225.348525,2086.338316
2,aftermathmovie,35.692308,26.611911,7.378430e+05,1.447868e+06,0.204298,0.259951,3212.997036,2613.398542
3,aftermovie,58.276596,54.904056,5.730162e+06,1.429929e+07,0.212974,0.231655,3180.640515,2106.905407
4,alitamovie,35.784314,29.192272,2.531455e+06,6.295080e+06,0.267326,0.269637,2845.366841,2350.313592
...,...,...,...,...,...,...,...,...,...
92,usmovie,43.010204,52.851833,2.255390e+06,6.467894e+06,0.175808,0.220674,3444.948763,3236.149083
93,whatmenwant,32.086957,19.626320,2.003662e+07,5.276056e+07,0.241500,0.260970,6006.485656,2582.258884
94,wonderparkmovie,27.282443,13.193380,1.579958e+08,4.421705e+08,0.431886,0.331712,5554.107410,2757.092203
95,xmenmovies,41.549296,33.954729,4.791049e+06,1.487502e+07,0.319549,0.279627,2953.689532,2301.564008


In [26]:
# Every column except the first one (the movie name) is a feature to scale.
cols = list(results.columns[1:])
cols

['A_avg', 'A_std', 'B_avg', 'B_std', 'C_avg', 'C_std', 'D_avg', 'D_std']

In [27]:
# Scale each feature on its own and store it next to the raw column.
for name in cols:
    column = results[[name]].values  # 2-D (n, 1) array, as the scaler is given one feature
    results[name + '_minmaxnorm'] = normalizing(column)

Data range: [109.1367054]
Data range: [168.9261387]
Data range: [3.69648232e+08]
Data range: [7.50683084e+08]
Data range: [0.33941702]
Data range: [0.24881802]
Data range: [6527.36849728]
Data range: [2541.56075336]


In [28]:
# Save raw plus min-max scaled features without the row index, then show the table.
results.to_csv('./csv/computed_minmaxnorm_2019.csv', index=False)
results

Unnamed: 0,Movie,A_avg,A_std,B_avg,B_std,C_avg,C_std,D_avg,D_std,A_avg_minmaxnorm,A_std_minmaxnorm,B_avg_minmaxnorm,B_std_minmaxnorm,C_avg_minmaxnorm,C_std_minmaxnorm,D_avg_minmaxnorm,D_std_minmaxnorm
0,47metersdown,34.804878,23.842184,1.193778e+07,3.062201e+07,0.245609,0.249743,3832.149222,3285.912168,0.071101,0.066649,0.031572,0.040065,0.451185,0.471811,0.377956,0.831365
1,adogsjourneymovie,34.632479,29.109753,2.688809e+06,4.606428e+06,0.335994,0.349169,3225.348525,2086.338316,0.069522,0.097831,0.006551,0.005409,0.717482,0.871405,0.284994,0.359382
2,aftermathmovie,35.692308,26.611911,7.378430e+05,1.447868e+06,0.204298,0.259951,3212.997036,2613.398542,0.079233,0.083045,0.001273,0.001202,0.329474,0.512840,0.283102,0.566759
3,aftermovie,58.276596,54.904056,5.730162e+06,1.429929e+07,0.212974,0.231655,3180.640515,2106.905407,0.286168,0.250527,0.014779,0.018321,0.355036,0.399117,0.278145,0.367474
4,alitamovie,35.784314,29.192272,2.531455e+06,6.295080e+06,0.267326,0.269637,2845.366841,2350.313592,0.080076,0.098320,0.006125,0.007659,0.515171,0.551766,0.226780,0.463246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,usmovie,43.010204,52.851833,2.255390e+06,6.467894e+06,0.175808,0.220674,3444.948763,3236.149083,0.146285,0.238378,0.005378,0.007889,0.245535,0.354984,0.318637,0.811786
93,whatmenwant,32.086957,19.626320,2.003662e+07,5.276056e+07,0.241500,0.260970,6006.485656,2582.258884,0.046198,0.041692,0.053481,0.069556,0.439081,0.516934,0.711067,0.554507
94,wonderparkmovie,27.282443,13.193380,1.579958e+08,4.421705e+08,0.431886,0.331712,5554.107410,2757.092203,0.002175,0.003610,0.426699,0.588297,1.000000,0.801244,0.641762,0.623296
95,xmenmovies,41.549296,33.954729,4.791049e+06,1.487502e+07,0.319549,0.279627,2953.689532,2301.564008,0.132899,0.126512,0.012238,0.019088,0.669030,0.591916,0.243375,0.444065


### StandardScaler

In [None]:
def normalizing(data):
    """Fit a StandardScaler on `data`, print its mean and scale and return the scaled data.

    Note: this notebook defines `normalizing` once per scaler section, so the
    definition executed last is the one that later calls use.
    """
    fitted = preprocessing.StandardScaler().fit(data)
    print('Mean:', fitted.mean_)
    print('Scale:', fitted.scale_)
    return fitted.transform(data)

In [None]:
# Reload the feature table from disk; this rebinds `results`, so columns added
# by earlier scaling sections are not carried over.
# NOTE(review): no visible cell writes './csv/computed_2018.csv' (the visible cells
# write computed_2019.csv, computed_2018_filtered.csv and computed_minmaxnorm_2019.csv)
# — confirm this file is produced elsewhere.
results = pd.read_csv('./csv/computed_2018.csv')
results

In [None]:
# Every column except the first one (the movie name) is a feature to scale.
cols = list(results.columns[1:])
cols

In [None]:
# Scale each feature on its own and store it next to the raw column.
for name in cols:
    column = results[[name]].values  # 2-D (n, 1) array, as the scaler is given one feature
    results[name + '_stdnorm'] = normalizing(column)

In [None]:
# Save raw plus standardized features without the row index, then show the table.
results.to_csv('./csv/computed_stdnorm_2018.csv', index=False)
results

### MaxAbsScaler

In [None]:
def normalizing(data):
    """Fit a MaxAbsScaler on `data`, print its scale and return the scaled data.

    Note: this notebook defines `normalizing` once per scaler section, so the
    definition executed last is the one that later calls use.
    """
    fitted = preprocessing.MaxAbsScaler().fit(data)
    print('Scale:', fitted.scale_)
    return fitted.transform(data)

In [None]:
# Reload the feature table from disk; this rebinds `results`, so columns added
# by earlier scaling sections are not carried over.
# NOTE(review): no visible cell writes './csv/computed_2018.csv' — confirm this
# file is produced elsewhere.
results = pd.read_csv('./csv/computed_2018.csv')
results

In [None]:
# Every column except the first one (the movie name) is a feature to scale.
cols = list(results.columns[1:])
cols

In [None]:
# Scale each feature on its own and store it next to the raw column.
for name in cols:
    column = results[[name]].values  # 2-D (n, 1) array, as the scaler is given one feature
    results[name + '_maxabsnorm'] = normalizing(column)

In [None]:
# Save raw plus max-abs scaled features without the row index, then show the table.
results.to_csv('./csv/computed_maxabsnorm_2018.csv', index=False)
results

### RobustScaler

In [None]:
def normalizing(data):
    """Fit a RobustScaler on `data`, print its scale and return the scaled data.

    Note: this notebook defines `normalizing` once per scaler section, so the
    definition executed last is the one that later calls use.
    """
    fitted = preprocessing.RobustScaler().fit(data)
    print('Scale:', fitted.scale_)
    return fitted.transform(data)

In [None]:
# Reload the feature table from disk; this rebinds `results`, so columns added
# by earlier scaling sections are not carried over.
# NOTE(review): no visible cell writes './csv/computed_2018.csv' — confirm this
# file is produced elsewhere.
results = pd.read_csv('./csv/computed_2018.csv')
results

In [None]:
# Every column except the first one (the movie name) is a feature to scale.
cols = list(results.columns[1:])
cols

In [None]:
# Scale each feature on its own and store it next to the raw column.
for name in cols:
    column = results[[name]].values  # 2-D (n, 1) array, as the scaler is given one feature
    results[name + '_robnorm'] = normalizing(column)

In [None]:
# Save raw plus robust-scaled features without the row index, then show the table.
results.to_csv('./csv/computed_robnorm_2018.csv', index=False)
results