In [1]:
# Basic import
import os
import sys
import json
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Helper function
def writeProgress(msg, count, total):
    sys.stdout.write(msg + "{:.2%}\r".format(count/total))
    sys.stdout.flush()
    
def newPath(path):
    if not os.path.isdir(path):
        os.mkdir(path)

def read_json(src_path):
    with open(src_path, 'r') as json_file:
        data = json.load(json_file)
    return data

def write_json(data,dst_path):
    with open(dst_path, 'w') as outfile:
        json.dump(data, outfile)

def writeLog(row):
    with open('log.txt', 'a') as outfile:
        outfile.write(row + '\n')

def getErrMsg(e):
    error_class = e.__class__.__name__ #取得錯誤類型
    detail = e.args[0] #取得詳細內容
    errMsg = "[{}] {}".format(error_class, detail)
    return errMsg

# IMDb social feature
* For each movie, we collect the ratings given by the users of IMDb and compute the average rating. Also, we collect the votes obtained on IMDb. Next, we characterize it by concatenating the average rating and the vote together to form a 2-dimensional vector. Similarly, we normalize the values of each dimension into [0,1]. Next, we combine the vectors of social features obtained from both Instagram and IMDb together to derive a feature vector of the movie. Finally, we input the combined vector into a neural network with one embedding layer to derive the social feature vector.

In [3]:
df = pd.read_csv('./ratings.csv')
df

Unnamed: 0,Account,tconst,averageRating,numVotes
0,21bridgesmovie,tt8688634,6.6,18694
1,47metersdown,tt7329656,5.0,12683
2,abeautifuldaymovie,tt3224458,7.4,29979
3,abominablemovie,tt6324278,7.0,18487
4,adastramovie,tt2935510,6.6,147914
...,...,...,...,...
160,wrinklestheclown,tt9097270,4.7,537
161,xmenmovies,tt6565702,5.8,125765
162,yardiefilm,tt5862902,5.8,2248
163,yesterdaymovie,tt8079248,6.9,79700


# Instagram social feature
* To characterize the activeness of a movie on Instagram, for each movie account, we extract the number of followers and the number of posts. The number of follows may reflect how popular the movie is while the number of posts may reflect how often the move company promotes its movie. Also, to measure movie’s recognition, for each movie, we collect the number of likes and the number of comments obtained on Instagram. Thus, we obtain a 4-dimensional vector, where the values of each dimension are normalized into [0,1]. 

In [4]:
import instaloader

In [5]:
accs = df['Account'].tolist()
print(len(accs), accs[:10])

165 ['21bridgesmovie', '47metersdown', 'abeautifuldaymovie', 'abominablemovie', 'adastramovie', 'adogsjourneymovie', 'aftermathmovie', 'aftermovie', 'ahiddenlifefilm', 'alitamovie']


In [6]:
# instaloader instance and login
L = instaloader.Instaloader()
L.login('yuwei8810', 'r07725005') # (login)

In [13]:
# save profile json

uids = []
mediacount = []
followerNum = []
likecount = []
commentcount = []

# 8289987854 captainmarvelofficial
idx = 30

for acc in accs[30:]:
    print(idx)
    profile = instaloader.Profile.from_username(L.context, acc)
    print(profile.userid, profile.username)
    
    likes = 0
    comments = 0
    posts = profile.get_posts()
    for post in posts:
        likes += post.likes
        comments += post.comments
        
    uids.append(profile.userid)
    mediacount.append(profile.mediacount)
    followerNum.append(profile.followers)
    likecount.append(likes)
    commentcount.append(comments)
    
    print(likes, comments)
    print('==============================')
    idx += 1

30
8289987854 captainmarvelofficial
6377731 53019
31
6738026218 captivestatemovie
11838 421
32
8511966686 catsmovie
619396 19052
33
8211108240 charliesangels
3759199 33707
34
8014689569 childsplaymovie
4347231 108697
35
4751500218 clarathefilm
291241 5446
36
8545434395 coldpursuitmovie
9213 407
37
10942513443 countdown
183222 5759
38
14733601666 currentwarmovie
23851 293
39
13564491008 darkwatersmovie


JSON Query to p/B5JZm40JGTk/: HTTP error code 502. [retrying; skip with ^C]


29548 996
40
4956568977 detectivepikachumovie
3138724 38489
41
8693706262 disneyaladdin


KeyboardInterrupt: 

In [None]:
print(len(uids), len(mediacount), len(followerNum), len(likecount), len(commentcount))

In [None]:
write_json([uids, mediacount, followerNum, likecount, commentcount], './temp.json')

In [11]:
i = 30
uids = uids[:i]
mediacount = mediacount[:i]
followerNum = followerNum[:i]

In [None]:
df['Uid'] = uids
df['Posts'] = mediacount
df['Followers'] = followerNum
df['Likes'] = likecount
df['Comments'] = commentcount
df

# Normalize

In [None]:
from sklearn.preprocessing import MinMaxScaler
def normalizing(data):
    scaler = MinMaxScaler()
    scaler.fit(data)
    print('Data max:', scaler.data_max_)
    print('Data min:', scaler.data_min_)
    print('Data range:', scaler.data_range_)
    return scaler.transform(data)

In [None]:
arr = np.asarray(df['averageRating'].tolist()).reshape(-1, 1)
rating = normalizing(arr)
df['averageRating_minmaxnorm'] = rating

In [None]:
arr = np.asarray(df['numVotes'].tolist()).reshape(-1, 1)
vote = normalizing(arr)
df['numVotes_minmaxnorm'] = vote

In [None]:
df