In [10]:
from pymongo import MongoClient
import base64
import io
from PIL import Image
import sys
sys.path.insert(0, '../img2vec/')
from img2vec import Image2Vec
import threading
from time import time
import logging


username = 'username'
fullname = 'fullname'
biography = 'biography'
followed_by = 'followedby'
follow = 'follow'
is_business_account = 'isbusinessaccount'
is_joined_recently = 'isjoinedrecently'
business_category_name = 'businesscategoryname'
business_category_name = 'categoryid'
is_private = 'isprivate'
is_verified = 'isverified'
has_connected_fb = 'connectedfbpage'
profile_pic = 'profilepic'
profile_pic_url = 'profilepicurl'
posts_count = 'postscount'
posts = 'posts'
image = 'image'
profile_pic_embedding = 'profilepicembedding'
image_embedding = 'profilepicembedding'


def to_pil(raw):
    image_buf = io.BytesIO()
    base64.decode(io.StringIO(raw), image_buf)
    return Image.open(image_buf)


def extract_post_pics(profile):
    return [to_pil(post[image]) for post in profile[posts]]


def add_embeddings(img2vec, profile):
    if profile_pic in profile:
        profile[profile_pic_embedding] = img2vec.apply_single(to_pil(profile[profile_pic]))
    
    if posts in profile and len(profile[posts]) > 0:
        for post in profile[posts]:
            if image in post:
                post[image_embedding] = img2vec.apply_single(to_pil(post[image])).tolist()


def setup_logging():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)

    file_handler = logging.FileHandler('instagramAuditor.log')
    file_handler.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)-15s - %(levelname)-5s - %(message)s')
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger


def setup_connection(logger):

    SERVER_IP = "213.136.94.240"
    USERNAME = "admin"
    PASSWORD = "04061997"

    client = MongoClient(host=SERVER_IP, username=USERNAME, password=PASSWORD)
    print("Client established")

    db = client.instagramAuditor
    profiles = db.profiles
    print("Collection retrieved")
    
    return profiles


def setup_img2vec(logger):
    img2vec = Image2Vec()
    print("Image2vec instance created")

    return img2vec


def vectorize_images(cursor, img2vec, logger):
    start_time = time()
    cnt = 0
    errors_in_profiles = []
    bulkList = []
    print("started")
    for profile in cursor.batch_size(200):
        try:
            print(cnt, "start")
            add_embeddings(img2vec, profile)
            print(cnt, "added_embeddings")
            bulkList.append(UpdateOne({'_id': profile['_id']}, 
                {'$set': {
                        profile_pic_embedding: profile[profile_pic_embedding].tolist(), 
                        posts: profile[posts]
                    }
                },
                upsert=True))
            #profiles.update_one({'_id': profile['_id']}, 
            #    {'$set': {
            #            profile_pic_embedding: profile[profile_pic_embedding].tolist(), 
            #            posts: profile[posts]
            #        }
            #    },
            #    upsert=True)
            print(cnt, "updated")
        except Exception as e:
            logger.exception(e)
            logger.debug("Profile: ", profile)
            errors_in_profiles.append(profile[username])
        finally:
            cnt += 1
            level = logging.WARNING if cnt % 100 == 0 else logging.DEBUG
            
            if(cnt % 100 == 0):
                print("executed bulk")
                result = profiles.bulk_write(bulkList)
                bulkList = []
            
            print(level, f"Total elapsed time: {time() - start_time}")
            print(level, f"Documents processed: {cnt}")
            


def for_each_profile(do, profiles, img2vec, logger):
    do(profiles.find(), img2vec, logger)


if __name__ == "__main__":
    logger = setup_logging()
    profiles = setup_connection(logger)
    img2vec = setup_img2vec(logger)

    for_each_profile(do=vectorize_images, profiles=profiles, img2vec=img2vec, logger=logger)


Client established
Collection retrieved




Image2vec instance created
started
0 start
0 added_embeddings
0 updated
10 Total elapsed time: 117.39546942710876
10 Documents processed: 1
1 start
1 added_embeddings
1 updated
10 Total elapsed time: 118.12850952148438
10 Documents processed: 2
2 start
2 added_embeddings
2 updated
10 Total elapsed time: 118.83860993385315
10 Documents processed: 3
3 start
3 added_embeddings
3 updated
10 Total elapsed time: 119.56068015098572
10 Documents processed: 4
4 start
4 added_embeddings
4 updated
10 Total elapsed time: 120.30170130729675
10 Documents processed: 5
5 start
5 added_embeddings
5 updated
10 Total elapsed time: 121.02975368499756
10 Documents processed: 6
6 start
6 added_embeddings
6 updated
10 Total elapsed time: 121.75581216812134
10 Documents processed: 7
7 start
7 added_embeddings
7 updated
10 Total elapsed time: 122.47090220451355
10 Documents processed: 8
8 start
8 added_embeddings
8 updated
10 Total elapsed time: 122.53672480583191
10 Documents processed: 9
9 start
9 added_embe

76 added_embeddings
76 updated
10 Total elapsed time: 169.69667148590088
10 Documents processed: 77
77 start
77 added_embeddings
77 updated
10 Total elapsed time: 170.42373037338257
10 Documents processed: 78
78 start
78 added_embeddings
78 updated
10 Total elapsed time: 170.48955154418945
10 Documents processed: 79
79 start
79 added_embeddings
79 updated
10 Total elapsed time: 171.22259211540222
10 Documents processed: 80
80 start
80 added_embeddings
80 updated
10 Total elapsed time: 171.9596221446991
10 Documents processed: 81
81 start
81 added_embeddings
81 updated
10 Total elapsed time: 172.686678647995
10 Documents processed: 82
82 start
82 added_embeddings
82 updated
10 Total elapsed time: 173.40276455879211
10 Documents processed: 83
83 start
83 added_embeddings
83 updated
10 Total elapsed time: 174.124835729599
10 Documents processed: 84
84 start
84 added_embeddings
84 updated
10 Total elapsed time: 174.8379294872284
10 Documents processed: 85
85 start
85 added_embeddings
85 up

150 added_embeddings
150 updated
10 Total elapsed time: 375.867956161499
10 Documents processed: 151
151 start
151 added_embeddings
151 updated
10 Total elapsed time: 376.54414916038513
10 Documents processed: 152
152 start
152 added_embeddings
152 updated
10 Total elapsed time: 377.18742775917053
10 Documents processed: 153
153 start
153 added_embeddings
153 updated
10 Total elapsed time: 377.83469796180725
10 Documents processed: 154
154 start
154 added_embeddings
154 updated
10 Total elapsed time: 378.4919424057007
10 Documents processed: 155
155 start
155 added_embeddings
155 updated
10 Total elapsed time: 379.12125968933105
10 Documents processed: 156
156 start
156 added_embeddings
156 updated
10 Total elapsed time: 379.7336232662201
10 Documents processed: 157
157 start
157 added_embeddings
157 updated
10 Total elapsed time: 380.37291502952576
10 Documents processed: 158
158 start


2020-06-20 01:11:13,693 - ERROR - output with shape [1, 224, 224] doesn't match the broadcast shape [3, 224, 224]
Traceback (most recent call last):
  File "<ipython-input-10-ad55386d0a82>", line 109, in vectorize_images
    add_embeddings(img2vec, profile)
  File "<ipython-input-10-ad55386d0a82>", line 46, in add_embeddings
    profile[profile_pic_embedding] = img2vec.apply_single(to_pil(profile[profile_pic]))
  File "../img2vec\img2vec.py", line 22, in apply_single
    image_tensor = self.normalize(self.to_tensor(self.scaler(img))).unsqueeze(0).to(self.device)
  File "C:\Users\GANSOR-PC\anaconda3\lib\site-packages\torchvision\transforms\transforms.py", line 166, in __call__
    return F.normalize(tensor, self.mean, self.std, self.inplace)
  File "C:\Users\GANSOR-PC\anaconda3\lib\site-packages\torchvision\transforms\functional.py", line 208, in normalize
    tensor.sub_(mean).div_(std)
RuntimeError: output with shape [1, 224, 224] doesn't match the broadcast shape [3, 224, 224]
2020-0

158 added_embeddings
158 updated
10 Total elapsed time: 381.0072250366211
10 Documents processed: 159
159 start
10 Total elapsed time: 381.03414607048035
10 Documents processed: 160
160 start
160 added_embeddings
160 updated
10 Total elapsed time: 381.73128366470337
10 Documents processed: 161
161 start
161 added_embeddings
161 updated
10 Total elapsed time: 382.35661220550537
10 Documents processed: 162
162 start
162 added_embeddings
162 updated
10 Total elapsed time: 383.02285146713257
10 Documents processed: 163
163 start
163 added_embeddings
163 updated
10 Total elapsed time: 383.6840624809265
10 Documents processed: 164
164 start
164 added_embeddings
164 updated
10 Total elapsed time: 384.32136034965515
10 Documents processed: 165
165 start
165 added_embeddings
165 updated
10 Total elapsed time: 384.92574524879456
10 Documents processed: 166
166 start
166 added_embeddings
166 updated
10 Total elapsed time: 385.5710189342499
10 Documents processed: 167
167 start
167 added_embedding

KeyboardInterrupt: 

In [16]:
import torch
from torchvision import models
from torchvision import transforms


torch.device('cuda')
#torch.device(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

device(type='cuda')