# Instagram Scraping

In [2]:
from pprint import pprint
import pandas as pd
import numpy as np
#import fasttext
import subprocess
import time
from pathlib import Path
import os

## Scrap the account posts

### Run the following commands to install InstaTouch and scrape the data

```bash
$ npm i instatouch
$ 
$ mkdir -p "$(pwd)"/dataset/bbva_mex && \
$ mkdir -p "$(pwd)"/dataset/yosoypyme && \
$ mkdir -p "$(pwd)"/dataset/bbva_seguros && \
$ mkdir -p "$(pwd)"/dataset/bbvauy && \
$ mkdir -p "$(pwd)"/dataset/bbva_peru && \
$ mkdir -p "$(pwd)"/dataset/fundacionbbvape && \
$ mkdir -p "$(pwd)"/dataset/bbva && \
$ mkdir -p "$(pwd)"/dataset/bbvaprovincial && \
$ mkdir -p "$(pwd)"/dataset/bbva_colombia && \
$ mkdir -p "$(pwd)"/dataset/bbvaamcolombia && \
$ mkdir -p "$(pwd)"/dataset/bbva_argentina
$ 
$ instatouch user bbva_mex -c 1000 --filepath "$(pwd)"/dataset/bbva_mex && \
$ instatouch user yosoypyme -c 1000 --filepath "$(pwd)"/dataset/yosoypyme && \
$ instatouch user bbva_seguros -c 1000 --filepath "$(pwd)"/dataset/bbva_seguros && \
$ instatouch user bbvauy -c 1000 --filepath "$(pwd)"/dataset/bbvauy && \
$ instatouch user bbva_peru -c 1000 --filepath "$(pwd)"/dataset/bbva_peru && \
$ instatouch user fundacionbbvape -c 1000 --filepath "$(pwd)"/dataset/fundacionbbvape && \
$ instatouch user bbva -c 1000 --filepath "$(pwd)"/dataset/bbva && \
$ instatouch user bbvaprovincial -c 1000 --filepath "$(pwd)"/dataset/bbvaprovincial && \
$ instatouch user bbva_colombia -c 1000 --filepath "$(pwd)"/dataset/bbva_colombia && \
$ instatouch user bbvaamcolombia -c 1000 --filepath "$(pwd)"/dataset/bbvaamcolombia && \
$ instatouch user bbva_argentina -c 1000 --filepath "$(pwd)"/dataset/bbva_argentina
```

## Check the data

In [30]:
accounts = ["bbva_mex", "yosoypyme", "bbva_seguros", "bbvauy", "bbva_peru", "fundacionbbvape", "bbva", "bbvaprovincial", "bbva_colombia", "bbvaamcolombia", "bbva_argentina"]
#countries = ["mex", "ar", "pe", "col", "uy", "es", "ve", "col"]
account = accounts[0]
#country = countries[0]
csv_name = list(filter(lambda x: '.csv' in x, os.listdir(f'./dataset/{account}/')))[0]
ig_bbva_posts = pd.read_csv(f'./dataset//{account}/{csv_name}')

In [14]:
ig_bbva_posts.head(4)

Unnamed: 0,id,shortcode,type,is_video,dimension.height,dimension.width,display_url,thumbnail_src,owner.id,owner.username,...,comments_disabled,taken_at_timestamp,location,hashtags,mentions,views,location.id,location.has_public_page,location.name,location.slug
0,2421504519680313931,CGa6YftjEpL,GraphSidecar,False,1080,1080,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,336339103,bbva_mex,...,False,1602885855,,"[""#TorreBBVA"",""#CDMX""]",[],,,,,
1,2420053243746800524,CGVwZprD1OM,GraphImage,False,720,1080,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,336339103,bbva_mex,...,False,1602712849,,"[""#BBVAM"",""#CreandoOportunidades"",""#Metas""]",[],,,,,
2,2415614438164888545,CGF_Ih1H3vh,GraphImage,False,1350,1080,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,336339103,bbva_mex,...,False,1602183702,,"[""#TorreBBVA"",""#BBVAMe"",""#CDMX"",""#Di"",""#Nublado""]","[""@vrtypics""]",,,,,
3,2414299946008063669,CGBUQJnDi61,GraphSidecar,False,1080,1080,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,https://instagram.fmex16-1.fna.fbcdn.net/v/t51...,336339103,bbva_mex,...,False,1602027003,,"[""#BBVAM"",""#CreandoOportunidades"",""#ProgramaBB...","[""@museocarrillogil""]",,,,,


In [15]:
ig_bbva_posts.count()

id                          52
shortcode                   52
type                        52
is_video                    52
dimension.height            52
dimension.width             52
display_url                 52
thumbnail_src               52
owner.id                    52
owner.username              52
description                 52
comments                    52
likes                       52
comments_disabled           52
taken_at_timestamp          52
location                     0
hashtags                    52
mentions                    52
views                       12
location.id                  1
location.has_public_page     1
location.name                1
location.slug                1
dtype: int64

## Scrap post comments and likers

In [14]:
accounts = ["yosoypyme", "bbva_seguros", "bbvauy", "bbva_peru", "fundacionbbvape", "bbva", "bbvaprovincial", "bbva_colombia", "bbvaamcolombia", "bbva_argentina"]

def scrape_comments(short_code, account):
    Path(f"./dataset/{account}/comments").mkdir(parents=True, exist_ok=True)
    proc = subprocess.Popen(
        [
            "../node_modules/instatouch/bin/cli.js",
            "comments",
            f"https://instagram.com/p/{short_code}",
            "--filepath",
            f"./dataset/{account}/comments/",
        #    "--proxy-file",
        #    "./proxies.txt"
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    out, err = proc.communicate()
    print(f"[STDERR] {err}")
    print(f"[STDOUT] {out}")

def scrape_likers(short_code, account):
    Path(f"./dataset/{account}/likers").mkdir(parents=True, exist_ok=True)
    proc = subprocess.Popen(
        [
            "../node_modules/instatouch/bin/cli.js",
            "likers",
            f"https://instagram.com/p/{short_code}",
            "--filepath",
            f"./dataset/{account}/likers/",
        #    "--proxy-file",
        #    "./proxies.txt"
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    out, err = proc.communicate()
    print(f"[STDERR] {err}")
    print(f"[STDOUT] {out}")

for account in accounts:
    csv_name = list(filter(lambda x: '.csv' in x, os.listdir(f'./dataset/{account}/')))[0]
    ig_bbva_posts = pd.read_csv(f'./dataset/{account}/{csv_name}')
    for short_code in ig_bbva_posts['shortcode']:
        print(f"Scraping data for: {short_code}")
        scrape_comments(short_code, account)
        time.sleep(3)
        scrape_likers(short_code, account)
        time.sleep(3)
    print(f"[SCRAPED] {account}")


rted\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/comments//comments_1602923635458.csv\n'
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/likers//likers_1602923639162.csv\n'
Scraping data for: CAnhzhBH2I4
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/comments//comments_1602923642783.csv\n'
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/likers//likers_1602923646383.csv\n'
Scraping data for: CAbYioSHoyI
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/comments//comments_1602923650011.csv\n'
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/likers//likers_1602923653699.csv\n'
Scraping data for: CAfrHziHHcz
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT] b'CSV path: ./dataset/bbvaamcolombia/comments//comments_1602923657332.csv\n'
[STDERR] b'- InstaTouch Scraper Started\n'
[STDOUT]

## Join the csv comments

In [66]:
i = 0
for account in accounts:
    for csv_name in os.listdir(f'dataset/{account}/comments/'):
        if i == 0:
            ig_bbva_comments = pd.read_csv(f'dataset/{account}/comments/{csv_name}')
        else:
            ig_bbva_new_comments = pd.read_csv(f'dataset/{account}/comments/{csv_name}')
            ig_bbva_comments = ig_bbva_comments.append(ig_bbva_new_comments, ignore_index=True)
        i += 1
    ig_bbva_comments = ig_bbva_comments.drop_duplicates('text')
    ig_bbva_comments['text'] = ig_bbva_comments['text'].map(lambda x: re.sub(r'[^\w\s]', '', x))
    ig_bbva_comments = ig_bbva_comments.drop(ig_bbva_comments[ig_bbva_comments['text'] == ''].index)
    ig_bbva_comments.to_csv(f'dataset/{account}/{account}_comments.csv')
    i = 0

In [67]:
i = 0
for account in accounts:
    for csv_name in os.listdir(f'dataset/{account}/comments/'):
        if i == 0:
            ig_bbva_comments = pd.read_csv(f'dataset/{account}/comments/{csv_name}')
        else:
            ig_bbva_new_comments = pd.read_csv(f'dataset/{account}/comments/{csv_name}')
            ig_bbva_comments = ig_bbva_comments.append(ig_bbva_new_comments, ignore_index=True)
        i += 1
ig_bbva_comments = ig_bbva_comments.drop_duplicates('text')
ig_bbva_comments.to_csv('./dataset/bbva_comments.csv')
len(ig_bbva_comments)

3407

In [68]:
i = 0
for account in accounts:
    for csv_name in os.listdir(f'dataset/{account}/likers/'):
        if i == 0:
            ig_bbva_likers = pd.read_csv(f'dataset/{account}/likers/{csv_name}')
        else:
            ig_bbva_new_likers = pd.read_csv(f'dataset/{account}/likers/{csv_name}')
            ig_bbva_likers = ig_bbva_likers.append(ig_bbva_new_likers, ignore_index=True)
        i += 1
    ig_bbva_likers = ig_bbva_likers.drop_duplicates('username')
    ig_bbva_likers.to_csv(f'dataset/{account}/{account}_likers.csv')
    i = 0

In [69]:
i = 0
for account in accounts:
    for csv_name in os.listdir(f'dataset/{account}/likers/'):
        if i == 0:
            ig_bbva_likers = pd.read_csv(f'dataset/{account}/likers/{csv_name}')
        else:
            ig_bbva_new_likers = pd.read_csv(f'dataset/{account}/likers/{csv_name}')
            ig_bbva_likers = ig_bbva_likers.append(ig_bbva_new_likers, ignore_index=True)
        i += 1
ig_bbva_likers = ig_bbva_likers.drop_duplicates('username')
ig_bbva_likers.to_csv('./dataset/bbva_likers.csv')

In [154]:
ig_bbva_comments = pd.read_csv('../../data/instagram/bbva_comments.csv', index_col=[0])
ig_bbva_likers = pd.read_csv('../../data/instagram/bbva_likers.csv', index_col=[0])

In [69]:
len(ig_bbva_comments)

3407

In [73]:
len(ig_bbva_likers)

7323

In [33]:
labels = {
    '__label__0': 'neg',
    '__label__1': 'pos'
}
pd.read_csv('pred.csv')['labels'].map(lambda x: labels[x])

0       pos
1       neg
2       neg
3       pos
4       neg
       ... 
3402    neg
3403    neg
3404    neg
3405    neg
3406    pos
Name: labels, Length: 3407, dtype: object

In [185]:
import re 

with open('test.csv', 'w') as f:
    for comment in pd.read_csv('../../data/instagram/bbva_comments.csv', index_col=[0])['text']:
        f.write(re.sub(r'[^\w\s]', '',comment.replace('\n', '') + '\n'))

In [194]:
preds = pd.read_csv('test.csv')
preds['pred'] = pd.read_csv('pred.csv')['labels'].map(lambda x: labels[x])

In [199]:
preds[preds['pred'] == 'pos']
preds

Unnamed: 0,comment,pred
0,READ THIS YOU WILL BE KISSED ON THE NEAREST PO...,pos
1,Quiero todo eso,neg
2,Muy muy muy mal servicio con un problema en ca...,neg
3,FRAUDE,neg
4,SON BBVA INSEGUROS 3 VECES ME DEJARON PLANTADA...,pos
...,...,...
3284,En vez de subir estas estupideces porque no se...,neg
3285,Desastre de banco pesomo servicio no te puedo...,pos
3286,Despidan a todos los ineptos de sus empleados ...,pos
3287,A mí me cerraron arbitrariamente mí cuenta y n...,neg


In [102]:
label_map = {
    'pos': '__label__1',
    'neg': '__label__0',
}
train_set = pd.DataFrame()
train_set['label'] = preds['pred'].map(lambda x: label_map[x])[:100]
train_set['comment'] = preds['comentario'][:1000]

In [104]:
train_set

Unnamed: 0,label,comment
0,__label__1,READ THIS YOU WILL BE KISSED ON THE NEAREST PO...
1,__label__0,Quiero todo eso
2,__label__0,Muy muy muy mal servicio con un problema en ca...
3,__label__1,FRAUDE
4,__label__0,SON BBVA INSEGUROS 3 VECES ME DEJARON PLANTADA...
...,...,...
95,__label__1,iza_lucro
96,__label__1,charensebi
97,__label__1,mariajosealzugaray
98,__label__1,mariliacaraballo


In [193]:
train_set = pd.read_csv('train.csv', index_col=[0], sep=',')
train_set['comment'] = train_set['comment'].map(lambda x: str(x))
train_set

Unnamed: 0_level_0,comment
label,Unnamed: 1_level_1
__label__0,Cordobés porque me la complicaste con el coseno
__label__1,Se imaginan a los chicos agradeciendo por el p...
__label__0,Tengo fiebre
__label__1,Eclesiastes4912 Siempre promesa httpstcoXbr...
__label__0,sooooyderiver dame bola
...,...
__label__1,Y ahora no podré dormir
__label__0,una crema para la cara me dio alergia en la ca...
__label__1,En la Boca Estadio Boca Juniors httpstco8Qds...
__label__0,Jajaja soy un cagon


In [176]:
clean_comments = []
for comment in train_set['comment']:
    #print(comment)
    comment = comment.replace('\n', '')
    comment = comment.replace('__label__0', '0')
    comment = comment.replace('__label__1', '1')
    clean_comments.append(re.sub(r'[^\w\s]', '',comment))

In [177]:
train_set['comment'] = clean_comments

In [179]:
train_set.to_csv('/home/ivanovsky/git/fastText-0.9.2/data/train.csv')
train_set.to_csv('train.csv')