## Project Requirements:
Scrape profiles and get:
1. the number of posts
2. the follower count
3. number of likes on most liked post
4. numbner of likes

## Follow Up
Save each profile in its own data frame with each row corresponding to a post <br>
export as a excel file with each profile as a worksheet <br>
find a way to edit so that the sheet includes some global account variables (ex. number of followers) outside the dataframe

In [1]:
from instagramy import InstagramUser
from instascrape import Profile, scrape_posts, Post
from selenium.webdriver import Chrome
from datetime import datetime
import pandas as pd
import numpy as np

In [3]:
chrome_driver_path = "/Users/chena23/Desktop/InstragramScraper/chromedriver"
driver = Chrome(chrome_driver_path)


In [4]:
headers = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36 Edg/87.0.664.57",
    "cookie": "sessionid=7320119797%3Ae0j0OTolSUydwP%3A10"
}

In [5]:
def get_top_post (scraped_posts, cut_off_date = datetime(2021, 8,1)):
    # Checking if an empty array was inputed
    if (type(scraped_posts) != list):
        raise TypeError("Input is not a list")
    if (len(scraped_posts) == 0):
        raise ValueError("Empty posts list imported")
    
    #Returning Post with the highest number of likes
    top_post = scraped_posts[0]
    
    for post in scraped_posts:
        if (type(post) != Post):
            raise TypeError("One or more objects in the list is not a Post object")
        if (datetime.fromtimestamp(post.timestamp) < cut_off_date):
            if (post.likes > top_post.likes):
                top_post = post
        else:
            print("Post posted on {}, after the deadline".format(datetime.fromtimestamp(post.timestamp)))

    return top_post
        
    


In [6]:
def check_valid_username (users):
    for i in range (0, len(users)):
        if (not pd.isnull(users.iloc[i]['IG Username'])):
            try:
                print(users.iloc[i]["IG Username"])
                profile = Profile(users.iloc[i]['IG Username'])
            except:
                print("{} is not a valid user name, the username is located at index {}, its name is {}".format(users.iloc[i]["IG Username"], i, users.iloc[i]['School']))

In [7]:
def create_post_df (scraped_posts):
    #Create Data Frame with collumns for post attributes
    df = pd.DataFrame(columns=['post_url', 'date_posted', 'number_of_likes', 'number_of_comments'])

    #Declaring types for each collumn
    df['post_url'] = df['post_url'].astype(object)
    df['date_posted'] = df['date_posted'].astype('datetime64[ns]')
    df['number_of_likes'] = df['number_of_likes'].astype('Int64')
    df['number_of_comments'] = df['number_of_comments'].astype('Int64')
    
    for i in range(0, len(scraped_posts)):
        cur_post = scraped_posts[i]
        df.at[i, 'post_url'] = "instagram.com/p/{}".format(cur_post.shortcode)
        df.at[i, 'date_posted'] = datetime.fromtimestamp(cur_post.timestamp)
        df.at[i, 'number_of_likes'] = cur_post.likes
        df.at[i, 'number_of_comments'] = cur_post.comments
    
    #return filled dataframe
    return df


In [8]:
users = pd.read_csv("Social_Media_Study_Handles.csv")
users = users.reindex(columns=[*users.columns, 'follower_count', 'number_of_posts', 'top_post_likes', 
'top_post_date_posted', 'top_post_url', 'first_post_likes', 'first_post_date_posted', 'first_post_url'])

#Specifying datatype for each columns
users['School'] = users['School'].astype(object)
users['IG Username'] = users['IG Username'].astype(object)
users['follower_count'] = users['follower_count'].astype('Int64')
users['number_of_posts'] = users['number_of_posts'].astype('Int64')
users['top_post_likes'] = users['top_post_likes'].astype('Int64')
users['top_post_date_posted'] = users['top_post_date_posted'].astype('datetime64[ns]')
users['top_post_url'] = users['top_post_url'].astype(object)
users['first_post_likes'] = users['first_post_likes'].astype('Int64')
users['first_post_date_posted'] = users['first_post_date_posted'].astype('datetime64[ns]')
users['first_post_url'] = users['first_post_url'].astype(object)

In [33]:
users[users['IG Username'].isna()]

Unnamed: 0,School,IG Username,follower_count,number_of_posts,top_post_likes,top_post_date_posted,top_post_url,first_post_likes,first_post_date_posted,first_post_url
83,National Capital Consortium,,,,,NaT,,,NaT,
89,Larkin,,,,,NaT,,,NaT,
95,Icahn School of Medicine at Mount Sinai,,,,,NaT,,,NaT,
100,Howard,,,,,NaT,,,NaT,
104,University of South Carolina,,,,,NaT,,,NaT,
105,San Antonio Uniformed Services Health Educatio...,,,,,NaT,,,NaT,
108,BronxCare Health System,,,,,NaT,,,NaT,
110,University of Arizona - Tucson (South Campus),,,,,NaT,,,NaT,
112,University of Puerto Rico,,,,,NaT,,,NaT,


In [35]:
print(not pd.isnull(users.iloc[89]['IG Username']))

False


In [36]:
users.head(5).append(users.iloc[89])

Unnamed: 0,School,IG Username,follower_count,number_of_posts,top_post_likes,top_post_date_posted,top_post_url,first_post_likes,first_post_date_posted,first_post_url
0,TJU/Wills Eye Hospital,willseyeresidents,,,,NaT,,,NaT,
1,Bascom Palmer/University of Miami,bascompalmereye,,,,NaT,,,NaT,
2,Johns Hopkins/Wilmer,wilmereyeresidents,,,,NaT,,,NaT,
3,University of Iowa,uiowaeye,,,,NaT,,,NaT,
4,Mass. Eye and Ear,harvardophthalmologyresidents,,,,NaT,,,NaT,
89,Larkin,,,,,NaT,,,NaT,


In [40]:
#test purposes
users = users.head(5).append(users.iloc[89])

for i in range (0, len(users)):
    if (not pd.isnull(users.iloc[i]['IG Username'])):
        #Scrape the profile
        print(users.iloc[i]['IG Username'])
        profile = Profile(users.iloc[i]['IG Username'])
        profile.scrape(headers=headers)

        #Adding profile datapoints to dataframe
        users.at[i, 'follower_count'] = profile.followers
        users.at[i, 'number_of_posts'] = profile.posts

        #Scraping Posts
        posts = profile.get_posts(webdriver=driver)
        scraped_posts, unscraped = scrape_posts(posts[1:10], webdriver=driver, silent=False, headers=headers, pause=10)
        top_post = get_top_post(scraped_posts)

        #Post are scraped from most recent to oldest, therefore, the earliest post will be the last one scraped in theory
        first_post = scraped_posts[-1]

        users.at[i, 'top_post_likes'] = top_post.likes
        users.at[i, 'top_post_date_posted'] = datetime.fromtimestamp(top_post.timestamp)
        users.at[i, 'top_post_url'] = "instagram.com/p/{}".format(top_post.shortcode)
        users.at[i, 'first_post_likes'] = first_post.likes
        users.at[i, 'first_post_date_posted'] = datetime.fromtimestamp(first_post.timestamp)
        users.at[i, 'first_post_url'] = "instagram.com/p/{}".format(first_post.shortcode)

        

willseyeresidents
0: CC6_LwDhnTu - 2020-07-21 18:56:19
1: CC4Kk8ZBGIc - 2020-07-20 16:38:09
2: CCtc94uh_SW - 2020-07-16 12:47:12
3: CCqrp49B5j1 - 2020-07-15 10:57:49
4: CCjOYsGBwWO - 2020-07-12 13:27:23
5: CCecas4hdY- - 2020-07-10 16:53:47
6: CCHSdAehmf7 - 2020-07-01 17:04:11
7: CB3xPaehw43 - 2020-06-25 16:25:22
8: CBy5lzbBlq8 - 2020-06-23 19:02:08
bascompalmereye
0: CUvbUicr4PE - 2021-10-07 16:36:23
1: CUs_jaeA_8y - 2021-10-06 17:55:16
2: CUaZsnrgg9F - 2021-09-29 12:38:09
3: CUVhQdtNU8A - 2021-09-27 15:08:01
4: CUFnunzAZwv - 2021-09-21 11:00:13
5: CT9fDQ9AcnG - 2021-09-18 07:09:35
6: CTxNPbhgE8k - 2021-09-13 12:40:27
7: CTXlhvBrOZw - 2021-09-03 13:52:24
8: CTJCCrZgrnc - 2021-08-28 22:18:17
Post posted on 2021-10-07 16:36:23, after the deadline
Post posted on 2021-10-06 17:55:16, after the deadline
Post posted on 2021-09-29 12:38:09, after the deadline
Post posted on 2021-09-27 15:08:01, after the deadline
Post posted on 2021-09-21 11:00:13, after the deadline
Post posted on 2021-09-18

In [38]:
users

Unnamed: 0,School,IG Username,follower_count,number_of_posts,top_post_likes,top_post_date_posted,top_post_url,first_post_likes,first_post_date_posted,first_post_url
0,TJU/Wills Eye Hospital,willseyeresidents,1977.0,33.0,141.0,2020-07-20 16:38:09,instagram.com/p/CC4Kk8ZBGIc,57.0,2020-06-23 19:02:08,instagram.com/p/CBy5lzbBlq8
1,Bascom Palmer/University of Miami,bascompalmereye,12454.0,239.0,363.0,2021-10-07 16:36:23,instagram.com/p/CUvbUicr4PE,120.0,2021-08-28 22:18:17,instagram.com/p/CTJCCrZgrnc
2,Johns Hopkins/Wilmer,wilmereyeresidents,1511.0,33.0,179.0,2021-02-02 15:33:59,instagram.com/p/CKzaswbhAF7,107.0,2021-01-17 12:48:47,instagram.com/p/CKJ7EzWh1yB
3,University of Iowa,uiowaeye,1470.0,131.0,,NaT,,,NaT,
4,Mass. Eye and Ear,harvardophthalmologyresidents,,,,NaT,,,NaT,
89,Larkin,,,,,NaT,,,NaT,


In [9]:
test_profile = Profile("willseyeresidents")
test_profile.scrape(headers=headers)

In [10]:
test_profile.posts

33

In [11]:
posts = test_profile.get_posts(webdriver=driver, login_first=True, login_pause=20)

In [None]:
len(posts)

33

In [12]:
scraped, unscraped = scrape_posts(posts, webdriver=driver, silent=False, headers=headers, pause=10)

0: CDABF2hhEhP - 2020-07-23 17:49:11
1: CC6_LwDhnTu - 2020-07-21 18:56:19
2: CC4Kk8ZBGIc - 2020-07-20 16:38:09
3: CCtc94uh_SW - 2020-07-16 12:47:12
4: CCqrp49B5j1 - 2020-07-15 10:57:49
5: CCjOYsGBwWO - 2020-07-12 13:27:23
6: CCecas4hdY- - 2020-07-10 16:53:47
7: CCHSdAehmf7 - 2020-07-01 17:04:11
8: CB3xPaehw43 - 2020-06-25 16:25:22
9: CBy5lzbBlq8 - 2020-06-23 19:02:08
10: CBwQp8qhq3y - 2020-06-22 18:25:57
11: CBtTXidBM4k - 2020-06-21 14:51:56
12: CBqW91ohOfD - 2020-06-20 11:25:39
13: CBoCRLXhQex - 2020-06-19 13:46:18
14: CBlvolqAn4t - 2020-06-18 16:25:00
15: CBjLzmHBJxE - 2020-06-17 16:33:27
16: CANgGIPAQc0 - 2020-05-15 09:56:03
17: CALFcpLAZq1 - 2020-05-14 11:24:43
18: B_lFRQnBz3j - 2020-04-29 17:12:01
19: B-sAg1yhXLQ - 2020-04-07 13:13:45
20: B9Z5wFmBtvD - 2020-03-06 14:56:56
21: B9HGomcBRkn - 2020-02-28 07:43:57
22: B8nbgg2hDsc - 2020-02-16 00:30:39
23: B7yyPaVB9wF - 2020-01-26 13:49:32
24: B6UcX8CB3J_ - 2019-12-20 22:29:51
25: B6TRZBrBHoW - 2019-12-20 11:34:38
26: B5l3vIoBB4d - 2019

In [13]:
test_df = create_post_df(scraped)
test_df

Unnamed: 0,post_url,date_posted,number_of_likes,number_of_comments
0,instagram.com/p/CDABF2hhEhP,2020-07-23 17:49:11,114,3
1,instagram.com/p/CC6_LwDhnTu,2020-07-21 18:56:19,95,3
2,instagram.com/p/CC4Kk8ZBGIc,2020-07-20 16:38:09,141,7
3,instagram.com/p/CCtc94uh_SW,2020-07-16 12:47:12,76,4
4,instagram.com/p/CCqrp49B5j1,2020-07-15 10:57:49,77,0
5,instagram.com/p/CCjOYsGBwWO,2020-07-12 13:27:23,51,1
6,instagram.com/p/CCecas4hdY-,2020-07-10 16:53:47,62,0
7,instagram.com/p/CCHSdAehmf7,2020-07-01 17:04:11,49,0
8,instagram.com/p/CB3xPaehw43,2020-06-25 16:25:22,65,4
9,instagram.com/p/CBy5lzbBlq8,2020-06-23 19:02:08,57,1


In [None]:
len(scraped)

33

In [None]:
top_test_post = get_top_post(scraped)
print(top_test_post.likes, "instagram.com/p/{}".format(top_test_post.shortcode))

141 instagram.com/p/CC4Kk8ZBGIc
