# Data Collection & Cleaning Code for NFT Market Analysis Project

In [30]:
#Initial imports
import requests
import pandas as pd
import numpy as np
import datetime as dt
import os
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import json
import csv


#Imports required for pulling data using OpenSea API and cleaning the data
from helpers import parse_events_data, parse_assets_data, parse_sale_data, parse_listing_data
from pandas_profiling import ProfileReport
import pickle
from statistics import *
from scipy.stats import combine_pvalues
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
plt.style.use('ggplot')
import time
from datetime import date, timedelta, datetime
import glob


In [31]:
#Load API keys file
load_dotenv("OUR_KEYS.env")

True

In [32]:
#Set Crypto Compare API keys for crypto data
cryptocompare_api_key = os.getenv("CRYPTOCOMPARE_API_KEY")
type(cryptocompare_api_key)

str

In [33]:
#Set Alpaca API keys for US stock market data
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")
type(alpaca_api_key)
type(alpaca_secret_key)

str

## Pull Alpaca data using API, save in Pandas dataframe, and clean the data

In [21]:
#Carl's code to go here

In [None]:
#Save the dataframe with the Alpaca stock data as a csv file
stocks_data.to_csv(path_or_buf='Output_csv_data_files/stocks_data.csv')

## Pull Cryptocompare data using API, save in Pandas dataframe, and clean the data

In [44]:
#Carl's code to go here

In [None]:
#Save the dataframe with the crypto data as a csv file
crypto_data.to_csv(path_or_buf='Output_csv_data_files/crypto_data.csv')

## Import Ethereum gas price data, save in Pandas dataframe, and clean the data

In [None]:
#John's code to go here

In [None]:
#Save the dataframe with the gas price data as a csv file
gas_price_data.to_csv(path_or_buf='Output_csv_data_files/gas_price_data.csv')

## Pull OpenSea data using API, save in Pandas dataframe, and clean the data

This section contains code pulled from Alex Duffy's Github which references base code by Adil Moujahid. \
Blog post located here: https://alxdfy.github.io/2021/09/19/data-mining-OpenSea_markdown.html#get-nft \
GitHub repository located here: https://github.com/AlxDfy/OpenSea_API_DataScience

In [34]:
#Set a variable equal to the smart contract address for the desired NFT collection - in this case we'll be evaluating the collection "Cryptopunks"
asset_contract_address = "0x18Df6C571F6fE9283B87f910E41dc5c8b77b7da5"

In [35]:
#Define function to pull NFT properties data from Opensea

def download_asset_info(save_location):
    if not os.path.isdir(save_location):
        os.makedirs(save_location)
    url = "https://api.opensea.io/api/v1/assets"
    listoassets = []

    for i in range(0, 3000):
        querystring = {"token_ids":list(range((i*30), (i*30)+30)),
                       "asset_contract_address":asset_contract_address,
                       "order_direction":"desc",
                       "offset":"0",
                       "limit":"30"}
        response = requests.request("GET", url, params=querystring)

        print(i, end=" ")
        if response.status_code != 200:
            print('error')
            print(response.json())
            break

        # Getting assets data
        assets = response.json()['assets']
        if assets == []:
            break
        # Parsing assets data
        parsed_assets = [parse_assets_data(asset) for asset in assets]
        # storing parsed events data into list
        listoassets.append(parsed_assets)
    
    # Flatten everything into one list
    listoassets = [item for sublist in listoassets for item in sublist]
    # Convert to df
    assets_df = pd.DataFrame(listoassets)

    #Save data in a new file
    with open(save_location + 'assets_df'+str(date.today())+r'.pkl', 'wb') as f:
        pickle.dump(assets_df, f)
        

In [36]:
#Define function to pull NFT sales data from Opensea
# Download sales info from start_date to end _date and save them all into their own day's files
# Default values from start of OpenSea in 2018 to today

def download_sales_info(save_location, start_date = date(2018,7,1), end_date = date.today()):
    if not os.path.isdir(save_location):
        os.makedirs(save_location)
    url = "https://api.opensea.io/api/v1/events"
    # get the number of days that we want to download and save sales for
    delta = end_date - start_date
    count_days = int(delta.days)
    
    for i in range(count_days+1):
        sales_that_day = []
        # set start and end of the day we are checking, if it's today set end to current time
        if date.today() == (start_date + timedelta(days=i)):
            before = datetime.now()
            after = datetime.combine((start_date + timedelta(days=i)), datetime.min.time())
        else:
            before = datetime.combine((start_date + timedelta(days=i+1)), datetime.min.time())
            after = datetime.combine((start_date + timedelta(days=i)), datetime.min.time())
        # There are too many transactions, now have to break them up by chunks in the day
        hour_chunks = 24
        chunk_count = 24/hour_chunks
        time.sleep(.5)

        for chunk in range(int(chunk_count)):
            end = False
            for j in range(0, 35):
                time.sleep(.5)
                # add the hour_chunk to the start of the day (after) time for each chunk
                # use the actual before if we pass it chronologically though
                changed_before = after + timedelta(hours=hour_chunks*(chunk+1)) - timedelta(minutes = 1)
                changed_after = after + timedelta(hours = hour_chunks*(chunk))
                
                # this should only happen on the last chunk of a split day or if on current day
                if before < changed_before:
                    changed_before = before
                    end = True

                querystring = {"asset_contract_address":asset_contract_address,
                               "event_type":"successful",
                               "only_opensea":"true",
                               "offset":j*300,
                               "occurred_before":changed_before,
                               "occurred_after":changed_after,
                               "limit":"300"}
                headers = {"Accept": "application/json"}

                response = requests.request("GET", url, headers=headers, params=querystring)


                print(j, end=" ")
                if response.status_code != 200:
                    print('error')
                    print(response.json())
                    break

                #Getting assets sales data
                event_sales = response.json()['asset_events']

                if event_sales == []:
                    end =True
                    break

                # Parsing asset sales data
                parsed_event_sales = [parse_sale_data(sale) for sale in event_sales]
                # storing parsed events data into list
                sales_that_day.append(parsed_event_sales)
                # check if the last date in the list is the same day as 
                last_date = (datetime.strptime(parsed_event_sales[0]['timestamp'], '%Y-%m-%dT%H:%M:%S'))
                print(last_date)
            if end:
                break
        sales_that_day = [item for sublist in sales_that_day for item in sublist]
        
        print(str(len(sales_that_day))+ " sales saved to" + save_location + "events_sales_list_" + str(start_date + timedelta(days=i))+'.pkl')
        with open(save_location + "events_sales_list_" + str(start_date + timedelta(days=i))+'.pkl', 'wb') as f:
            pickle.dump(sales_that_day, f)

In [37]:
#Define function to pull NFT listings data from Opensea

# Download listings info from start_date to end _date and save them all into their own day's files
# Default values to the start of OpenSea in 2018 to today
def download_listings_info(save_location, start_date = date(2018,7,1), end_date = date.today()):
    if not os.path.isdir(save_location):
        os.makedirs(save_location)
    url = "https://api.opensea.io/api/v1/events"
    # get the number of days that we want to download and save listings for
    delta = end_date - start_date
    count_days = int(delta.days)
    
    for i in range(count_days+1):
        listings_that_day = []
        # set start and end of the day we are checking, if it's today set end to current time
        if date.today() == (start_date + timedelta(days=i)):
            before = datetime.now()
            after = datetime.combine((start_date + timedelta(days=i)), datetime.min.time())
        else:
            before = datetime.combine((start_date + timedelta(days=i+1)), datetime.min.time())
            after = datetime.combine((start_date + timedelta(days=i)), datetime.min.time())
        # There are too many transactions, now have to break them up by chunks in the day
        hour_chunks = 24
        chunk_count = 24/hour_chunks
        time.sleep(.5)

        
        for chunk in range(int(chunk_count)):
            end = False
            for j in range(0, 35):
                time.sleep(.5)
                # add the hour_chunk to the start of the day (after) time for each chunk
                # use the actual before if we pass it chronologically though
                changed_before = after + timedelta(hours=hour_chunks*(chunk+1)) - timedelta(minutes = 1)
                changed_after = after + timedelta(hours = hour_chunks*(chunk))
                
                # this should only happen on the last chunk of a split day or if on current day
                if before < changed_before:
                    changed_before = before
                    end = True

                querystring = {"asset_contract_address":asset_contract_address,
                               "event_type":"created",
                               "only_opensea":"true",
                               "offset":j*300,
                               "occurred_before":changed_before,
                               "occurred_after":changed_after,
                               "limit":"300"}
                headers = {"Accept": "application/json"}

                response = requests.request("GET", url, headers=headers, params=querystring)


                print(j, end=" ")
                if response.status_code != 200:
                    print('error')
                    print(response.json())
                    break

                #Getting assets listings data
                event_listings = response.json()['asset_events']

                if event_listings == []:
                    end = True
                    break

                # Parsing events listings data
                parsed_event_listings = [parse_listing_data(listing) for listing in event_listings]
                # storing parsed events data into list
                listings_that_day.append(parsed_event_listings)
                # check if the last date in the list is the same day as 
                print(parsed_event_listings[0]['created_date'])
            if end:
                break
        listings_that_day = [item for sublist in listings_that_day for item in sublist]
        
        print(str(len(listings_that_day))+ " listings saved to" + save_location +
              "events_listings_list_" + str(start_date + timedelta(days=i))+'.pkl')
        with open(save_location + "events_listings_list_" + str(start_date + timedelta(days=i))+'.pkl', 'wb') as f:
            pickle.dump(listings_that_day, f)

In [None]:
#Execute the pull requests for the properties and sales data for the NFTs

# Change location to suit your own needs
save_location = "./static/animetas/"

# assets
download_asset_info(save_location)

# SALES
# download sales info from start_date to end_date e.g. date(2018,7,1), date.today() - timedelta(days=1), etc.
# defaults to start of OpenSea in 2018 to today

download_sales_info(save_location = save_location, start_date = date(2018,7,1))

# LISTINGS
# download listings info from start_date to end_date e.g. date(2021, 7, 30), date.today() - timedelta(days=1), etc.
# defaults to first day of listings to today

download_listings_info(save_location = save_location, start_date = date(2018,7,1))

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 27

In [7]:
# load the sales lists, combine them, and turn into a DF
def load_sales_info(save_location):
    files = [filename for filename in os.listdir(save_location) if filename.startswith('events_sales')]
    all_sales = []
    # load all files for sales by day
    for file in files:
        with open(str(save_location) + str(file), 'rb') as f:
            all_sales.append(pickle.load(f))
    
    # flatten into one list
    all_sales = [item for sublist in all_sales for item in sublist]
    # convert to dataframe
    events_sales_df = pd.DataFrame(all_sales)
    
    return events_sales_df

# load the listing lists, combine them, and turn into a DF
def load_listings_info(save_location):
    files = [filename for filename in os.listdir(save_location) if filename.startswith('events_listing')]
    all_listings = []
    # load all files for listings by day
    for file in files:
        with open(str(save_location) + str(file), 'rb') as f:
            all_listings.append(pickle.load(f))
    
    # flatten into one list
    all_listings = [item for sublist in all_listings for item in sublist]
    # convert to dataframe
    events_listings_df = pd.DataFrame(all_listings)
    
    return events_listings_df

# load most recent saved assets df
def load_assets_info(save_location):
    files = glob.glob(str(save_location)+'assets_df????-??-??.pkl')
    with open(max(files, key=os.path.getctime), 'rb') as f:
        return pickle.load(f)

In [8]:
#load all our saved files

#SALES
events_sales_df = load_sales_info(save_location)
# Pre-processing
# Convert price from WEI to ETH & for now get rid of bundles and duplicates(?)
events_sales_df = events_sales_df[(events_sales_df['payment_token'] != 'USDC') & (events_sales_df['is_bundle'] == False)].copy()
events_sales_df = events_sales_df.loc[events_sales_df.astype(str).drop_duplicates().index]
events_sales_df['total_price'] = events_sales_df['total_price']/10.**18
# Change timestamp to datetime
events_sales_df['timestamp'] = pd.to_datetime(events_sales_df['timestamp'])
# Calculating the sale prices in USD
events_sales_df['total_price_usd'] = events_sales_df['total_price'] * events_sales_df['usd_price']


#LISTINGS
events_listings_df = load_listings_info(save_location)
# Pre-processing
# Convert price from WEI to ETH & for now get rid of bundles and duplicates(?)
events_listings_df = events_listings_df[(events_listings_df['payment_token'] != 'USDC') & (events_listings_df['is_bundle'] == False)].copy()
events_listings_df = events_listings_df.loc[events_listings_df.astype(str).drop_duplicates().index]
events_listings_df['starting_price'] = events_listings_df['starting_price']/10.**18
# Change timestamp to datetime
events_listings_df['created_date'] = pd.to_datetime(events_listings_df['created_date'])
# Calculating the sale prices in USD
events_listings_df['total_price_usd'] = events_listings_df['starting_price'] * events_listings_df['usd_price']


#ASSETS
assets_df = load_assets_info(save_location)

NameError: name 'save_location' is not defined

In [9]:
#Save the dataframes with the assets, sales, and listings as a csv files
assets_df.to_csv(path_or_buf='Output_csv_data_files/nft_assets_data.csv')
events_sales_df.to_csv(path_or_buf='Output_csv_data_files/nft_sales_data.csv')
events_listings_df.to_csv(path_or_buf='Output_csv_data_files/nft_listings_data.csv')

NameError: name 'assets_df' is not defined

In [12]:
#Create list of top 100 NFT collections in OpenSea
collections_list = [
    "cryptopunks",
    "boredapeyachtclub",
    "mutantapeyachtclub",---
    "edificebybenkovach",---
    "thesandbox",
    "cosmiclabs",
    "parallelalpha",
    "divineanarchy",
    "corruption(s*)",---
    "artwars|aw",---
    "neotokyoidentities",---
    "cryptoadzbygremplin",
    "coolcatsnft",
    "neotokyopart2vaultcards",---
    "decentraland",
    "doodles-official",
    "mekaverse",
    "neotokyopart3itemcaches",---
    "angryapesunited",
    "cyberkongz",
    "thedogepound",---
    "therealgoatsociety",
    "sipherianflash",
    "desperateapewives",---
    "playboyrabbitarsofficial",---
    "theshiboshis",
    "partyape|billionairesclub",---
    "boredapechemistryclub",---
    "treeverse",
    "veefriends",
    "fidenzabytylerhobbs"---
    "meebits",
    "emblemvault[ethereum]",---
    "bearxlabs",
    "smilesssvrs)",
    "fatapeclub",---
    "junglefreaksbytrosley",
    "boredapekennelclub",---
    "punkscomic",---
    "bearsdeluxe",---
    "kaijukingz",
    "chainrunners",
    "ringersbydmitricherniak",---
    "cryptomories",
    "lostpoets",
    "voxcollectibles",---
    "spookyboyscountryclub|byholyghost",---
    "zedrun",---
    "lazylions",
    "chromiesquigglebysnowfro",---
    "0n1force",---
    "mutantcats",
    "10ktfstockrom",---
    "bossbeauties",
    "worldofwomen",
    "thorguards",
    "eponymbyartai",---
    "namewee4896collection",
    "superfarmgenesisseries",---
    "cryptovoxels",
    "knownorigin",---
    "flufworld",---
    "ens",
    "hor1zontroopers",---
    "artblocksfactory",---
    "boonjiproject",
    "superrare",
    "creatureworldnft",---
    "supducks",
    "elliotradesnftcollection"---
    "themetakey",---
    "digitalobjectsartwork",---
    "twinflames",
    "apeharmonymonsterclub",
    "loot",
    "quantum",
    "headdao",
    "thecurrency",
    "pudgypenguins",
    "dogesoundclubmates",---
    "mastershaartist",---
    "impacttheoryfounder'skey",---
    "dogsofelon",---
    "fewociousxrtfkt",---
    "furballs",
    "rtfktxjeffstaple",---
    "adambombsquad",---
    "cyberkongzvx",---
    "non-fungiblefungimintpass",---
    "officialwrappedmooncats(acclimated)",---
    "mycuriocards",---
    "mirandus",
    "rarible",
    "galaxyfightclub",---
    "thefungiblebypak",---
    "galacticapes",
    "metaherouniverse",
    "rtfktbonusitems",---
    "sneakyvampiresyndicate",---
    "2+2genesis"---
]

SyntaxError: invalid syntax (Temp/ipykernel_17844/4043542140.py, line 103)

In [None]:
#Create OpenSea API URL
opensea_url = f"https://api.opensea.io/api/v1/collection/{collections}/stats"
headers = {"Accept": "application/json"}

#Create function to request data using API
def get_data_by_collection(collections):
    opensea_url = f"https://api.opensea.io/api/v1/collection/{collections}/stats"
    opensea_response_data = requests.request("GET", opensea_url, headers=headers).json()
    return json.dumps([opensea_response_data], indent=4)

#Loop through the list of NFT collections to pull data for each collection
collection_data_list = []
for c in collections_list:
    collections_data = get_data_by_collection(c)
    collection_data_list.append(collections_data)

#Write the data to a csv file
with open("collection_data_list", 'w') as l:
    write = csv.writer(l)
    write.writerow(collection_data_list)