In [1]:
# import required libraries
import requests as rq
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

In [7]:
import sys
import subprocess
#sys.executable
subprocess.check_call([sys.executable, "-m", "pip", "install", "category_encoders"])

0

In [11]:
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "category_encoders", "scikit-learn"])

0

In [8]:
pip show category_encoders

Name: category_encoders
Version: 2.8.1
Summary: A package for encoding categorical variables for machine learning
Home-page: 
Author: PaulWestenthanner
Author-email: paul@westenthanner.dev
License: BSD-3
Location: C:\Users\Arfan Shah\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: numpy, pandas, patsy, scikit-learn, scipy, statsmodels
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [2]:
# url of API
url = 'https://api.coingecko.com/api/v3/coins/markets'

# create an empty list to store data in loop
all_data = []

# iterate over a loop and get more data
for page in range(1, 50):

    # define parameters
    params = {
        'vs_currency': 'usd',
        'order': 'market_cap_desc',
        'per_page': 250,
        'page': page,
        'sparkline': False
    }

    # make a connection to API
    api_response = rq.get(url, params = params)

    # check the response
    if api_response.status_code == 200:
        # get data in json format
        data = api_response.json()

        # check if data is not present just break
        if not data:
            print(f"Data not found in {page}")
            break
        # data addition to list
        all_data.extend(data)
        time.sleep(1.5)
    elif api_response.status_code == 429:
        print(f"Rate limit hit on page {page}. waiting 60 seconds before retrying...")
        time.sleep(60)
        continue
    else:
        print(f"Failed to fetch page {page}: {api_response.status_code}")
        break

# pandas dataframe
dataset = pd.DataFrame(all_data)

# extract the needy data
dataset = dataset[['id', 'symbol', 'current_price', 'market_cap', 'total_volume']]

print(dataset)
# shape
print("Dataset dimensions: ", dataset.shape)

Rate limit hit on page 6. waiting 60 seconds before retrying...
Rate limit hit on page 9. waiting 60 seconds before retrying...
Rate limit hit on page 14. waiting 60 seconds before retrying...
Rate limit hit on page 19. waiting 60 seconds before retrying...
Rate limit hit on page 25. waiting 60 seconds before retrying...
Rate limit hit on page 32. waiting 60 seconds before retrying...
Rate limit hit on page 37. waiting 60 seconds before retrying...
Rate limit hit on page 41. waiting 60 seconds before retrying...
Rate limit hit on page 46. waiting 60 seconds before retrying...
               id      symbol  current_price    market_cap  total_volume
0         bitcoin         btc   95531.000000  1.897194e+12  1.348921e+10
1        ethereum         eth    1831.890000  2.211723e+11  7.557637e+09
2          tether        usdt       1.000000  1.493621e+11  1.214679e+10
3          ripple         xrp       2.190000  1.281351e+11  1.305868e+09
4     binancecoin         bnb     593.260000  8.6545

In [3]:
# display few rows
dataset.head()

Unnamed: 0,id,symbol,current_price,market_cap,total_volume
0,bitcoin,btc,95498.0,1896525000000.0,13648870000.0
1,ethereum,eth,1832.45,221232500000.0,7458328000.0
2,tether,usdt,1.0,149356600000.0,13739990000.0
3,ripple,xrp,2.19,128121800000.0,1146658000.0
4,binancecoin,bnb,595.19,86829960000.0,377955000.0


In [4]:
# find shape of final_dataset
dataset.shape

(10250, 5)

In [5]:
# find any null values
dataset.isnull().sum()

id               0
symbol           0
current_price    2
market_cap       0
total_volume     0
dtype: int64

In [6]:
# find type of dataset
type(dataset)

pandas.core.frame.DataFrame

In [7]:
# we have just two null values, so i need to remove those columns as they are very few
dataset = dataset[dataset['current_price'].notna()]

In [8]:
# now check the null values
dataset.isnull().sum()

id               0
symbol           0
current_price    0
market_cap       0
total_volume     0
dtype: int64

In [9]:
# now find dataset's shape
dataset.shape

(10248, 5)

In [10]:
# check data types in dataset
dataset.dtypes

id                object
symbol            object
current_price    float64
market_cap       float64
total_volume     float64
dtype: object

In [11]:
# we have two objects lets find unique values in each column
dataset.id.nunique()

9817

In [12]:
# check unique values in second column
dataset.symbol.nunique()

8090

In [13]:
# columns
dataset.columns

Index(['id', 'symbol', 'current_price', 'market_cap', 'total_volume'], dtype='object')

In [19]:
# apply encoding to deal with the high number of categories in id and symbol
# Define features and target (e.g., predicting current_price)
x = dataset[['id', 'symbol', 'market_cap', 'current_price']]
y = dataset['total_volume']

y.head()

0    1.364887e+10
1    7.458328e+09
2    1.373999e+10
3    1.146658e+09
4    3.779550e+08
Name: total_volume, dtype: float64

In [22]:
# Split data to avoid leakage
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Apply target encoding
encoder = TargetEncoder(cols=['id', 'symbol'])
x_train_encoded = encoder.fit_transform(x_train, y_train)
x_test_encoded = encoder.transform(x_test)
x_train_encoded.head()

NameError: name 'TargetEncoder' is not defined