# Data Cleaning

## REKT Database from DEFIYIELD

In [308]:
import pandas as pd             #pandas for using dataframe and reading csv file(s)
import json
import numpy as np              #numpy for vector operations and basic maths
import matplotlib.pyplot as plt #for plotting
%matplotlib inline              
import seaborn as sns           #for making plots
import math                     #for basic math operations
import warnings
# from pandas.plotting import parallel_coordinates #for multivariate plots
warnings.filterwarnings('ignore') #ignore deprecation warnings

In [309]:
# Importing data: read the raw REKT database JSON dump and load it into a DataFrame.

# The context manager guarantees the file handle is closed even if JSON parsing fails;
# the encoding is stated explicitly so the read does not depend on the platform default.
with open('../../data/Raw Data/Python_REKT_Database_API/REKT_Database_Python_API.json', encoding='utf-8') as open_REKTjson:
    load_REKTjson = json.load(open_REKTjson)
REKT_df = pd.DataFrame(load_REKTjson)

REKT_df.head(5) # visualize first 5 instances of raw data

Unnamed: 0,id,project_name,description,name_categories,token_name,proof_archive_link,technical_issue,token_address,token_addresses,logo_link,...,discord,bug_bounty_program_link,bug_bounty_program_company,audit_code_conf,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy
0,3058,Terra Classic,<p><strong>Quick Summary</strong></p><p>A comp...,Stablecoin,"LUNC, USTC",https://twitter.com/OnChainWizard/status/15241...,,"0x7e43d25EaD96B1058f671F6690ea705BA2C7e5B9, 0x...",[],safe/files/scamDatabase/logo/62b31ccc0d07e.jpeg,...,https://twitter.com/terra_money,,,0.0,0.0,1.0,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 1003, 'scam_database_id': 305...",[{'audit_link': 'safe/files/audit/pdf/CertiK_A...
1,2762,Africrypt,<p><strong>Quick Summary</strong></p><p>Ameer ...,CeFi,,,,,[],safe/files/scamDatabase/logo/61e049352a11a.png,...,,,,,0.0,1.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]
2,2878,PlusToken,<p><strong>Quick Summary</strong></p><p>The pe...,CeFi,,,,,[],safe/files/scamDatabase/logo/62432b9d39213.png,...,,,,,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]
3,2861,Thodex,<p><strong>Quick Summary</strong></p><p>Thodex...,CeFi,,,,,[],safe/files/scamDatabase/logo/6241c6cb5dadc.jpeg,...,,,,,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]
4,2735,BitConnect,<p><strong>Quick Summary</strong></p><p>BitCon...,"Borrowing and Lending,CeFi",BCC,,,,[],safe/files/scamDatabase/logo/61bb58e866760.jpeg,...,,,,,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]


In [310]:
REKT_df.tail(5) # visualize last 5 instances of raw data

Unnamed: 0,id,project_name,description,name_categories,token_name,proof_archive_link,technical_issue,token_address,token_addresses,logo_link,...,discord,bug_bounty_program_link,bug_bounty_program_company,audit_code_conf,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy
3071,78,BoobsFinance,,,BOOBS,,,0xf98f73350d083005079b6c3cda9c99cfdb668be7,[],safe/files/scamDatabase/logo/60d50974c47c8.png,...,,,,,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 78, '...",[]
3072,77,Chad Token,<p>The contract owner could disable the transf...,Token,CHAD,,,0x2f7383de70c972ef8c18565a9da023b08c110f2c,[],safe/files/scamDatabase/logo/60d508fa757bf.png,...,,,,,,,"{'id': 7, 'type': 'Honeypot'}",{},"[{'networks_id': 56, 'scam_database_id': 77, '...",[]
3073,76,Generate Finance,"<p><span data-sheets-userformat='{""2"":15165,""3...",,GEN,,,0x04ad13a645748cee762f11e43386fe2a275885b4,[],safe/files/scamDatabase/logo/60d50819f1be1.jpeg,...,,,,,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 76, '...",[]
3074,72,Pill Finance,"<p><span data-sheets-userformat='{""2"":15165,""3...",,RED-P,,,0x137faad0d13813ef8d4cbbb336f0e01066b2c9b4,[],safe/files/scamDatabase/logo/60d5003a01e5a.jpeg,...,,,,,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 72, '...",[]
3075,55,SoftdrinkSwap,<p>The team abandoned the project. The team st...,,COLA SODA,,,0x740426e58a10ac3b6724f753c8468288248137f9,[],safe/files/scamDatabase/logo/60df352021e17.png,...,,,,,,,"{'id': 5, 'type': 'Abandoned'}",{},"[{'networks_id': 56, 'scam_database_id': 55, '...",[]


In [311]:
#finding out the shape of the data using "shape" variable: Output (rows, columns)
REKT_df.shape

(3076, 35)

In [312]:
#Printing all the columns present in data
REKT_df.columns

Index(['id', 'project_name', 'description', 'name_categories', 'token_name',
       'proof_archive_link', 'technical_issue', 'token_address',
       'token_addresses', 'logo_link', 'date', 'scam_updates', 'proof_link',
       'website_link', 'webarchive_link', 'twitter_link', 'telegram_link',
       'our_post_link', 'funds_lost', 'funds_returned', 'funds_by_chains',
       'funds_recovered', 'active', 'git_hub', 'git_hub_contract_link',
       'discord', 'bug_bounty_program_link', 'bug_bounty_program_company',
       'audit_code_conf', 'is_verified_source_code', 'is_public_team',
       'scam_type', 'network', 'scamNetworks', 'auditedBy'],
      dtype='object')

In [313]:
# Exporting first 5 rows of Uncleaned DF to HTML for visualization in portfolio

from IPython.display import HTML

html = REKT_df.head(5).to_html()
# `with` closes the file even if the write raises; UTF-8 is stated explicitly because
# the descriptions contain non-ASCII characters and the platform default encoding varies.
with open("Raw_REKT_DF.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

In [314]:
#Checking for NaN values present in data
REKT_df.isna().sum() # also REKT_df.isnull().sum()

id                               0
project_name                     0
description                    285
name_categories                769
token_name                     199
proof_archive_link            2817
technical_issue               3072
token_address                  214
token_addresses                  0
logo_link                        0
date                          1876
scam_updates                     0
proof_link                     191
website_link                  2239
webarchive_link               2849
twitter_link                  2535
telegram_link                 2646
our_post_link                 3036
funds_lost                       0
funds_returned                   0
funds_by_chains                  0
funds_recovered               3074
active                           0
git_hub                       2987
git_hub_contract_link         3050
discord                       2936
bug_bounty_program_link       3074
bug_bounty_program_company    3075
audit_code_conf     

In [315]:
# Removing spaces at beginning and at the end, if any, from column names
REKT_df.columns = REKT_df.columns.str.strip()

## Dropping Variables That Do Not Assist in Analysis

Most of the variables that we shall drop from our 35 columns are links to external sites, such as web archive, discord, and github. Moreover, these variables contain more than 2900 NaN values out of a total observation count of 3076. However, the variable proof_link could be important if I decide to scrape text data from the linked article about a crypto attack. We shall also get rid of the technical_issue field because it only contains 4 non-NaN values and, more importantly, does not have any insightful use. Therefore, I find it sensible to entirely remove these fields, instead of getting rid of their existing NaN values, for EDA and modeling purposes.

In [316]:
# Columns removed before analysis: mostly links to external sites, plus other
# fields that are not needed for EDA/modeling.
columns_to_drop = [
    'id',
    'technical_issue',
    'proof_archive_link',
    'logo_link',
    'website_link',
    'twitter_link',
    'our_post_link',
    'telegram_link',
    'git_hub',
    'git_hub_contract_link',
    'discord',
    'bug_bounty_program_link',
    'bug_bounty_program_company',
    'audit_code_conf',
    'funds_recovered',
    'funds_by_chains',
    'scam_updates',
]
REKT_df.drop(columns=columns_to_drop, inplace=True)

# (rows, columns) remaining after the drop
REKT_df.shape

(3076, 18)

In [317]:
REKT_df.columns

Index(['project_name', 'description', 'name_categories', 'token_name',
       'token_address', 'token_addresses', 'date', 'proof_link',
       'webarchive_link', 'funds_lost', 'funds_returned', 'active',
       'is_verified_source_code', 'is_public_team', 'scam_type', 'network',
       'scamNetworks', 'auditedBy'],
      dtype='object')

## Variable Identification and Typecasting

In [318]:
# A closer look at the data types present in the data
REKT_df.dtypes

project_name                object
description                 object
name_categories             object
token_name                  object
token_address               object
token_addresses             object
date                        object
proof_link                  object
webarchive_link             object
funds_lost                   int64
funds_returned               int64
active                       int64
is_verified_source_code    float64
is_public_team             float64
scam_type                   object
network                     object
scamNetworks                object
auditedBy                   object
dtype: object

There are a lot of variables visible at once, so let's narrow this down by looking **at one datatype at a time**. We will start with the **int64** data type.


### Integer Data Type

In [319]:
#Identifying variables with integer datatype
REKT_df.dtypes[REKT_df.dtypes == 'int64']

funds_lost        int64
funds_returned    int64
active            int64
dtype: object

In [320]:
REKT_df['active'].value_counts()

1    3076
Name: active, dtype: int64

Summary:

*    **id** is a unique, nominal code indicating the token/coin associated with the crypto attack. Converting it to category type would not be beneficial due to the large number of unique tokens present in the database. **This variable would be better stored as object/string type**; however, since **id** was already dropped above as not useful for analysis, no conversion is needed here.

*    **active** most probably represents whether the crypto project is currently active in the market. I perused the API documentation to try to find this response variable's significance, but could not. Moreover, it takes the value 1 for ALL observations, which would mean that all crypto projects present in the database are still active. We could keep this variable for now, but **converting it to category would be better, as it would most likely take on only two values: 1 (active) or 0 (inactive).**

In [321]:
# `active` is a low-cardinality flag, so store it with the category dtype
REKT_df = REKT_df.astype({'active': 'category'})

# confirm the new dtype of `active`
REKT_df.dtypes

project_name                 object
description                  object
name_categories              object
token_name                   object
token_address                object
token_addresses              object
date                         object
proof_link                   object
webarchive_link              object
funds_lost                    int64
funds_returned                int64
active                     category
is_verified_source_code     float64
is_public_team              float64
scam_type                    object
network                      object
scamNetworks                 object
auditedBy                    object
dtype: object

### Object Data Type

In [322]:
#Identifying variables with object datatype
REKT_df.dtypes[REKT_df.dtypes == 'object']

project_name       object
description        object
name_categories    object
token_name         object
token_address      object
token_addresses    object
date               object
proof_link         object
webarchive_link    object
scam_type          object
network            object
scamNetworks       object
auditedBy          object
dtype: object

*    **The 'date' variable is of type object**. This means that **Pandas was not able to recognise it as a datetime**. Therefore, we shall convert it to the datetime datatype. At the same time, we shall cast the integer variables 'funds_lost' and 'funds_returned' (inspected below) to float.

In [323]:
REKT_df['funds_lost'].head(10)

0    40000000000
1     3600000000
2     2900000000
3     2000000000
4     2000000000
5     1000000000
6      625000000
7      602189570
8      534000000
9      473000000
Name: funds_lost, dtype: int64

In [324]:
REKT_df['funds_returned'].head(10)

0            0
1            0
2            0
3            0
4     56000000
5            0
6    155800000
7    602189570
8    534000000
9            0
Name: funds_returned, dtype: int64

In [325]:
# Typecasting: the two funds columns are cast from int64 to float64, and the
# object-typed date column is parsed into datetime64.
REKT_df = REKT_df.astype({'funds_lost': 'float64', 'funds_returned': 'float64'})
REKT_df['date'] = pd.to_datetime(REKT_df['date'])

*    We will further investigate the datetime datatype and extract more information from it.

In [326]:
REKT_df.dtypes

project_name                       object
description                        object
name_categories                    object
token_name                         object
token_address                      object
token_addresses                    object
date                       datetime64[ns]
proof_link                         object
webarchive_link                    object
funds_lost                        float64
funds_returned                    float64
active                           category
is_verified_source_code           float64
is_public_team                    float64
scam_type                          object
network                            object
scamNetworks                       object
auditedBy                          object
dtype: object

### Float Data Type

In [327]:
# Identifying variables with float datatype
REKT_df.dtypes[REKT_df.dtypes == 'float64']

funds_lost                 float64
funds_returned             float64
is_verified_source_code    float64
is_public_team             float64
dtype: object

In [328]:
REKT_df['is_verified_source_code'].head(10)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: is_verified_source_code, dtype: float64

In [329]:
REKT_df['is_public_team'].head(10)

0    1.0
1    1.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: is_public_team, dtype: float64

*    ***The funds variables should be float types. However, the is_verified_source_code and is_public_team variables take on the values of either 0 or 1. Hence, we convert them to category type.***

In [330]:
# Both flags are observed as 0/1 (or missing), so they are stored as categoricals
REKT_df = REKT_df.astype({
    'is_verified_source_code': 'category',
    'is_public_team': 'category',
})

# confirm the new dtypes
REKT_df.dtypes

project_name                       object
description                        object
name_categories                    object
token_name                         object
token_address                      object
token_addresses                    object
date                       datetime64[ns]
proof_link                         object
webarchive_link                    object
funds_lost                        float64
funds_returned                    float64
active                           category
is_verified_source_code          category
is_public_team                   category
scam_type                          object
network                            object
scamNetworks                       object
auditedBy                          object
dtype: object

In [331]:
REKT_df.head()

Unnamed: 0,project_name,description,name_categories,token_name,token_address,token_addresses,date,proof_link,webarchive_link,funds_lost,funds_returned,active,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy
0,Terra Classic,<p><strong>Quick Summary</strong></p><p>A comp...,Stablecoin,"LUNC, USTC","0x7e43d25EaD96B1058f671F6690ea705BA2C7e5B9, 0x...",[],2022-05-08,https://www.nansen.ai/research/on-chain-forens...,,40000000000.0,0.0,1,0.0,1.0,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 1003, 'scam_database_id': 305...",[{'audit_link': 'safe/files/audit/pdf/CertiK_A...
1,Africrypt,<p><strong>Quick Summary</strong></p><p>Ameer ...,CeFi,,,[],2021-06-23,https://finance.yahoo.com/news/africrypt-bitco...,https://web.archive.org/web/20200921145240/htt...,3600000000.0,0.0,1,0.0,1.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]
2,PlusToken,<p><strong>Quick Summary</strong></p><p>The pe...,CeFi,,,[],2019-12-16,https://cointelegraph.com/news/vanuatu-extradi...,https://web.archive.org/web/20220125033211/htt...,2900000000.0,0.0,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]
3,Thodex,<p><strong>Quick Summary</strong></p><p>Thodex...,CeFi,,,[],2021-04-22,https://www.cnbc.com/2021/04/23/bitcoin-btc-ce...,https://web.archive.org/web/20220405133149/htt...,2000000000.0,0.0,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]
4,BitConnect,<p><strong>Quick Summary</strong></p><p>BitCon...,"Borrowing and Lending,CeFi",BCC,,[],2018-01-15,,https://web.archive.org/web/20220426131429/htt...,2000000000.0,56000000.0,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[]


### datetime Data Type

*    ***We shall now extract important time-based features for better EDA experience***

In [332]:
# Derive calendar features from the attack date.
# Rows whose date is missing (NaT) get NaN, which is why these columns end up float64.
REKT_df = REKT_df.assign(
    month_of_attack=REKT_df['date'].dt.month,
    day_of_week_of_attack=REKT_df['date'].dt.dayofweek,  # Monday=0 ... Sunday=6
    day_of_year_of_attack=REKT_df['date'].dt.dayofyear,
)

In [333]:
REKT_df.head()

Unnamed: 0,project_name,description,name_categories,token_name,token_address,token_addresses,date,proof_link,webarchive_link,funds_lost,...,active,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy,month_of_attack,day_of_week_of_attack,day_of_year_of_attack
0,Terra Classic,<p><strong>Quick Summary</strong></p><p>A comp...,Stablecoin,"LUNC, USTC","0x7e43d25EaD96B1058f671F6690ea705BA2C7e5B9, 0x...",[],2022-05-08,https://www.nansen.ai/research/on-chain-forens...,,40000000000.0,...,1,0.0,1.0,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 1003, 'scam_database_id': 305...",[{'audit_link': 'safe/files/audit/pdf/CertiK_A...,5.0,6.0,128.0
1,Africrypt,<p><strong>Quick Summary</strong></p><p>Ameer ...,CeFi,,,[],2021-06-23,https://finance.yahoo.com/news/africrypt-bitco...,https://web.archive.org/web/20200921145240/htt...,3600000000.0,...,1,0.0,1.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],6.0,2.0,174.0
2,PlusToken,<p><strong>Quick Summary</strong></p><p>The pe...,CeFi,,,[],2019-12-16,https://cointelegraph.com/news/vanuatu-extradi...,https://web.archive.org/web/20220125033211/htt...,2900000000.0,...,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],12.0,0.0,350.0
3,Thodex,<p><strong>Quick Summary</strong></p><p>Thodex...,CeFi,,,[],2021-04-22,https://www.cnbc.com/2021/04/23/bitcoin-btc-ce...,https://web.archive.org/web/20220405133149/htt...,2000000000.0,...,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],4.0,3.0,112.0
4,BitConnect,<p><strong>Quick Summary</strong></p><p>BitCon...,"Borrowing and Lending,CeFi",BCC,,[],2018-01-15,,https://web.archive.org/web/20220426131429/htt...,2000000000.0,...,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],1.0,0.0,15.0


In [334]:
REKT_df.tail()

Unnamed: 0,project_name,description,name_categories,token_name,token_address,token_addresses,date,proof_link,webarchive_link,funds_lost,...,active,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy,month_of_attack,day_of_week_of_attack,day_of_year_of_attack
3071,BoobsFinance,,,BOOBS,0xf98f73350d083005079b6c3cda9c99cfdb668be7,[],NaT,,,0.0,...,1,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 78, '...",[],,,
3072,Chad Token,<p>The contract owner could disable the transf...,Token,CHAD,0x2f7383de70c972ef8c18565a9da023b08c110f2c,[],2021-03-08,,,0.0,...,1,,,"{'id': 7, 'type': 'Honeypot'}",{},"[{'networks_id': 56, 'scam_database_id': 77, '...",[],3.0,0.0,67.0
3073,Generate Finance,"<p><span data-sheets-userformat='{""2"":15165,""3...",,GEN,0x04ad13a645748cee762f11e43386fe2a275885b4,[],NaT,https://twitter.com/WARONRUGS/status/136972604...,,0.0,...,1,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 76, '...",[],,,
3074,Pill Finance,"<p><span data-sheets-userformat='{""2"":15165,""3...",,RED-P,0x137faad0d13813ef8d4cbbb336f0e01066b2c9b4,[],NaT,https://twitter.com/RugSteemer/status/13717216...,https://archive.ph/AfyEo,0.0,...,1,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 72, '...",[],,,
3075,SoftdrinkSwap,<p>The team abandoned the project. The team st...,,COLA SODA,0x740426e58a10ac3b6724f753c8468288248137f9,[],2020-11-12,,,0.0,...,1,,,"{'id': 5, 'type': 'Abandoned'}",{},"[{'networks_id': 56, 'scam_database_id': 55, '...",[],11.0,3.0,317.0


In [335]:
REKT_df.dtypes

project_name                       object
description                        object
name_categories                    object
token_name                         object
token_address                      object
token_addresses                    object
date                       datetime64[ns]
proof_link                         object
webarchive_link                    object
funds_lost                        float64
funds_returned                    float64
active                           category
is_verified_source_code          category
is_public_team                   category
scam_type                          object
network                            object
scamNetworks                       object
auditedBy                          object
month_of_attack                   float64
day_of_week_of_attack             float64
day_of_year_of_attack             float64
dtype: object

In [336]:
REKT_df.describe()

Unnamed: 0,funds_lost,funds_returned,month_of_attack,day_of_week_of_attack,day_of_year_of_attack
count,3076.0,3076.0,1200.0,1200.0,1200.0
mean,19831010.0,804770.9,7.344167,2.5775,208.424167
std,728355900.0,16974350.0,3.092791,1.952051,94.929916
min,0.0,0.0,1.0,0.0,1.0
25%,0.0,0.0,5.0,1.0,147.75
50%,0.0,0.0,8.0,3.0,241.0
75%,1324.75,0.0,9.0,4.0,264.0
max,40000000000.0,602189600.0,12.0,6.0,366.0


### Removing HTML Tags from Description variable

In [337]:
REKT_df.loc[REKT_df['description'].isnull(), 'funds_lost']

89      20000000.0
799            0.0
800            0.0
804            0.0
805            0.0
           ...    
3066           0.0
3067           0.0
3069           0.0
3070           0.0
3071           0.0
Name: funds_lost, Length: 285, dtype: float64

In [338]:
REKT_df.loc[REKT_df['description'].isnull()]

Unnamed: 0,project_name,description,name_categories,token_name,token_address,token_addresses,date,proof_link,webarchive_link,funds_lost,...,active,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy,month_of_attack,day_of_week_of_attack,day_of_year_of_attack
89,Yfdex.Finance,,CeFi,YFDEX,0x38f13DAF7d2c7D1a077B064C501f9575BBde7Fa7,[],2020-09-09,https://twitter.com/CryptoIndexFund/status/132...,https://web.archive.org/web/20200907105040/htt...,20000000.0,...,1,,,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1, 'scam_database_id': 42, 'n...",[],9.0,2.0,253.0
799,Mystery,,,MST,0x3308cecce12518ed1a075bbd4eca20273b5b8ff1,[],2021-10-18,https://ftmscan.com/token/0x3308cecce12518ed1a...,,0.0,...,1,,,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 250, 'scam_database_id': 2678...",[],10.0,0.0,291.0
800,FlokiFarmToken,,,FLK,0x710122026bf280f3c99a829f33e9fa4d8df73dc0,[],2021-10-18,https://ftmscan.com/token/0x710122026bf280f3c9...,,0.0,...,1,,,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 250, 'scam_database_id': 2677...",[],10.0,0.0,291.0
804,AwakenedToken,,,AWA,0x98cb26fc9bc01fabeae0a6acd661d8bbbbb863be,[],2021-10-11,https://ftmscan.com/token/0x98cb26fc9bc01fabea...,,0.0,...,1,,,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 250, 'scam_database_id': 2673...",[],10.0,0.0,284.0
805,LionKingToken,,,LKS,0xbdcfc20eb75d52e6a408aba5e16e8999b18a2653,[],2021-10-11,https://ftmscan.com/token/0xbdcfc20eb75d52e6a4...,,0.0,...,1,,,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 250, 'scam_database_id': 2672...",[],10.0,0.0,284.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066,VikingSwap,,,VIKING,0x896eDE222D3f7f3414e136a2791BDB08AAa25Ce0,[],2021-03-07,https://viking-swap.medium.com/an-ode-to-the-w...,,0.0,...,1,,,"{'id': 5, 'type': 'Abandoned'}",{},"[{'networks_id': 56, 'scam_database_id': 97, '...",[],3.0,6.0,66.0
3067,PabloEscobarFi,,,ESCO,0x8671459f9617c818724899cd2c9a76d8018ea90f,[],2021-03-10,https://twitter.com/RugSteemer/status/13695671...,https://archive.ph/sGaO1,0.0,...,1,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 93, '...",[],3.0,2.0,69.0
3069,Chef Swap,,,CHEF,0xf043e56c6cbc666ab285dbfc0db342dd205b468d,[],2021-03-12,,https://archive.ph/Oag6Q,0.0,...,1,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 80, '...",[],3.0,4.0,71.0
3070,OgreSwap,,,OGRE,0x6862b03772640544b4850520f6d5d7ffc0d13194,[],2021-03-12,,https://web.archive.org/web/20210312153650/htt...,0.0,...,1,,,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 56, 'scam_database_id': 79, '...",[],3.0,4.0,71.0


In [339]:
REKT_df.loc[REKT_df['funds_lost']==20000000.0]

Unnamed: 0,project_name,description,name_categories,token_name,token_address,token_addresses,date,proof_link,webarchive_link,funds_lost,...,active,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy,month_of_attack,day_of_week_of_attack,day_of_year_of_attack
89,Yfdex.Finance,,CeFi,YFDEX,0x38f13DAF7d2c7D1a077B064C501f9575BBde7Fa7,[],2020-09-09,https://twitter.com/CryptoIndexFund/status/132...,https://web.archive.org/web/20200907105040/htt...,20000000.0,...,1,,,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1, 'scam_database_id': 42, 'n...",[],9.0,2.0,253.0


In [340]:
from bs4 import BeautifulSoup

# Testing HTML stripping on the description of the first row.
# The column is selected by name rather than by position so the check keeps working if
# the column order changes, and the parser is named explicitly so the result does not
# depend on which optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(REKT_df['description'].iloc[0], "html.parser")

print(soup.get_text())

Quick SummaryA complex mixture of events and market dynamics cost the implosion of the $40b Terra (Classic) Network. Details of the ExploitThe Terra Luna Network was focused on its two native coins $LUNC and $USTC. $USTC was the algorithmic Stablecoin that was supposed to hold the peg to 1$ and $LUNC functioned as the satellite asset that absorbed $USTC’s volatility. This was achieved through a mint and burn functionality as well as by arbitrage which the former function inherently enabled.$USTC rose to prominence in rapid fashion even surpassing $DAI in market cap. The Luna Foundation Group decided to start a new Curve4pool with $FRAX, $USTC, $USDC and $USDT as assets, excluding $DAI in an attempt to starve the most used StableCoin pool used by institutions the Curve3pool on the Ethereum network.The migration of $USTC from the Curve3pool is the event that started the bankrun. In early May, the Luna Foundation Guard withdrew 250 million $USTC from the Curve3pool in preparation for the 

In [341]:
# Imputing empty strings in all missing values of the description variable instead of dropping those rows.
# An observation that has a high funds lost value but no description must be preserved,
# so every missing description becomes "" to avoid bs4 errors.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection is not guaranteed to update the DataFrame itself (pandas copy-on-write).
REKT_df['description'] = REKT_df['description'].fillna("")

# Running bs4 on the description variable to strip the HTML markup.
# The parser is named explicitly so the output does not depend on which optional
# parsers are installed; the column is addressed by name instead of position.
REKT_df['description'] = REKT_df['description'].apply(
    lambda raw_html: BeautifulSoup(raw_html, "html.parser").get_text()
)

In [342]:
REKT_df.head()

Unnamed: 0,project_name,description,name_categories,token_name,token_address,token_addresses,date,proof_link,webarchive_link,funds_lost,...,active,is_verified_source_code,is_public_team,scam_type,network,scamNetworks,auditedBy,month_of_attack,day_of_week_of_attack,day_of_year_of_attack
0,Terra Classic,Quick SummaryA complex mixture of events and m...,Stablecoin,"LUNC, USTC","0x7e43d25EaD96B1058f671F6690ea705BA2C7e5B9, 0x...",[],2022-05-08,https://www.nansen.ai/research/on-chain-forens...,,40000000000.0,...,1,0.0,1.0,"{'id': 19, 'type': 'Other'}",{},"[{'networks_id': 1003, 'scam_database_id': 305...",[{'audit_link': 'safe/files/audit/pdf/CertiK_A...,5.0,6.0,128.0
1,Africrypt,"Quick SummaryAmeer and Raees Cajee, the exchan...",CeFi,,,[],2021-06-23,https://finance.yahoo.com/news/africrypt-bitco...,https://web.archive.org/web/20200921145240/htt...,3600000000.0,...,1,0.0,1.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],6.0,2.0,174.0
2,PlusToken,Quick SummaryThe perpetrators of one of the la...,CeFi,,,[],2019-12-16,https://cointelegraph.com/news/vanuatu-extradi...,https://web.archive.org/web/20220125033211/htt...,2900000000.0,...,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],12.0,0.0,350.0
3,Thodex,Quick SummaryThodex a turkish crypto exchange ...,CeFi,,,[],2021-04-22,https://www.cnbc.com/2021/04/23/bitcoin-btc-ce...,https://web.archive.org/web/20220405133149/htt...,2000000000.0,...,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],4.0,3.0,112.0
4,BitConnect,Quick SummaryBitConnect was a Ponzi Scheme tha...,"Borrowing and Lending,CeFi",BCC,,[],2018-01-15,,https://web.archive.org/web/20220426131429/htt...,2000000000.0,...,1,0.0,0.0,"{'id': 10, 'type': 'Rugpull'}",{},"[{'networks_id': 1666600003, 'scam_database_id...",[],1.0,0.0,15.0


In [343]:
# Removing the "Quick Summary" heading text that the HTML stripping leaves at the
# beginning of valid description values.
# Note: this is a literal (non-regex) replacement of EVERY occurrence of the phrase,
# not only a leading one.

REKT_df['description'] = REKT_df['description'].str.replace("Quick Summary", "", regex=False)

In [344]:
REKT_df.iloc[0:10,1]

0    A complex mixture of events and market dynamic...
1    Ameer and Raees Cajee, the exchange's founders...
2    The perpetrators of one of the largest digital...
3    Thodex a turkish crypto exchange went down wit...
4    BitConnect was a Ponzi Scheme that managed to ...
5    WoToken scam took in roughly $1 billion worth ...
6    The Ronin bridge has been exploited for 173,60...
7    Polynetwork was hacked for $602M. The attacker...
8    On Jan. 26, about 523 million NEM (XEM) tokens...
9    One of the world's largest exchanges, Mt. Gox,...
Name: description, dtype: object

In [345]:
REKT_df.shape

(3076, 21)

### Extracting Network Name (eg Ethereum, Binance etc) from scamNetworks variable

In [346]:
import jmespath # testing performance of jmespath on scamNetworks column, first row
# '[].networks.name' projects over the list and, for each element, takes the
# `name` field nested under its `networks` object.
tmp = jmespath.search('[].networks.name', REKT_df.scamNetworks[0])
# (alternative manual lookup, kept for reference)
#REKT_df.scamNetworks[1]['networks']
print(tmp)

['Terra Classic']


In [347]:
# Using jmespath to extract the network name(s) of every row.
# The expression is compiled once and applied to the column's values directly, so the
# extracted list lines up with the rows positionally whatever the DataFrame's index
# labels are (a label lookup via range(len(...)) only works on a default 0..n-1 index).
network_name_expr = jmespath.compile('[].networks.name')
clean_network = [
    network_name_expr.search(row_networks) for row_networks in REKT_df['scamNetworks']
]

# Overwrite the raw nested structure with the list of network names.
REKT_df['scamNetworks']=clean_network
REKT_df['scamNetworks'].head()

0    [Terra Classic]
1              [CEX]
2              [CEX]
3              [CEX]
4              [CEX]
Name: scamNetworks, dtype: object

In [348]:
# Exporting full DF as CSV (missing vals present in other columns except Description)
# NOTE: to_csv is called with its default index=True, so the row index is written
# as the first, unnamed column of the CSV.

REKT_df.to_csv("../../data/Clean Data/REKT_Database_Clean_Python.csv")


In [349]:
# Exporting first 5 rows of Cleaned DF to HTML for visualization in portfolio

from IPython.display import HTML

html = REKT_df.head(5).to_html()
# `with` closes the file even if the write raises; UTF-8 is stated explicitly because
# the descriptions contain non-ASCII characters and the platform default encoding varies.
with open("Clean_Rekt_DF.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

*   Now we are done breaking down the date of attack datetime variable into granular forms, as seen above, which will **help us better analyze crypto crimes.** We are also done with the Variable identification and typecasting process and will now start the Univariate Analysis portion of the EDA, followed by Bivariate Analysis, and, lastly, Multivariate Analysis.