In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import plotly

In [3]:
df_bitcoin = pd.read_csv('../Data/BitcoinHeistDataWithMonthAndDate.csv')

In [4]:
df_bitcoin

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label,month,date
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber,1,11
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky,5,11
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber,9,2
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber,11,17
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky,8,25
...,...,...,...,...,...,...,...,...,...,...,...,...
2916692,12D3trgho1vJ4mGtWBRPyHdMJK96TRYSry,2018,330,0,0.111111,1,0,1,1.255809e+09,white,11,26
2916693,1P7PputTcVkhXBmXBvSD9MJ3UYPsiou1u2,2018,330,0,1.000000,1,0,1,4.409699e+07,white,11,26
2916694,1KYiKJEfdJtap9QX2v9BXJMpz2SfU4pgZw,2018,330,2,12.000000,6,6,35,2.398267e+09,white,11,26
2916695,15iPUJsRNZQZHmZZVwmQ63srsmughCXV4a,2018,330,0,0.500000,1,0,1,1.780427e+08,white,11,26


In [5]:
df_bitcoin[df_bitcoin['year'] == 2018]['month'].value_counts()

month
5     33002
9     31001
3     31000
7     31000
8     31000
10    31000
4     30000
6     30000
1     28000
2     28000
11    26000
Name: count, dtype: int64

In [6]:
# sort by year and then by month and then by date
df_bitcoin = df_bitcoin.sort_values(by=['year', 'month', 'date'])

In [7]:
ls_year = df_bitcoin['year'].to_list()
ls_month = df_bitcoin['month'].to_list()
ls_year_month = []
for i in range(len(ls_year)):
    ym = str(ls_year[i]) + '-'
    if ls_month[i] < 10:
        ym += '0'
    ym += str(ls_month[i])
    ls_year_month.append(ym)
df_bitcoin['year_month'] = ls_year_month

In [8]:
min_year_month = df_bitcoin['year_month'].min()
max_year_month = df_bitcoin['year_month'].max()

In [9]:
min_year_month, max_year_month

('2011-01', '2018-11')

In [10]:
# create new dataframe with year_month and label
df_bitcoin_year_month = df_bitcoin[['year_month', 'label']]
df_bitcoin_year_month

# for each year_month, count the number of white and non-white transactions and store in a new dataframe
df_transaction_counts = pd.DataFrame(columns=['year_month', 'white_count', 'non_white_count'])
for year_month in df_bitcoin_year_month['year_month'].unique():
    df_year_month = df_bitcoin_year_month[df_bitcoin_year_month['year_month'] == year_month]
    white_count = df_year_month[df_year_month['label'] == 'white']['label'].count()
    non_white_count = df_year_month[df_year_month['label'] != 'white']['label'].count()
    df_transaction_counts = pd.concat([df_transaction_counts, pd.DataFrame([[year_month, white_count, non_white_count]], columns=['year_month', 'white_count', 'non_white_count'])])
df_transaction_counts['proportion_ransomware'] = df_transaction_counts['non_white_count'] / (df_transaction_counts['white_count'] + df_transaction_counts['non_white_count'])

In [11]:
# print entire df
# pd.set_option('display.max_rows', None)
df_transaction_counts

Unnamed: 0,year_month,white_count,non_white_count,proportion_ransomware
0,2011-01,3147,0,0.0
0,2011-02,26339,0,0.0
0,2011-03,31000,0,0.0
0,2011-04,30000,0,0.0
0,2011-05,31000,0,0.0
...,...,...,...,...
0,2018-07,31000,0,0.0
0,2018-08,31000,0,0.0
0,2018-09,31000,1,0.000032
0,2018-10,31000,0,0.0


In [12]:
ls_proportion_ransomware = df_transaction_counts['proportion_ransomware'].tolist()
mean_proportion_ransomware = np.mean(ls_proportion_ransomware)
std_dev_proportion_ransomware = np.std(ls_proportion_ransomware)

In [13]:
mean_proportion_ransomware, std_dev_proportion_ransomware

(0.034456267045332986, 0.1458550560780503)

In [17]:
# is any month's proportion of ransomware significantly different from the mean?
# null hypothesis: the ransomware proportion for each month is equal to the mean ransomware proportion
# alternative hypothesis: the ransomware proportion for each month is not equal to the mean ransomware proportion

alpha = 0.05
for i in range(len(ls_proportion_ransomware)):
    if ls_proportion_ransomware[i] == 0:
        continue
    z_score = (ls_proportion_ransomware[i] - mean_proportion_ransomware) / (std_dev_proportion_ransomware / 1)
    z_critical = stats.norm.ppf(1 - alpha / 2)
    if abs(z_score) > z_critical:
        print('Month: ' + df_transaction_counts['year_month'].iloc[i] + ' has a significantly different proportion of ransomware transactions than the mean proportion of ransomware transactions')
        print('z-score: ' + str(z_score))

Month: 2014-01 has a significantly different proportion of ransomware transactions than the mean proportion of ransomware transactions
z-score: 6.619885240303106
Month: 2017-01 has a significantly different proportion of ransomware transactions than the mean proportion of ransomware transactions
z-score: 6.619885240303106


In [19]:
# is any year's proportion of ransomware significantly different from the mean?
# null hypothesis: the ransomware proportion for each year is equal to the mean ransomware proportion
# alternative hypothesis: the ransomware proportion for each year is not equal to the mean ransomware proportion

df_transaction_counts['year'] = df_transaction_counts['year_month'].str.split('-').str[0]

ls_proportion_ransomware = []
for year in df_transaction_counts['year'].unique():
    df_year = df_transaction_counts[df_transaction_counts['year'] == year]
    ls_proportion_ransomware.append(df_year['proportion_ransomware'].mean())

mean_proportion_ransomware = np.mean(ls_proportion_ransomware)
std_dev_proportion_ransomware = np.std(ls_proportion_ransomware)

alpha = 0.05
for i in range(len(ls_proportion_ransomware)):
    if ls_proportion_ransomware[i] == 0:
        continue
    z_score = (ls_proportion_ransomware[i] - mean_proportion_ransomware) / (std_dev_proportion_ransomware / 1)
    z_critical = stats.norm.ppf(1 - alpha / 2)
    if abs(z_score) > z_critical:
        print('Year: ' + df_transaction_counts['year'].unique()[i] + ' has a significantly different proportion of ransomware transactions than the mean proportion of ransomware transactions')
        print('z-score: ' + str(z_score))


In [22]:
# is any month's average proportion of ransomware across the years significantly different from the mean?
# null hypothesis: the ransomware proportion for each month is equal to the mean ransomware proportion
# alternative hypothesis: the ransomware proportion for each month is not equal to the mean ransomware proportion

df_transaction_counts['month'] = df_transaction_counts['year_month'].str.split('-').str[1]

ls_proportion_ransomware = []
for month in df_transaction_counts['month'].unique():
    df_month = df_transaction_counts[df_transaction_counts['month'] == month]
    ls_proportion_ransomware.append(df_month['proportion_ransomware'].mean())

mean_proportion_ransomware = np.mean(ls_proportion_ransomware)
std_dev_proportion_ransomware = np.std(ls_proportion_ransomware)

alpha = 0.05

for i in range(len(ls_proportion_ransomware)):
    if ls_proportion_ransomware[i] == 0:
        continue
    z_score = (ls_proportion_ransomware[i] - mean_proportion_ransomware) / (std_dev_proportion_ransomware / 1)
    z_critical = stats.norm.ppf(1 - alpha / 2)
    if abs(z_score) > z_critical:
        print('Month: ' + df_transaction_counts['month'].unique()[i] + ' has a significantly different proportion of ransomware transactions than the mean proportion of ransomware transactions')
        print('z-score: ' + str(z_score))


Month: 01 has a significantly different proportion of ransomware transactions than the mean proportion of ransomware transactions
z-score: 3.3151741240366186
