In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import plotly

In [2]:
def is_leap_year(year):
    # Function to check if a year is a leap year
    if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):
        return True
    return False

cache = {}

def day_number_to_date(day_number, year):
    if (day_number, year) in cache:
        return cache[(day_number, year)]
    
    days_in_month = [31, 28 if not is_leap_year(year) else 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    
    if day_number < 1 or day_number > sum(days_in_month):
        return None  # Invalid day number
    
    month = 1
    while day_number > days_in_month[month - 1]:
        day_number -= days_in_month[month - 1]
        month += 1
    cache[(day_number, year)] = (month, day_number)
    return month, day_number

# Example usage:
day = 200
year = 2023  # Replace with your desired year
result = day_number_to_date(day, year)

if result:
    month, date = result
    print(f"Day {day} in {year} is in month {month} and the date is {date}.")
else:
    print("Invalid day number.")

Day 200 in 2023 is in month 7 and the date is 19.


In [3]:
df_bitcoin = pd.read_csv('../Data/Prediction_Metrics_Random_Split/SMOTE_FEATURETrans_Predict.csv')

In [4]:
df_bitcoin

Unnamed: 0.1,Unnamed: 0,year,day,length,weight,count,looped,neighbors,income,labelnew,KNeighborsClassifier_pred,DecisionTreeClassifier_pred,RandomForestClassifier_pred,MLPClassifier_pred,GaussianNB_pred,QuadraticDiscriminantAnalysis_pred
0,0,2017.0,11.0,2.441734,0.008264,0.493543,0.0,2.0,4,1,0,1,0,0,0,0
1,1,2016.0,132.0,2.995381,0.000244,0.493543,0.0,1.0,4,1,1,1,1,1,1,0
2,2,2016.0,246.0,0.000000,0.500000,0.493543,0.0,2.0,4,1,1,1,0,0,0,0
3,3,2016.0,322.0,3.279639,0.003891,0.493543,0.0,2.0,4,1,1,1,0,0,0,0
4,4,2016.0,238.0,3.653094,0.067902,0.957636,0.0,1.0,4,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808025,1808025,2014.0,194.0,1.596469,0.232274,0.733154,0.0,3.0,4,1,1,1,1,0,1,0
1808026,1808026,2014.0,96.0,3.309891,0.209102,0.958193,0.0,1.0,4,1,1,1,1,1,1,0
1808027,1808027,2014.0,199.0,1.450653,0.308806,0.654081,0.0,2.0,4,1,1,1,1,1,1,0
1808028,1808028,2014.0,14.0,0.000000,0.461452,0.493543,0.0,1.0,4,1,1,1,1,0,0,0


In [5]:
df_bitcoin['KNeighborsClassifier_pred'].value_counts()

KNeighborsClassifier_pred
1    1007133
0     800897
Name: count, dtype: int64

In [6]:
# convert day and year to int
df_bitcoin['day'] = df_bitcoin['day'].astype(int)
df_bitcoin['year'] = df_bitcoin['year'].astype(int)

In [7]:
df_bitcoin['month'] = df_bitcoin.apply(lambda row: day_number_to_date(row['day'], row['year'])[0], axis=1)
df_bitcoin['date'] = df_bitcoin.apply(lambda row: day_number_to_date(row['day'], row['year'])[1], axis=1)

In [8]:
df_bitcoin[df_bitcoin['year'] == 2018]['month'].value_counts()

month
10    10781
8     10748
9     10534
7     10398
3     10160
5     10102
6     10074
4      9863
2      9537
11     9438
1         3
Name: count, dtype: int64

In [9]:
# sort by year and then by month and then by date
df_bitcoin = df_bitcoin.sort_values(by=['year', 'month', 'date'])

In [10]:
ls_year = df_bitcoin['year'].to_list()
ls_month = df_bitcoin['month'].to_list()
ls_year_month = []
for i in range(len(ls_year)):
    ym = str(ls_year[i]) + '-'
    if ls_month[i] < 10:
        ym += '0'
    ym += str(ls_month[i])
    ls_year_month.append(ym)
df_bitcoin['year_month'] = ls_year_month

In [11]:
# randomly sample 10000 rows
df_bitcoin_sample = df_bitcoin.sample(n=10000, random_state=1)

In [12]:
min_year_month = df_bitcoin['year_month'].min()
max_year_month = df_bitcoin['year_month'].max()

min_year_month_sample = df_bitcoin_sample['year_month'].min()
max_year_month_sample = df_bitcoin_sample['year_month'].max()

In [13]:
min_year_month, max_year_month, min_year_month_sample, max_year_month_sample

('2011-02', '2018-11', '2011-02', '2018-11')

In [14]:
# create new dataframe with year_month and label
df_bitcoin_year_month = df_bitcoin[['year_month', 'KNeighborsClassifier_pred']]
df_bitcoin_year_month

# for each year_month, count the number of white and non-white transactions and store in a new dataframe
df_transaction_counts = pd.DataFrame(columns=['year_month', 'white_count', 'non_white_count'])
for year_month in df_bitcoin_year_month['year_month'].unique():
    df_year_month = df_bitcoin_year_month[df_bitcoin_year_month['year_month'] == year_month]
    white_count = df_year_month[df_year_month['KNeighborsClassifier_pred'] == 0]['KNeighborsClassifier_pred'].count()
    non_white_count = df_year_month[df_year_month['KNeighborsClassifier_pred'] != 0]['KNeighborsClassifier_pred'].count()
    df_transaction_counts = pd.concat([df_transaction_counts, pd.DataFrame([[year_month, white_count, non_white_count]], columns=['year_month', 'white_count', 'non_white_count'])])
df_transaction_counts['proportion_ransomware'] = df_transaction_counts['non_white_count'] / (df_transaction_counts['white_count'] + df_transaction_counts['non_white_count'])

In [15]:
# create new dataframe with year_month and label
df_bitcoin_sample_year_month = df_bitcoin_sample[['year_month', 'KNeighborsClassifier_pred']]
df_bitcoin_sample_year_month

# for each year_month, count the number of white and non-white transactions and store in a new dataframe
df_transaction_counts_sample = pd.DataFrame(columns=['year_month', 'white_count', 'non_white_count'])
for year_month in df_bitcoin_sample_year_month['year_month'].unique():
    df_year_month = df_bitcoin_sample_year_month[df_bitcoin_sample_year_month['year_month'] == year_month]
    white_count = df_year_month[df_year_month['KNeighborsClassifier_pred'] == 0]['KNeighborsClassifier_pred'].count()
    non_white_count = df_year_month[df_year_month['KNeighborsClassifier_pred'] != 0]['KNeighborsClassifier_pred'].count()
    df_transaction_counts_sample = pd.concat([df_transaction_counts_sample, pd.DataFrame([[year_month, white_count, non_white_count]], columns=['year_month', 'white_count', 'non_white_count'])])
df_transaction_counts_sample['proportion_ransomware'] = df_transaction_counts_sample['non_white_count'] / (df_transaction_counts_sample['white_count'] + df_transaction_counts_sample['non_white_count'])

In [16]:
ls_proportion_ransomware = df_transaction_counts['proportion_ransomware'].tolist()
mean_proportion_ransomware = np.mean(ls_proportion_ransomware)
std_dev_proportion_ransomware = np.std(ls_proportion_ransomware)

ls_proportion_ransomware_sample = df_transaction_counts_sample['proportion_ransomware'].tolist()
mean_proportion_ransomware_sample = np.mean(ls_proportion_ransomware_sample)
std_dev_proportion_ransomware_sample = np.std(ls_proportion_ransomware_sample)

In [17]:
mean_proportion_ransomware, std_dev_proportion_ransomware, mean_proportion_ransomware_sample, std_dev_proportion_ransomware_sample

(0.34460297212722185,
 0.320312404520643,
 0.351863662625261,
 0.32096869920174964)

In [18]:
df_gtrend_bitcoin = pd.read_csv('../Data/GoogleTrends/Bitcoin.csv')
df_gtrend_crypto = pd.read_csv('../Data/GoogleTrends/Cryptocurrency.csv')
df_gtrend_ransomware = pd.read_csv('../Data/GoogleTrends/Ransomware.csv')

In [19]:
df_gtrend_bitcoin['Category: All categories']['Month']

'bitcoin: (Worldwide)'

In [20]:
gtrend_bitcoin_dict = df_gtrend_bitcoin['Category: All categories'].to_dict()
gtrend_crypto_dict = df_gtrend_crypto['Category: All categories'].to_dict()
gtrend_ransomware_dict = df_gtrend_ransomware['Category: All categories'].to_dict()

# remove first entry
gtrend_bitcoin_dict.pop('Month')
gtrend_crypto_dict.pop('Month')
gtrend_ransomware_dict.pop('Month')


'ransomware: (Worldwide)'

In [21]:
# change all values which say '<1' to 0
for key, value in gtrend_bitcoin_dict.items():
    if value == '<1':
        gtrend_bitcoin_dict[key] = 0

for key, value in gtrend_crypto_dict.items():
    if value == '<1':
        gtrend_crypto_dict[key] = 0

for key, value in gtrend_ransomware_dict.items():
    if value == '<1':
        gtrend_ransomware_dict[key] = 0

In [22]:
# change all values to int
for key, value in gtrend_bitcoin_dict.items():
    gtrend_bitcoin_dict[key] = int(value)

for key, value in gtrend_crypto_dict.items():
    gtrend_crypto_dict[key] = int(value)

for key, value in gtrend_ransomware_dict.items():
    gtrend_ransomware_dict[key] = int(value)

In [23]:
# remove keys which are not in the range of the bitcoin dataset
for key in list(gtrend_bitcoin_dict.keys()):
    min_year = int(min_year_month.split('-')[0])
    max_year = int(max_year_month.split('-')[0])
    min_month = int(min_year_month.split('-')[1])
    max_month = int(max_year_month.split('-')[1])
    year = int(key.split('-')[0])
    month = int(key.split('-')[1])
    if year < min_year or year > max_year:
        gtrend_bitcoin_dict.pop(key)
    elif year == min_year and month < min_month:
        gtrend_bitcoin_dict.pop(key)
    elif year == max_year and month > max_month:
        gtrend_bitcoin_dict.pop(key)

for key in list(gtrend_crypto_dict.keys()):
    min_year = int(min_year_month.split('-')[0])
    max_year = int(max_year_month.split('-')[0])
    min_month = int(min_year_month.split('-')[1])
    max_month = int(max_year_month.split('-')[1])
    year = int(key.split('-')[0])
    month = int(key.split('-')[1])
    if year < min_year or year > max_year:
        gtrend_crypto_dict.pop(key)
    elif year == min_year and month < min_month:
        gtrend_crypto_dict.pop(key)
    elif year == max_year and month > max_month:
        gtrend_crypto_dict.pop(key)

for key in list(gtrend_ransomware_dict.keys()):
    min_year = int(min_year_month.split('-')[0])
    max_year = int(max_year_month.split('-')[0])
    min_month = int(min_year_month.split('-')[1])
    max_month = int(max_year_month.split('-')[1])
    year = int(key.split('-')[0])
    month = int(key.split('-')[1])
    if year < min_year or year > max_year:
        gtrend_ransomware_dict.pop(key)
    elif year == min_year and month < min_month:
        gtrend_ransomware_dict.pop(key)
    elif year == max_year and month > max_month:
        gtrend_ransomware_dict.pop(key)

In [24]:
# find top 5 keys with highest values
top_10_bitcoin_gtrend = sorted(gtrend_bitcoin_dict, key=gtrend_bitcoin_dict.get, reverse=True)[:10]
top_10_crypto_gtrend = sorted(gtrend_crypto_dict, key=gtrend_crypto_dict.get, reverse=True)[:10]
top_10_ransomware_gtrend = sorted(gtrend_ransomware_dict, key=gtrend_ransomware_dict.get, reverse=True)[:10]

# find top 5 values
top_10_bitcoin_gtrend_values = sorted(gtrend_bitcoin_dict.values(), reverse=True)[:10]
top_10_crypto_gtrend_values = sorted(gtrend_crypto_dict.values(), reverse=True)[:10]
top_10_ransomware_gtrend_values = sorted(gtrend_ransomware_dict.values(), reverse=True)[:10]

## Bitcoin

In [25]:
# find entries in df_transaction_counts which correspond to the top 5 bitcoin gtrend values
df_transaction_counts_sample_top_10_bitcoin = df_transaction_counts_sample[df_transaction_counts_sample['year_month'].isin(top_10_bitcoin_gtrend)]

# perform Z-test on the top 5 bitcoin gtrend values
ls_proportion_ransomware_sample_top_5_bitcoin = df_transaction_counts_sample_top_10_bitcoin['proportion_ransomware'].tolist()
mean_proportion_ransomware_sample_top_5_bitcoin = np.mean(ls_proportion_ransomware_sample_top_5_bitcoin)
std_dev_proportion_ransomware_sample_top_5_bitcoin = np.std(ls_proportion_ransomware_sample_top_5_bitcoin)

sample_size = len(ls_proportion_ransomware_sample_top_5_bitcoin)
sample_mean = mean_proportion_ransomware_sample_top_5_bitcoin
population_mean = mean_proportion_ransomware_sample
population_std_dev = std_dev_proportion_ransomware_sample
alpha = 0.05

z_score = (sample_mean - population_mean) / (population_std_dev / np.sqrt(sample_size))
z_critical = stats.norm.ppf(1 - alpha)

print('z_score:', z_score)
print('z_critical:', z_critical)

if z_score > z_critical:
    print('Reject null hypothesis')
else:
    print('Fail to reject null hypothesis')

z_score: -3.0182291268268013
z_critical: 1.6448536269514722
Fail to reject null hypothesis


In [26]:
# find entries in df_transaction_counts which correspond to the top 5 bitcoin gtrend values
df_transaction_counts_top_10_bitcoin = df_transaction_counts[df_transaction_counts['year_month'].isin(top_10_bitcoin_gtrend)]

# perform Z-test on the top 5 bitcoin gtrend values
ls_proportion_ransomware_top_5_bitcoin = df_transaction_counts_top_10_bitcoin['proportion_ransomware'].tolist()
mean_proportion_ransomware_top_5_bitcoin = np.mean(ls_proportion_ransomware_top_5_bitcoin)
std_dev_proportion_ransomware_top_5_bitcoin = np.std(ls_proportion_ransomware_top_5_bitcoin)

sample_size = len(ls_proportion_ransomware_top_5_bitcoin)
sample_mean = mean_proportion_ransomware_top_5_bitcoin
population_mean = mean_proportion_ransomware
population_std_dev = std_dev_proportion_ransomware
alpha = 0.05

z_score = (sample_mean - population_mean) / (population_std_dev / np.sqrt(sample_size))
z_critical = stats.norm.ppf(1 - alpha)

print('z_score:', z_score)
print('z_critical:', z_critical)

if z_score > z_critical:
    print('Reject null hypothesis')
else:
    print('Fail to reject null hypothesis')

z_score: -3.1848020711534613
z_critical: 1.6448536269514722
Fail to reject null hypothesis


## Crypto

In [27]:
# find entries in df_transaction_counts which correspond to the top 5 crypto gtrend values
df_transaction_counts_sample_top_10_crypto = df_transaction_counts_sample[df_transaction_counts_sample['year_month'].isin(top_10_crypto_gtrend)]

# perform Z-test on the top 5 crypto gtrend values
ls_proportion_ransomware_sample_top_10_crypto = df_transaction_counts_sample_top_10_crypto['proportion_ransomware'].tolist()
mean_proportion_ransomware_sample_top_10_crypto = np.mean(ls_proportion_ransomware_sample_top_10_crypto)
std_dev_proportion_ransomware_sample_top_10_crypto = np.std(ls_proportion_ransomware_sample_top_10_crypto)

sample_size = len(ls_proportion_ransomware_sample_top_10_crypto)
sample_mean = mean_proportion_ransomware_sample_top_10_crypto
population_mean = mean_proportion_ransomware
population_std_dev = std_dev_proportion_ransomware
alpha = 0.05

z_score = (sample_mean - population_mean) / (population_std_dev / np.sqrt(sample_size))
z_critical = stats.norm.ppf(1 - alpha)

print('z_score:', z_score)
print('z_critical:', z_critical)

if z_score > z_critical:
    print('Reject null hypothesis')
else:
    print('Fail to reject null hypothesis')

z_score: -3.1027521607339548
z_critical: 1.6448536269514722
Fail to reject null hypothesis


In [28]:
# find entries in df_transaction_counts which correspond to the top 5 crypto gtrend values
df_transaction_counts_top_10_crypto = df_transaction_counts[df_transaction_counts['year_month'].isin(top_10_crypto_gtrend)]

# perform Z-test on the top 5 crypto gtrend values
ls_proportion_ransomware_top_10_crypto = df_transaction_counts_top_10_crypto['proportion_ransomware'].tolist()
mean_proportion_ransomware_top_10_crypto = np.mean(ls_proportion_ransomware_top_10_crypto)
std_dev_proportion_ransomware_top_10_crypto = np.std(ls_proportion_ransomware_top_10_crypto)

sample_size = len(ls_proportion_ransomware_top_10_crypto)
sample_mean = mean_proportion_ransomware_top_10_crypto
population_mean = mean_proportion_ransomware
population_std_dev = std_dev_proportion_ransomware
alpha = 0.05

z_score = (sample_mean - population_mean) / (population_std_dev / np.sqrt(sample_size))
z_critical = stats.norm.ppf(1 - alpha)

print('z_score:', z_score)
print('z_critical:', z_critical)

if z_score > z_critical:
    print('Reject null hypothesis')
else:
    print('Fail to reject null hypothesis')

z_score: -3.2857370636258922
z_critical: 1.6448536269514722
Fail to reject null hypothesis


## Ransomware

In [29]:
# find entries in df_transaction_counts which correspond to the top 5 ransomware gtrend values
df_transaction_counts_top_10_ransomware_sample = df_transaction_counts_sample[df_transaction_counts_sample['year_month'].isin(top_10_ransomware_gtrend)]

# perform Z-test on the top 10 ransomware gtrend values
ls_proportion_ransomware_sample_top_10_ransomware = df_transaction_counts_top_10_ransomware_sample['proportion_ransomware'].tolist()
mean_proportion_ransomware_sample_top_10_ransomware = np.mean(ls_proportion_ransomware_sample_top_10_ransomware)
std_dev_proportion_ransomware_sample_top_10_ransomware = np.std(ls_proportion_ransomware_sample_top_10_ransomware)

sample_size = len(ls_proportion_ransomware_sample_top_10_ransomware)
sample_mean = mean_proportion_ransomware_sample_top_10_ransomware
population_mean = mean_proportion_ransomware
population_std_dev = std_dev_proportion_ransomware
alpha = 0.05

z_score = (sample_mean - population_mean) / (population_std_dev / np.sqrt(sample_size))
z_critical = stats.norm.ppf(1 - alpha)

print('z_score:', z_score)
print('z_critical:', z_critical)

if z_score > z_critical:
    print('Reject null hypothesis')
else:
    print('Fail to reject null hypothesis')

z_score: 1.5419853838178637
z_critical: 1.6448536269514722
Fail to reject null hypothesis


In [30]:
# find entries in df_transaction_counts which correspond to the top 5 ransomware gtrend values
df_transaction_counts_top_10_ransomware = df_transaction_counts[df_transaction_counts['year_month'].isin(top_10_ransomware_gtrend)]

# perform Z-test on the top 10 ransomware gtrend values
ls_proportion_ransomware_top_10_ransomware = df_transaction_counts_top_10_ransomware['proportion_ransomware'].tolist()
mean_proportion_ransomware_top_10_ransomware = np.mean(ls_proportion_ransomware_top_10_ransomware)
std_dev_proportion_ransomware_top_10_ransomware = np.std(ls_proportion_ransomware_top_10_ransomware)

sample_size = len(ls_proportion_ransomware_top_10_ransomware)
sample_mean = mean_proportion_ransomware_top_10_ransomware
population_mean = mean_proportion_ransomware
population_std_dev = std_dev_proportion_ransomware
alpha = 0.05

z_score = (sample_mean - population_mean) / (population_std_dev / np.sqrt(sample_size))
z_critical = stats.norm.ppf(1 - alpha)

print('z_score:', z_score)
print('z_critical:', z_critical)

if z_score > z_critical:
    print('Reject null hypothesis')
else:
    print('Fail to reject null hypothesis')

z_score: 1.5853705504527267
z_critical: 1.6448536269514722
Fail to reject null hypothesis
