In [1]:
pip install tabulate



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from tabulate import tabulate
%matplotlib inline

# Notebook-wide typography: Verdana as the sans-serif face, 14 pt base size.
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Verdana'],
    'font.size': 14,
})

In [None]:
# Part A-1: number of transactions in each month.

# Load the first CSV produced by the Spark program partA.py and preview it.
df_count = pd.read_csv('partA_1_spark.csv')
display(df_count.head())

# Draw one bar chart per year: months on the x-axis, transaction counts on the y-axis.
for year, year_rows in df_count.groupby('Year'):
    year_fig, year_ax = plt.subplots(figsize=(10, 8))
    year_ax.bar(year_rows['Month'], year_rows['No_of_transactions'])
    year_ax.set_xlabel(year)
    year_ax.set_ylabel('Number of transactions')
    plt.show()

# Consolidated bar chart: every month of every year on a single axis.

# Build a 'Year-Month' label per row, then drop the two source columns it replaces.
df_count['Date'] = df_count['Year'].astype(str) + '-' + df_count['Month'].astype(str)
df_count = df_count.drop(columns=['Month', 'Year'])
df_count['No_of_transactions'] = df_count['No_of_transactions'].astype(float)

fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(df_count['Date'], df_count['No_of_transactions'], color='crimson')
# Hide the tick marks themselves; the tick labels stay visible.
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
plt.xticks(rotation=90)
ax.set_title('Monthly transactions (2015-2019)')
ax.set_xlabel('Time')
ax.set_ylabel('Number of transactions')
# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and is rejected by newer releases.
ax.grid(True, color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

plt.show()

In [None]:
# Part A-2: average transaction value in each month.

# Load the second CSV produced by the Spark program partA.py.
df_avg = pd.read_csv('partA_2_spark.csv')

# Replace the separate Year/Month columns with a single 'Year-Month' label.
df_avg = (
    df_avg
    .assign(Date=df_avg['Year'].astype(str) + '-' + df_avg['Month'].astype(str))
    .drop(columns=['Month', 'Year'])
)
df_avg['Average value'] = df_avg['Average value'].map(float)
display(df_avg)

avg_fig, avg_ax = plt.subplots(figsize=(20, 6))
avg_ax.bar(df_avg['Date'], df_avg['Average value'])
plt.xticks(rotation=90)
avg_ax.set_title('Monthly average value of transactions (2015-2019)')
avg_ax.set_xlabel('Time')
avg_ax.set_ylabel('Average value of transactions')
plt.show()



In [None]:
# Part B: the ten most popular services (contracts).

contracts_df = pd.read_csv('partB_spark.csv')
display(contracts_df)

# Render the same frame a second time as a plain-text table.
contracts_table = tabulate(contracts_df, tablefmt="pretty")
print(contracts_table)


In [None]:
# Part C: the ten most active miners.

top_miners = pd.read_csv('partC_spark.csv')
display(top_miners)

# Horizontal bar chart of the total block size attributed to each listed miner.
fig, ax = plt.subplots()
sns.barplot(x="Total block size", y="miner", data=top_miners, ax=ax)
ax.set_title('Top 10 most active miners')
# Render the figure explicitly so the cell does not end on a bare Axes repr.
plt.show()

# **Part D. Data Exploration (50%)**

Part D-1. Scam Analysis

Popular Scams: Utilising the provided scam dataset, what is the most lucrative form of scam? How does this change throughout time, and does this correlate with certain known scams going offline/inactive? 

In [None]:

# Part D-1: scam transaction value, rolled up from individual dates to months.

daily_scam_df = pd.read_csv('partD-1-3.csv')
daily_scam_df = daily_scam_df.rename(columns={'sum(value)': 'Total value'})
daily_scam_df['Date'] = pd.to_datetime(daily_scam_df['Date'])
# 'YYYY-MM' keys sort lexicographically in chronological order, so the
# groupby below yields the months in time order.
daily_scam_df['Month-Year'] = daily_scam_df['Date'].dt.strftime('%Y-%m')
daily_scam_df['Total value'] = daily_scam_df['Total value'].astype(float)
daily_scam_df = daily_scam_df.drop(columns='Date')

# Aggregate only the value column; a bare .sum() would also try to add up any
# other column the CSV happens to carry.
monthly_scam = daily_scam_df.groupby('Month-Year', as_index=False)['Total value'].sum()

fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(monthly_scam['Month-Year'], monthly_scam['Total value'], marker='o', color='firebrick')
ax.set_title('Scam transaction values')
ax.set_xlabel('Time')
ax.set_ylabel('Monthly total value of scams')
plt.xticks(rotation=45)
plt.show()



Part D-2 : Machine learning - price forecasting model

In [None]:

# Part D-2: observed ETH price against the model's prediction.

df_price_forecast = pd.read_csv('partD-2.csv')
# Parse the dates so Matplotlib draws a real time axis; left as strings, every
# row becomes its own categorical tick.
df_price_forecast['Date'] = pd.to_datetime(df_price_forecast['Date'])
display(df_price_forecast)

fig, ax = plt.subplots(figsize=(20, 8))
ax.plot('Date', 'Adj Close', data=df_price_forecast, color='blue', label='Actual (Adj Close)')
ax.plot('Date', 'prediction', data=df_price_forecast, color='red', linestyle='dashed', label='Prediction')
ax.set_title('ETH Price Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Price (USD)')
# Two series share the axes, so a legend is needed to tell them apart.
ax.legend()
ax.grid(True)
# autofmt_xdate applies its own tick-label rotation, overriding any earlier
# plt.xticks(rotation=...) call, so the 45 degree angle is requested here.
fig.autofmt_xdate(rotation=45)
plt.show()

Part D-5. Miscellaneous Analysis

1. Fork the Chain: There have been several forks of Ethereum in the past. Identify one or more of these and see what effect it had on price and general usage. For example, did a price surge/plummet occur and who profited most from this? 

In [None]:
# Spark source code: partD-5.py
# The Spark program generates three CSV files.

# First CSV file, 'partD-5-1.csv': mean value of all the transactions around
# the fork date, Oct 16, 2017.
df_transaction_mean = pd.read_csv('partD-5-1.csv')
df_transaction_mean = df_transaction_mean.rename(
    columns={'date_time': 'Date', 'avg(value)': 'Average value (Wei)'}
)
# pd.to_datetime returns a new Series, so the result has to be assigned back
# for the column to actually hold datetimes.
df_transaction_mean['Date'] = pd.to_datetime(df_transaction_mean['Date'])
df_transaction_mean['Average value (Wei)'] = df_transaction_mean['Average value (Wei)'].astype(float)
# A line plot connects points in row order; keep the rows chronological.
df_transaction_mean = df_transaction_mean.sort_values(by='Date')
display(df_transaction_mean)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df_transaction_mean['Date'], df_transaction_mean['Average value (Wei)'], marker='o')
ax.set_title('Average transaction before/after fork')
ax.set_xlabel('Date')
ax.set_ylabel('Average transaction value (Wei)')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Second CSV file, 'partD-5-2.csv': transactions in the same period.
# Only the first ten rows are shown.
df_transaction_max = pd.read_csv('partD-5-2.csv')
display(df_transaction_max.iloc[:10])


2. Gas Guzzlers: For any transaction on Ethereum a user must supply gas. How has gas price changed over time? Have contracts become more complicated, requiring more gas, or less so? How does this correlate with your results seen within Part B. 

In [None]:
# Part D-6, gas guzzlers: average gas price per month.

# Read the first CSV file. Rows are ordered by MonthYear, the same step the two
# other gas cells apply, because the line plot joins points in row order.
df_mean_gas_monthly = pd.read_csv('partD-6.csv').sort_values(by='MonthYear', ascending=True)
display(df_mean_gas_monthly)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df_mean_gas_monthly['MonthYear'], df_mean_gas_monthly['avg(gas_price)'], marker='o', color='coral')
ax.set_title('Average gas price (monthly)')
ax.set_xlabel('Time')
ax.set_ylabel('Average gas price')
plt.xticks(rotation=75)
plt.show()

In [None]:
# Part D-6, gas guzzlers (2): average gas for the top 10 contracts, month by month.

# Load the CSV and put the rows in ascending MonthYear order before plotting.
df_mean_gas_monthly_top10 = pd.read_csv('partD-6-2.csv').sort_values(by='MonthYear', ascending=True)
display(df_mean_gas_monthly_top10)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(
    df_mean_gas_monthly_top10['MonthYear'],
    df_mean_gas_monthly_top10['avg(gas)'],
    marker='o',
    color='darkgreen',
)
ax.set_title('Average gas for Top 10 contracts (monthly)')
ax.set_xlabel('Time')
ax.set_ylabel('Average gas')
plt.xticks(rotation=75)
plt.show()

In [None]:
# Part D-6, gas guzzlers: average gas for all contracts, month by month.

# Load the CSV and put the rows in ascending MonthYear order before plotting.
df_mean_gas_monthly_all = pd.read_csv('partD-6-3.csv').sort_values(by='MonthYear', ascending=True)
display(df_mean_gas_monthly_all)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(
    df_mean_gas_monthly_all['MonthYear'],
    df_mean_gas_monthly_all['avg(gas)'],
    marker='o',
    color='teal',
)
ax.set_title('Average gas for all contracts (monthly)')
ax.set_xlabel('Time')
ax.set_ylabel('Average gas')
plt.xticks(rotation=75)
plt.show()