In [34]:
import csv
import numpy as np
import statistics
import matplotlib.pyplot as plt

In [35]:
# Reading in my CSV
data_file = open("litreleases_ponzi_analysis_UPDATED.csv", "rU") 

  


In [36]:
# convert to DictReader
# convert rows to dictionaries, so we can pull data by field name
data_reader = csv.DictReader(data_file)


In [37]:
print(data_reader.fieldnames[10])

Investors_Int


In [38]:
# Build "average amount of money lost per investor" for each ponzi scheme/row
average_lost_per_investor = [] # empty list that will become new column

# Set up exception block
for row in data_reader:
  try:
    # DictReader creates a list of column headers called "fieldnames"
    # I will use the indices of this to grab the column value and cast it as a float
    investors_number = float(row[data_reader.fieldnames[10]])
    money_raised = float(row[data_reader.fieldnames[12]])
    average_lost_per_investor.append((money_raised / investors_number) * 1000000) # multiplying by 1000000 here to convert units from millions to dollars
  
  except Exception:
    pass



In [39]:
# Customizing the appearance of my plot
plt.rcdefaults()
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.facecolor' : '#e8e8e8',
    'axes.facecolor' : '#e8e8e8',
    'axes.spines.left'   : True,   
    'axes.spines.bottom' : True,
    'axes.spines.top'    : False,
    'axes.spines.right' : False,
    'axes.linewidth'     : 0.25 
})


In [41]:
# plot histogram of average amount lost per investor in each scheme
my_hist = plt.hist(average_lost_per_investor, histtype="bar", color="#464555")

# calculate quartiles
quartiles = np.percentile(average_lost_per_investor, [25, 50, 75])
iqr = quartiles[2] - quartiles[0]
upper_bound = quartiles[2] + (1.5*iqr)
lower_bound = quartiles[0] - (1.5*iqr)

# plot the quartiles in orange
plt.vlines(quartiles, 0, 250, color=['#ffc75f', '#ffc75f', '#ffc75f'], linestyle='dashed', linewidth=1)

# plot the upper and lower bounds in orange
plt.vlines([upper_bound, lower_bound], 0, 250, color=['#ffc75f','#ffc75f'], linestyle='dashed', linewidth=1)

# calculate mean and std deviation lines
data_mean = statistics.mean(average_lost_per_investor)
std_dev = np.std(average_lost_per_investor)

# build datapoints for showing std deviations
mean_measures = [data_mean]
for i in range(4):
  mean_measures.append(data_mean + (i*std_dev))
  mean_measures.append(data_mean - (i*std_dev))

# plot the mean & std deviation lines in pink
plt.vlines(mean_measures, 0, 250, color=['#ff6f91', '#ff6f91', '#ff6f91'], linestyle='dashed', linewidth=1)

# print the values in the terminal
print('The mean is $', round(statistics.mean(average_lost_per_investor)), "lost per investor")
print('The median is $', round(statistics.median(average_lost_per_investor)), "lost per investor")
print('The 25%, 50%, and 75% quartile values are', np.percentile(average_lost_per_investor, [25, 50, 75]))
print('The mean and standard deviation values are', mean_measures)

# # show the plot
# plt.show()


The mean is $ 523213 lost per investor
The median is $ 150000 lost per investor
The 25%, 50%, and 75% quartile values are [ 66666.66666667 150000.         290322.58064516]
The mean and standard deviation values are [523212.58774180815, 523212.58774180815, 523212.58774180815, 2764989.2407391835, -1718564.0652555674, 5006765.893736559, -3960340.718252943, 7248542.546733934, -6202117.371250318]
