In [1]:
import cymetric as cym
import pandas as pd
#import sklearn as skl
import math

import numpy as np
import random
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (40,24)
from cymetric import graphs as cgr
from cymetric import timeseries as tm
from cymetric import filters as fl

In [2]:
file = "run1.sqlite"
db = cym.dbopen(file)
ev = cym.Evaluator(db=db, write=True)
#cym.graphs.flow_graph(ev, label = "mass")

#### extract data from the sqlite file

In [38]:
def tidy_df(file):
    db = cym.dbopen(file)
    ev = cym.Evaluator(db=db, write=True)
    
    agentTable = ev.eval("AgentEntry")
    try: 
        agents = agentTable.loc[:, ["AgentId", "Prototype"]]
    except: 
        print("there was an agent issue with " + file)
    
    transTable = ev.eval("Transactions")
    try: 
        transactions = transTable.loc[:, ["SenderId", "ReceiverId", "ResourceId", "Commodity", "Time"]]
    except: 
        print("there was an transactions issue with " + file)
    
    resourceTable = ev.eval("Resources")
    try: 
        resources = resourceTable.loc[:, ["ResourceId", "Quantity"]]
    except:
        print("there was a resources issue with " + file)
    
#     reactorTable= ev.eval("ReactorEvents")
#     try: 
#         reactor_events = reactorTable.loc[:, ["Time", "Event"]]
#         emissions = reactor_events.loc[reactor_events["Event"] == "DISCHARGE"]
#         print("EMISSIONS", emissions)
#     except: 
#         print("there was a reactor issue with " + file)
    
    #merge agents, transactions, and resources
    int1 = pd.merge(transactions, resources, on='ResourceId', how='inner')

    #rename AgentId column to facilitate merge 
    send = agents.rename(columns = {"AgentId": "SenderId"})
    receive = agents.rename(columns = {"AgentId": "ReceiverId"})

    for i in range(len(int1)):
        for j in range(len(send)):
            if int1.loc[i,"SenderId"] == send.loc[j, "SenderId"]:
                int1.loc[i,"SenderId"] = send.loc[j, "Prototype"]
                
    for i in range(len(int1)):
        for j in range(len(receive)):
            if int1.loc[i,"ReceiverId"] == receive.loc[j, "ReceiverId"]:
                int1.loc[i,"ReceiverId"] = receive.loc[j, "Prototype"]

                
    print(send)
    print(int1)
    return int1
    

#### trim resource-identifying columns

In [4]:
def trim_data(df):
    trimmed = df[["SenderId", "ReceiverId", "Time", "Quantity"]]
    trimmed["fraction"] = pd.Series(0, index = range(316)) #modular? 
    trimmed["truck"] = pd.Series(0, index = range(316))
    
    #assume leu and heu enrichment happens in the same physical facility
    #drop rows with transactions between enrichment facilities
    short = trimmed.loc[(trimmed['SenderId'] != "LEUenrich") & (trimmed['ReceiverId'] != "LEUtoHEUenrich")]
    
    #change the name of enrichment facilites 
    short["SenderId"] = short['SenderId'].replace({'LEUenrich': 'enrichment', 'LEUtoHEUenrich': 'enrichment'})
    short['ReceiverId'] = short['ReceiverId'].replace({'LEUenrich': 'enrichment', 'LEUtoHEUenrich': 'enrichment'})
    
    return short

#### define truck

for each transaction, cap shipment at a certain mass m ( = 25000kg? is realisitc, but a smaller value might be more interesting)
create a new column with the number of trucks sent for this interaction (0 + )
consider storing heu for several timesteps


In [5]:
from collections import Counter
def send_trucks(truck_df, truck_size): 
    truck_df["fraction"] = truck_df["Quantity"] / truck_size #reevaluate this if you want the trucks to have different capacities
    
    #collect set of all transaction types: 
    transaction_pairs = Counter()
    for i in range(len(truck_df)): 
        #add this transaction to the counter
        #check if the value for that type of transaction is greater than the value for the truck 
        #if so, send a truck and update the value for the transaction type
        #if not, update the value for the transaction type 
        #0: senderid, 1: receiverid, 2: time, 3: quantity, 4: fraction, 5: truck
        sender = truck_df.iloc[i, 0]
        receiver = truck_df.iloc[i, 1]
        transaction_pairs.update({(sender, receiver): truck_df.iloc[i, 3]})
        stored_material = transaction_pairs[(sender, receiver)]
        if stored_material >= truck_size: 
            trucks = stored_material // truck_size
            truck_df.at[i, "truck"] = trucks
            transaction_pairs[(sender, receiver)] -= trucks * truck_size
        else: 
            truck_df.at[i, "truck"] = 0
        
            
    print(transaction_pairs)
    print(truck_df)
        
    return truck_df
    

#### flatten data into single row for use with other runs

columns: every perumutation of transaction (truck boolean), every timestamp

row: single cyclus run 

transactions: mine to enrichment, enrichment to reactor, reactor to sf sink, enrichment to heu sink

#### modify this to look for its own facilities

In [45]:
import numpy as np
import pandas as pd

def isotope_signal(filename): #parameters: duration, dt, some transaction table (check for reactor refueling)
    #converted to seconds:
    half131m = (11.9*24*60*60) #days
    half133 = (5.25*24*60*60) #days
    half133m = (2.19*24*60*60) #days
    half135 = (9.10*60*60) #hours
    time_step = (30*24*60*60)
    
    l131m = -np.log(2) / half131m
    l133 = -np.log(2) / half133
    l133m = -np.log(2) / half133m
    l135 = -np.log(2) / half135
    
    tidydf = tidy_df(filename)

    #find all of the reactor cycle starts 
    #calculate ratios for each t in cycle
    isotope_rows = []
    '''
    for time step in time steps: 
        if before first delivery: 
            add only background (start with 0)
            
        calculate new ratios from previous time step
        
        if end of reactor cycle: 
            create new initial signal (n for each of the 4 isotopes)
            calculate ratios
            add to variable
        
        append ratios to row 
    '''
    duration = int(tidydf["Time"].max())
    print(duration)
    cycle_ends = tidydf[tidydf["SenderId"] == "LWR"]["Time"].to_list()
    print(cycle_ends)
    print(type(cycle_ends[0]))
    
    for i, t in enumerate(range(duration+1)): 
        row = {"135/133m": 0, "135/133": 0, "135/131m": 0, \
               "133m/133": 0, "133m/131m": 0, "133/131m": 0}
        
        if t >= cycle_ends[0]: #every time the reactor sends material elsewhere
            previous_row = isotope_rows[i-1]
            #how many half lives in 30 days? 
            #N(t) = N(0)/2 ** (t/thalf)
            #Rn/m(t) = Rn/m(0)e^-(Ln - Lm)t
            row["135/133m"] = previous_row["135/133m"] * np.exp(-(l135-l133m)*t)
            row["135/133"] = previous_row["135/133"] * np.exp(-(l135-l133)*t)
            row["135/131m"] = previous_row["135/131m"] * np.exp(-(l135-l131m)*t)
            row["133m/133"] = previous_row["133m/133"] * np.exp(-(l133m-l133)*t)
            row["133m/131m"] = previous_row["133m/131m"] * np.exp(-(l133m-l131m)*t)
            row["133/131m"] = previous_row["133/131m"] * np.exp(-(l133-l131m)*t)
    
            if t in cycle_ends: #first isotopes released after first cycle
                #multiply each by random variable
                row["135/133m"]  += 607
                row["135/133"]   += 66.4
                row["135/131m"]  += 220000
                row["133m/133"]  += 0.109
                row["133m/131m"] += 363
                row["133/131m"]  += 3320

        isotope_rows.append(row)  
        
    isotope_columns = []
    
    for t in range(144): 
        for key, value in isotope_rows[t].items(): 
            isotope_columns.append({f'{key}_t{t}': value})

    isotope_df = pd.DataFrame(isotope_columns)
    long_row = isotope_df.sum().to_frame().T
    
    return long_row
 
test = isotope_signal("run1.sqlite")
print(test)


#turn the isotope signal into a 1 row by 6*timestep column df that can be appended to trucks df 
# isotope_columns = []
# for t in range(144): 
#     for key, value in test[t].items(): 
#         isotope_columns.append({f'{key}_t{t}': value})
        
# print(len(isotope_columns))
# #print(isotope_columns)
# isotope_df = pd.DataFrame(isotope_columns)
# # isotope_df = pd.Series(isotope_columns)
# sums = isotope_df.sum().to_frame().T
# print(isotope_df)
# print(sums)                                           
# print(type(sums))


   SenderId          Prototype
0        20  SimpleHiderRegion
1        21               inst
2        22        UraniumMine
3        23          LEUenrich
4        24            reactor
5        25      SpentFuelSink
6        26     LEUtoHEUenrich
7        27            HEUSink
           SenderId      ReceiverId  ResourceId Commodity  Time      Quantity
0       UraniumMine       LEUenrich          11    c_uore     0  10000.000000
1         LEUenrich  LEUtoHEUenrich          30     c_leu     1    100.000000
2       UraniumMine       LEUenrich          32    c_uore     1  10000.000000
3       UraniumMine       LEUenrich          48    c_uore     2  10000.000000
4    LEUtoHEUenrich         HEUSink          51     c_heu     2      4.124861
..              ...             ...         ...       ...   ...           ...
311     UraniumMine       LEUenrich        2233    c_uore   141  10000.000000
312     UraniumMine       LEUenrich        2242    c_uore   142  10000.000000
313  LEUtoHEUenrich

In [7]:
def make_cols(max_time):
    col_names = ["diversion"]
    #edit so that transactions include all but the heu going to the heusink 
#     transactions = {("UraniumMine", "enrichment"), ("enrichment", "reactor"), 
#                     ("reactor", "SpentFuelSink"), }#("enrichment", "HEUSink")}
    #["mine--enrich", "enrich--reactor", "enrich--heusink", "reactor--sfsink"]
#         ('milling', 'conversion'),('mil_enrichment', 'mil_str_u_dep'),('civ_enrichment', 'civ_str_u_dep'), 
#         ('conversion', 'mil_uox_fabrication'), ('civ_enrichment', 'civ_fabrication'), 
#         ('mine', 'milling'), ('conversion', 'civ_enrichment'), ('mil_enrichment', 'mil_str_fiss'), 
#         ('conversion', 'mil_enrichment')
    transactions = {
          ('civ_enrichment', 'civ_str_u_dep'), ('mine', 'milling'), ('milling', 'conversion'), 
          ('civ_enrichment', 'civ_fabrication'), ('conversion', 'civ_enrichment')
    }
    
    for t in range(max_time):
        for trans in sorted(transactions): 
            col_names.append(trans[0] + "--" + trans[1] + "|time" + str(t))
    return col_names
 
def make_row(truckdf, max_time):
    long_row = []
    #long_row.append("HEUSink" in truckdf["ReceiverId"].tolist())
    long_row.append("mil_enrichment" in truckdf["ReceiverId"].tolist())
    # mil_enrichment
#         ('milling', 'conversion'),('mil_enrichment', 'mil_str_u_dep'),('civ_enrichment', 'civ_str_u_dep'), 
#         ('conversion', 'mil_uox_fabrication'), ('civ_enrichment', 'civ_fabrication'), 
#         ('mine', 'milling'), ('conversion', 'civ_enrichment'), ('mil_enrichment', 'mil_str_fiss'), 
#         ('conversion', 'mil_enrichment')
    transactions = {
        #("UraniumMine", "enrichment"), ("enrichment", "reactor"), 
                    #("reactor", "SpentFuelSink"), #("enrichment", "HEUSink")
           ('civ_enrichment', 'civ_str_u_dep'), ('mine', 'milling'), ('milling', 'conversion'), 
           ('civ_enrichment', 'civ_fabrication'), ('conversion', 'civ_enrichment')
    }
    sorted_trans = sorted(transactions)
    for t in range(max_time):
        #subset rows with this timestep
        subset = truckdf.loc[truckdf['Time'] == t]
        sub_row = [0] * len(transactions)
        
        for index, row in subset.iterrows():
            #check each possible transaction
            for t in range(len(transactions)): 
                if row["SenderId"] == sorted_trans[t][0] and row["ReceiverId"] == sorted_trans[t][1]:    
                    sub_row[t] = truckdf.loc[index, "truck"]
#             if row["SenderId"] == "UraniumMine" and row["ReceiverId"] == "enrichment":
#                 #sub_row[0] = ("mine--enrich") #switch to the number of trucks sent!
#                 sub_row[0] = truckdf.loc[index, "truck"]
                
#             if row["SenderId"] == "enrichment" and row["ReceiverId"] == "reactor":
#                 sub_row[1] = truckdf.loc[index, "truck"]

#             if row["SenderId"] == "enrichment" and row["ReceiverId"] == "HEUSink":
#                 sub_row[2] = truckdf.loc[index, "truck"]

#             if row["SenderId"] == "reactor" and row["ReceiverId"] == "SpentFuelSink":
#                 sub_row[3] = truckdf.loc[index, "truck"]
        long_row.extend(sub_row)
    return long_row

In [42]:
def file_to_line(filename, truck_size, max_time):
    return make_row(send_trucks(trim_data(tidy_df(filename)), truck_size), max_time)
    
def simulation_data(files, truck_size, max_time):
    columns = make_cols(max_time)
    print(len(columns))
    rows = []
    for file in files: 
        
        rows.append(file_to_line(file, truck_size, max_time))
#         row = file_to_line(file, truck_size, max_time)
#         rows.append(pd.concat([pd.DataFrame(data=row, columns=columns), isotope_signal()], axis=1))
    
    isotope_rows = []
    for file in files: 
        print(file)
        isotopes = isotope_signal(file)
        isotope_rows.append(isotopes.iloc[0].values.tolist())
        if file == files[0]: 
            isotope_columns = isotopes.columns.to_list()
        
    return pd.concat([pd.DataFrame(data = rows, columns = columns), pd.DataFrame(data=isotope_rows, columns=isotope_columns)], axis=1)
    #return pd.DataFrame(data = rows, columns = columns)
    #return pd.concat([pd.DataFrame(data = rows, columns = columns), isotope_signal()], axis=1)


In [46]:
simulation_data(["out-run0.py-.sqlite", "out-run1.py-.sqlite"], 12000, 144)

721
there was an agent issue with out-run0.py-.sqlite
there was an transactions issue with out-run0.py-.sqlite
there was a resources issue with out-run0.py-.sqlite


UnboundLocalError: local variable 'transactions' referenced before assignment

In [10]:
#big_df = simulation_data(files[0:], 20000, 144)
#small_df = simulation_data(files[0:20], 20000, 144)
#med_df = simulation_data(files[0:100], 20000, 144)
#tweak = simulation_data(files[0:], 20000, 144)
#simple = simulation_data(files[0:], 20, 144)

import os
files = []
for file in os.listdir("swu_cycle_variance"):
    if file.endswith(".sqlite"):
        #print(file)
        #files.append(os.path.join("swu_cycle_variance", file))
        files.append(os.path.join("swu_cycle_variance", file))

print(files[0:10])
big_df = simulation_data(files[0:], 20000, 144)
#smaller_trucks = simulation_data(files[0:], 10000, 144)

['swu_cycle_variance/out-run582.py-.sqlite', 'swu_cycle_variance/out-run853.py-.sqlite', 'swu_cycle_variance/out-run497.py-.sqlite', 'swu_cycle_variance/out-run233.py-.sqlite', 'swu_cycle_variance/out-run275.py-.sqlite', 'swu_cycle_variance/out-run358.py-.sqlite', 'swu_cycle_variance/out-run700.py-.sqlite', 'swu_cycle_variance/out-run357.py-.sqlite', 'swu_cycle_variance/out-run612.py-.sqlite', 'swu_cycle_variance/out-run458.py-.sqlite']
1585
          SenderId       ReceiverId  ResourceId Commodity  Time      Quantity
0             mine          milling           6     u_ore     1  2.000000e+08
1          milling       conversion          10     u_nat     2  1.500000e+05
2             mine          milling          19     u_ore     2  1.515152e+08
3          milling       conversion          24     u_nat     3  1.500000e+05
4             mine          milling          37     u_ore     3  1.851749e+08
5          milling       conversion          59     u_nat     4  1.020202e+05
6       

          SenderId       ReceiverId  ResourceId Commodity  Time      Quantity
0             mine          milling           6     u_ore     1  2.000000e+08
1          milling       conversion          10     u_nat     2  1.500000e+05
2             mine          milling          19     u_ore     2  1.515152e+08
3          milling       conversion          24     u_nat     3  1.500000e+05
4             mine          milling          37     u_ore     3  1.851749e+08
5          milling       conversion          59     u_nat     4  1.020202e+05
6             mine          milling          61     u_ore     4  1.627558e+08
7             mine          milling          82     u_ore     5  1.215940e+08
8       conversion   civ_enrichment        1713       uf6   121  1.000000e+05
9          milling       conversion        1736     u_nat   122  1.010101e+05
10  civ_enrichment  civ_fabrication        1739   civ_leu   122  8.723404e+03
11            mine          milling        1762     u_ore   123 

           SenderId      ReceiverId  ResourceId  Commodity  Time      Quantity
0              mine         milling           6      u_ore     1  2.000000e+08
1           milling      conversion          10      u_nat     2  1.500000e+05
2              mine         milling          19      u_ore     2  1.515152e+08
3           milling      conversion          24      u_nat     3  1.500000e+05
4              mine         milling          37      u_ore     3  1.851749e+08
..              ...             ...         ...        ...   ...           ...
250      conversion  mil_enrichment        3289        uf6   142  2.000000e+04
251            mine         milling        3291      u_ore   142  3.624782e+07
252  mil_enrichment   mil_str_u_dep        3262  mil_u_dep   142  1.990858e+04
253         milling      conversion        3316      u_nat   143  2.020202e+04
254  mil_enrichment    mil_str_fiss        3319   mil_fiss   143  9.141583e+01

[255 rows x 6 columns]
Counter({('mil_enrichment', 

           SenderId      ReceiverId  ResourceId  Commodity  Time      Quantity
0              mine         milling           6      u_ore     1  2.000000e+08
1           milling      conversion          10      u_nat     2  1.500000e+05
2              mine         milling          19      u_ore     2  1.515152e+08
3           milling      conversion          24      u_nat     3  1.500000e+05
4              mine         milling          37      u_ore     3  1.851749e+08
..              ...             ...         ...        ...   ...           ...
447         milling      conversion        3992      u_nat   143  1.517441e+04
448            mine         milling        3994      u_ore   143  9.304272e+06
449      conversion  mil_enrichment        3997        uf6   143  4.977331e+03
450  mil_enrichment   mil_str_u_dep        3962  mil_u_dep   143  4.954580e+03
451  mil_enrichment    mil_str_fiss        4000   mil_fiss   143  6.866549e+01

[452 rows x 6 columns]
Counter({('conversion', 'mil

          SenderId       ReceiverId  ResourceId Commodity  Time      Quantity
0             mine          milling           6     u_ore     1  2.000000e+08
1             mine          milling          19     u_ore     2  1.515152e+08
2          milling       conversion          10     u_nat     2  1.500000e+05
3             mine          milling          37     u_ore     3  1.851749e+08
4          milling       conversion          24     u_nat     3  1.500000e+05
5             mine          milling          58     u_ore     4  1.627558e+08
6          milling       conversion          61     u_nat     4  1.020202e+05
7             mine          milling          82     u_ore     5  1.215940e+08
8       conversion   civ_enrichment        1713       uf6   121  1.000000e+05
9          milling       conversion        1736     u_nat   122  1.010101e+05
10  civ_enrichment  civ_fabrication        1739   civ_leu   122  8.723404e+03
11            mine          milling        1762     u_ore   123 

           SenderId      ReceiverId  ResourceId  Commodity  Time      Quantity
0              mine         milling           6      u_ore     1  2.000000e+08
1           milling      conversion          10      u_nat     2  1.500000e+05
2              mine         milling          19      u_ore     2  1.515152e+08
3           milling      conversion          24      u_nat     3  1.500000e+05
4              mine         milling          37      u_ore     3  1.851749e+08
..              ...             ...         ...        ...   ...           ...
248            mine         milling        3300      u_ore   142  3.624782e+07
249      conversion  mil_enrichment        3303        uf6   142  2.000000e+04
250  mil_enrichment   mil_str_u_dep        3274  mil_u_dep   142  1.990858e+04
251         milling      conversion        3328      u_nat   143  2.020202e+04
252  mil_enrichment    mil_str_fiss        3331   mil_fiss   143  9.141583e+01

[253 rows x 6 columns]
Counter({('mil_enrichment', 

          SenderId       ReceiverId  ResourceId Commodity  Time      Quantity
0             mine          milling           6     u_ore     1  2.000000e+08
1          milling       conversion          10     u_nat     2  1.500000e+05
2             mine          milling          19     u_ore     2  1.515152e+08
3          milling       conversion          24     u_nat     3  1.500000e+05
4             mine          milling          37     u_ore     3  1.851749e+08
5          milling       conversion          59     u_nat     4  1.020202e+05
6             mine          milling          61     u_ore     4  1.627558e+08
7             mine          milling          82     u_ore     5  1.215940e+08
8       conversion   civ_enrichment        1713       uf6   121  1.000000e+05
9          milling       conversion          58     u_nat   122  4.797980e+04
10         milling       conversion        1735     u_nat   122  5.303030e+04
11  civ_enrichment  civ_fabrication        1741   civ_leu   122 

           SenderId      ReceiverId  ResourceId  Commodity  Time      Quantity
0              mine         milling           6      u_ore     1  2.000000e+08
1              mine         milling          19      u_ore     2  1.515152e+08
2           milling      conversion          10      u_nat     2  1.500000e+05
3              mine         milling          37      u_ore     3  1.851749e+08
4           milling      conversion          24      u_nat     3  1.500000e+05
..              ...             ...         ...        ...   ...           ...
472            mine         milling        4402      u_ore   143  1.085856e+07
473         milling      conversion        4405      u_nat   143  5.937521e+03
474      conversion  mil_enrichment        4408        uf6   143  5.878146e+03
475  mil_enrichment   mil_str_u_dep        4372  mil_u_dep   143  5.851278e+03
476  mil_enrichment    mil_str_fiss        4414   mil_fiss   143  2.686778e+01

[477 rows x 6 columns]
Counter({('mine', 'milling')

          SenderId       ReceiverId  ResourceId Commodity  Time      Quantity
0             mine          milling           6     u_ore     1  2.000000e+08
1          milling       conversion          10     u_nat     2  1.500000e+05
2             mine          milling          19     u_ore     2  1.515152e+08
3          milling       conversion          24     u_nat     3  1.500000e+05
4             mine          milling          37     u_ore     3  1.851749e+08
5          milling       conversion          59     u_nat     4  1.020202e+05
6             mine          milling          61     u_ore     4  1.627558e+08
7             mine          milling          82     u_ore     5  1.215940e+08
8       conversion   civ_enrichment        1713       uf6   121  1.000000e+05
9          milling       conversion        1736     u_nat   122  1.010101e+05
10  civ_enrichment  civ_fabrication        1739   civ_leu   122  8.723404e+03
11            mine          milling        1762     u_ore   123 

           SenderId      ReceiverId  ResourceId  Commodity  Time      Quantity
0              mine         milling           6      u_ore     1  2.000000e+08
1              mine         milling          19      u_ore     2  1.515152e+08
2           milling      conversion          10      u_nat     2  1.500000e+05
3              mine         milling          37      u_ore     3  1.851749e+08
4           milling      conversion          24      u_nat     3  1.500000e+05
..              ...             ...         ...        ...   ...           ...
473            mine         milling        4501      u_ore   143  7.219273e+06
474         milling      conversion        4504      u_nat   143  4.036774e+03
475      conversion  mil_enrichment        4507        uf6   143  3.996406e+03
476  mil_enrichment   mil_str_u_dep        4471  mil_u_dep   143  3.978140e+03
477  mil_enrichment    mil_str_fiss        4513   mil_fiss   143  1.826674e+01

[478 rows x 6 columns]
Counter({('conversion', 'mil

          SenderId       ReceiverId  ResourceId Commodity  Time      Quantity
0             mine          milling           6     u_ore     1  2.000000e+08
1          milling       conversion          10     u_nat     2  1.500000e+05
2             mine          milling          19     u_ore     2  1.515152e+08
3          milling       conversion          24     u_nat     3  1.500000e+05
4             mine          milling          37     u_ore     3  1.851749e+08
5          milling       conversion          59     u_nat     4  1.020202e+05
6             mine          milling          61     u_ore     4  1.627558e+08
7             mine          milling          82     u_ore     5  1.215940e+08
8       conversion   civ_enrichment        1713       uf6   121  1.000000e+05
9          milling       conversion          58     u_nat   122  4.797980e+04
10         milling       conversion        1735     u_nat   122  5.303030e+04
11  civ_enrichment  civ_fabrication        1738   civ_leu   122 

KeyboardInterrupt: 

In [None]:
print(big_df.shape)
big_df.describe()


In [None]:
big_df.isnull().sum().sum()

## Statistics

In [None]:
simple.fillna(value = 0, axis=1, inplace = True)
simple.head()

In [None]:
print(sum(simple["diversion"]))
simple.describe()

#simple.isnull().sum().sum()

In [None]:
from sklearn.model_selection import train_test_split
#split into training and test sets
X = simple.loc[:, simple.columns != "diversion"]
y = simple["diversion"]
trucks_train, trucks_test, diversion_train, diversion_test = train_test_split(X, y, test_size=0.3)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
s_trucks_train, s_trucks_test, diversion_train, diversion_test = train_test_split(X_scaled, y, test_size=0.3)

In [None]:
#random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=8)
rf.fit(s_trucks_train, diversion_train)

rfy_pred = rf.predict(s_trucks_test)

In [None]:
from sklearn import metrics
#check performance
print("Random forest accuracy:",metrics.accuracy_score(diversion_test, rfy_pred))

features = pd.Series(rf.feature_importances_,index=simple.columns[1:]).sort_values(ascending=False)
#print([f for f in features])

In [None]:
from sklearn.svm import LinearSVC

linsvc = LinearSVC(C=1)
linsvc.fit(s_trucks_train, diversion_train)
linsvy_pred = linsvc.predict(s_trucks_test)

print("Linear SVC accuracy:", metrics.accuracy_score(diversion_test, linsvy_pred))

from sklearn import svm

svc = svm.SVC()
svc.fit(s_trucks_train, diversion_train)
svc_yhat = svc.predict(s_trucks_test)

print("SVC accuracy:", metrics.accuracy_score(diversion_test, svc_yhat))

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver = "sag")
lrmod = lr.fit(s_trucks_train, diversion_train)
lryhat = lr.predict(s_trucks_test)

print("Logistic regression accuracy:", metrics.accuracy_score(diversion_test, lryhat))

In [None]:
from sklearn import neighbors 
knn = neighbors.KNeighborsClassifier(3, weights = 'uniform')   
model = knn.fit(s_trucks_train, diversion_train)
print(model.score(s_trucks_train, diversion_train))

knnhat = model.predict(s_trucks_test)
print("KNN test accuracy:", metrics.accuracy_score(diversion_test, knnhat))

In [None]:
#what if it's raining? remove 30% of the data points 
X = simple.loc[:, simple.columns != "diversion"]
y = simple["diversion"]

#go through each column, select 30% of rows in that column, change value to 0
def mask(df, proportion):
    new = df.copy()
    for c in range(new.shape[1]):
        row_ids = random.sample(range(new.shape[0]), int(new.shape[0] * proportion))
        for row in row_ids: 
            new.iat[row, c] = 0 
    return new

X30 = mask(X, 0.3)
print(X30.equals(X))


trucks_train, trucks_test, diversion_train, diversion_test = train_test_split(X30, y, test_size=0.3)

scaler = StandardScaler()
X30_scaled = scaler.fit_transform(X30)
sn_trucks_train, sn_trucks_test, diversion_train, diversion_test = train_test_split(X30_scaled, y, test_size=0.3)

In [None]:
missing = []
for p in range(1, 8): 
    missing.append(mask(X, p * 0.1))

In [None]:
missing[0].equals(missing[1])

In [None]:
scaler = StandardScaler()
train_test = []
scaled = []
for m in missing: 
    train_test.append(train_test_split(m, simple["diversion"], test_size=0.3))
    mscaled = scaler.fit_transform(m)
    scaled.append(train_test_split(mscaled, simple["diversion"], test_size = 0.3))
  
    

In [None]:
X30.head()

In [None]:
svc = svm.SVC()
svc.fit(sn_trucks_train, diversion_train)
svc_yhat = svc.predict(sn_trucks_test)

print("SVC accuracy:", metrics.accuracy_score(diversion_test, svc_yhat))

In [None]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(trucks_train, diversion_train)

rfy_pred = rf.predict(trucks_test)
print("Random forest accuracy:",metrics.accuracy_score(diversion_test, rfy_pred))
print("Random forest auc:",metrics.roc_auc_score(diversion_test, rfy_pred))
print("Random forest confusion matrix:", metrics.confusion_matrix(diversion_test, rfy_pred))

In [None]:
knn = neighbors.KNeighborsClassifier(3, weights = 'uniform')   
model = knn.fit(sn_trucks_train, diversion_train)
print(model.score(sn_trucks_train, diversion_train))

knnhat = model.predict(sn_trucks_test)
print("KNN test accuracy:", metrics.accuracy_score(diversion_test, knnhat))
print("KNN auc:", metrics.roc_auc_score(diversion_test, knnhat))

In [None]:
rf_accuracy = []
rf_auc = []
for trucks_train, trucks_test, diversion_train, diversion_test in train_test: 
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(trucks_train, diversion_train)

    rfy_pred = rf.predict(trucks_test)
    rf_accuracy.append(metrics.accuracy_score(diversion_test, rfy_pred))
    rf_auc.append(metrics.roc_auc_score(diversion_test, rfy_pred))
    print("Random forest accuracy:",metrics.accuracy_score(diversion_test, rfy_pred))
    print("Random forest auc:",metrics.roc_auc_score(diversion_test, rfy_pred))
    print("Random forest confusion matrix:", metrics.confusion_matrix(diversion_test, rfy_pred))
    print()
    
print(rf_accuracy)
print(rf_auc)

In [None]:
svc_accuracy = []
svc_auc = []
knn_accuracy = []
knn_auc = []
for s_trucks_train, s_trucks_test, diversion_train, diversion_test in scaled:
    svc = svm.SVC()
    svc.fit(s_trucks_train, diversion_train)
    svc_yhat = svc.predict(s_trucks_test)
    svc_accuracy.append(metrics.accuracy_score(diversion_test, svc_yhat))
    svc_auc.append(metrics.roc_auc_score(diversion_test, svc_yhat))
    print("SVC accuracy:", metrics.accuracy_score(diversion_test, svc_yhat))
    print("SVC auc:",metrics.roc_auc_score(diversion_test, svc_yhat))
    print("SVC confusion matrix:", metrics.confusion_matrix(diversion_test, svc_yhat))
    print()
    
    knn = neighbors.KNeighborsClassifier(3, weights = 'uniform')   
    model = knn.fit(s_trucks_train, diversion_train)
    print(model.score(s_trucks_train, diversion_train))

    knnhat = model.predict(s_trucks_test)
    knn_accuracy.append(metrics.accuracy_score(diversion_test, knnhat))
    knn_auc.append(metrics.roc_auc_score(diversion_test, knnhat))
    print("KNN test accuracy:", metrics.accuracy_score(diversion_test, knnhat))
    print("KNN auc:",metrics.roc_auc_score(diversion_test, knnhat))
    print("KNN confusion matrix:", metrics.confusion_matrix(diversion_test, knnhat))
    print()

In [None]:
#for anna's model: 

print(svc_accuracy)


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize=(10,8))

#percent of data missing from the set on x axis, accuracy on y axis
plt.scatter([10, 20, 30, 40, 50, 60, 70], rf_accuracy, marker='o', color='b', alpha=0.7,
            s = 124, label='Random Forest Classifier')

plt.scatter([10, 20, 30, 40, 50, 60, 70], svc_accuracy, marker='o', color='r', alpha=0.7, 
            s = 124, label='Support Vector Classifier')

plt.scatter([10, 20, 30, 40, 50, 60, 70], knn_accuracy, marker='o', color='g', alpha=0.7, 
            s = 124, label='K-Nearest Neighbors Classifier')
plt.title("Accuracy of Classifiers for Schema A")
plt.xlabel("Percent of Data Excluded from Model")
plt.ylabel("Accuracy")
plt.xlim(0, 80)
plt.ylim(0.8, 1.05)
plt.legend(loc='lower left')
plt.show()

In [None]:

plt.figure(figsize=(10,8))

#percent of data missing from the set on x axis, accuracy on y axis
plt.scatter([10, 20, 30, 40, 50, 60, 70], rf_auc, marker='o', color='b', alpha=0.7,
            s = 124, label='Random Forest AUC')

plt.scatter([10, 20, 30, 40, 50, 60, 70], svc_auc, marker='o', color='r', alpha=0.7, 
            s = 124, label='SVC AUC')

plt.scatter([10, 20, 30, 40, 50, 60, 70], knn_auc, marker='o', color='g', alpha=0.7, 
            s = 124, label='KNN AUC')
plt.title("AUC of Classifiers for Schema A")
plt.xlabel("Percent of Data Excluded from Model")
plt.ylabel("AUC")
plt.xlim(0, 80)
plt.ylim(0.8, 1.05)
plt.legend(loc='lower left')
plt.show()

In [None]:
#for baptiste's model: 
#use big_df
X = big_df.loc[:, big_df.columns != "diversion"]
y = big_df["diversion"]
trucks_train, trucks_test, diversion_train, diversion_test = train_test_split(X, y, test_size=0.3)

missing = []
for p in range(1, 8): 
    missing.append(mask(X, p * 0.1))

In [None]:
scaler = StandardScaler()
train_test = []
scaled = []
for m in missing: 
    train_test.append(train_test_split(m, big_df["diversion"], test_size=0.3))
    mscaled = scaler.fit_transform(m)
    scaled.append(train_test_split(mscaled, big_df["diversion"], test_size = 0.3))
  

In [None]:
rf_accuracy = []
rf_auc = []
for trucks_train, trucks_test, diversion_train, diversion_test in train_test: 
    rf = RandomForestClassifier(n_estimators=3)
    rf.fit(trucks_train, diversion_train)

    rfy_pred = rf.predict(trucks_test)
    rf_accuracy.append(metrics.accuracy_score(diversion_test, rfy_pred))
    rf_auc.append(metrics.roc_auc_score(diversion_test, rfy_pred))
    print("Random forest accuracy:",metrics.accuracy_score(diversion_test, rfy_pred))
    print("Random forest auc:",metrics.roc_auc_score(diversion_test, rfy_pred))
    print("Random forest confusion matrix:", metrics.confusion_matrix(diversion_test, rfy_pred))
    print()
    
print(rf_accuracy)
print(rf_auc)

In [None]:
svc_accuracy = []
svc_auc = []
knn_accuracy = []
knn_auc = []
for s_trucks_train, s_trucks_test, diversion_train, diversion_test in scaled:
    svc = svm.SVC()
    svc.fit(s_trucks_train, diversion_train)
    svc_yhat = svc.predict(s_trucks_test)
    svc_accuracy.append(metrics.accuracy_score(diversion_test, svc_yhat))
    svc_auc.append(metrics.roc_auc_score(diversion_test, svc_yhat))
    print("SVC accuracy:", metrics.accuracy_score(diversion_test, svc_yhat))
    print("SVC auc:",metrics.roc_auc_score(diversion_test, svc_yhat))
    print("SVC confusion matrix:", metrics.confusion_matrix(diversion_test, svc_yhat))
    print()
    
    knn = neighbors.KNeighborsClassifier(3, weights = 'uniform')   
    model = knn.fit(s_trucks_train, diversion_train)
    print(model.score(s_trucks_train, diversion_train))

    knnhat = model.predict(s_trucks_test)
    knn_accuracy.append(metrics.accuracy_score(diversion_test, knnhat))
    knn_auc.append(metrics.roc_auc_score(diversion_test, knnhat))
    print("KNN test accuracy:", metrics.accuracy_score(diversion_test, knnhat))
    print("KNN auc:",metrics.roc_auc_score(diversion_test, knnhat))
    print("KNN confusion matrix:", metrics.confusion_matrix(diversion_test, knnhat))
    print()

In [None]:
print(knn_accuracy)
print(svc_accuracy)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize=(10,8))

#percent of data missing from the set on x axis, accuracy on y axis
plt.scatter([10, 20, 30, 40, 50, 60, 70], rf_accuracy, marker='o', color='b', alpha=0.7,
            s = 124, label='Random Forest Classifier')

plt.scatter([10, 20, 30, 40, 50, 60, 70], svc_accuracy, marker='o', color='r', alpha=0.7, 
            s = 124, label='Support Vector Classifier')

plt.scatter([10, 20, 30, 40, 50, 60, 70], knn_accuracy, marker='o', color='g', alpha=0.7, 
            s = 124, label='K-Nearest Neighbors Classifier')
plt.title("Accuracy of Classifiers for Schema B")
plt.xlabel("Percent of Data Excluded from Model")
plt.ylabel("Accuracy")
plt.xlim(0, 80)
plt.ylim(0.75, 1.05)
plt.legend(loc='lower left')
plt.show()

In [None]:
plt.figure(figsize=(10,8))

#percent of data missing from the set on x axis, accuracy on y axis
plt.scatter([10, 20, 30, 40, 50, 60, 70], rf_auc, marker='o', color='b', alpha=0.7,
            s = 124, label='Random Forest AUC')

plt.scatter([10, 20, 30, 40, 50, 60, 70], svc_auc, marker='o', color='r', alpha=0.7, 
            s = 124, label='SVC AUC')

plt.scatter([10, 20, 30, 40, 50, 60, 70], knn_auc, marker='o', color='g', alpha=0.7, 
            s = 124, label='KNN AUC')
plt.title("AUC of Classifiers for Schema B")
plt.xlabel("Percent of Data Excluded from Model")
plt.ylabel("AUC")
plt.xlim(0, 80)
plt.ylim(0.75, 1.05)
plt.legend(loc='lower left')
plt.show()