# Processing a dataset on commercial nuclear reactors

Obtained from https://catalog.data.gov/dataset/us-commercial-nuclear-power-reactors.

In [67]:
import csv
import os
import ast
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import IPython
%matplotlib inline

### 1. Importing the data

*Note that I manually dropped the first row of the spreadsheet, which was just a title for the whole document. This allows the column names given on row 2 to be used as the header.*

In [76]:
# Location of the reactor spreadsheet. Set the REACTORS_XLS_PATH environment
# variable to point at your own copy of the file; when it is not set, the
# original hard-coded location is used, so existing behaviour is unchanged.
# The file is expected to already have its title row removed (see note above),
# so that the first row holds the column names.
DATA_PATH = os.environ.get(
    "REACTORS_XLS_PATH", "/Users/bracho/Downloads/reactors-operating.xls"
)
df = pd.read_excel(DATA_PATH)
df.head()

Unnamed: 0,"Plant Name, Unit Number",NRC Reactor Unit Web Page,Docket Number,License Number,Location,NRC Region,Licensee,Parent Company Utility Name,Parent Company Website,Parent Company Notes,...,2010 Capacity Factor (Percent),2009 Capacity Factor (Percent),2008 Capacity Factor (Percent),Unnamed: 32,2005 Capacity Factor (Percent),2004 Capacity Factor (Percent),2003 Capacity Factor (Percent),Unnamed: 36,Years of Operation through 12/31/2019,Unnamed: 38
0,"Arkansas Nuclear One, Unit 1",ANO 1,5000313,DPR-51,"London, AR (6 MI WNW of Russellville, AR)",4,"Entergy Operations, Inc.","Entergy Nuclear Operations, Inc.",www.entergy-nuclear.com,,...,0.9,0.99,0.83,,0.78,0.92,0.92,,45,NaT
1,"Arkansas Nuclear One, Unit 2",ANO 2,5000368,NPF-6,"London, AR (6 MI WNW of Russellville, AR)",4,"Entergy Operations, Inc.","Entergy Nuclear Operations, Inc.",www.entergy-nuclear.com,,...,0.97,0.9,0.911003,,0.91,0.99,0.9,,41,2019-12-31
2,"Beaver Valley Power Station, Unit 1",Beaver Valley 1,5000334,DPR-66,"Shippingport, PA(17 MI W of McCandless, PA)",1,FirstEnergy Nuclear Operating Co.,FirstEnergy Nuclear Operating Company,www.firstenergycorp.com,,...,0.91,0.92,1.014076,,1.01,0.93,0.83,,43,NaT
3,"Beaver Valley Power Station, Unit 2",Beaver Valley 2,5000412,NPF-73,"Shippingport, PA (17 MI W of McCandless, PA)",1,FirstEnergy Nuclear Operating Co.,FirstEnergy Nuclear Operating Company,www.firstenergycorp.com,,...,0.84,0.87,1.03,,0.93,1.0,0.91,,32,NaT
4,"Braidwood Station, Unit 1",Braidwood 1,5000456,NPF-72,"Braceville, IL (20 MI SSW of Joliet, IL)",3,"Exelon Generation Co., LLC","Exelon Corporation, LLC",www.exeloncorp.com,,...,0.89,0.95,1.013985,,1.0,0.95,0.97,,32,NaT


### 2. Data processing

#### 2.1 Fixing header titles

Some of the column names contain newline characters, and the sheet also includes several unused columns. We therefore replace the newlines with spaces to make the names more computer-friendly, and, because the unused columns all have "Unnamed" in their title, we remove every such column along with its data.

In [77]:
# Build the cleaned labels: any newline embedded in a column name becomes a space.
headers = [name.replace("\n", " ") for name in df.columns]
# Map each current label to its cleaned counterpart and rename (returns a new frame).
df = df.rename(columns=dict(zip(df.columns, headers)))

# Columns whose label contains "Unnamed" are placeholders without a real header;
# gather them and drop them, data included.
del_col = list(filter(lambda name: "Unnamed" in name, df.columns))
df = df.drop(columns=del_col)
df.head()

Unnamed: 0,"Plant Name, Unit Number",NRC Reactor Unit Web Page,Docket Number,License Number,Location,NRC Region,Licensee,Parent Company Utility Name,Parent Company Website,Parent Company Notes,...,2013 Capacity Factor (Percent),2012 Capacity Factor (Percent),2011 Capacity Factor (Percent),2010 Capacity Factor (Percent),2009 Capacity Factor (Percent),2008 Capacity Factor (Percent),2005 Capacity Factor (Percent),2004 Capacity Factor (Percent),2003 Capacity Factor (Percent),Years of Operation through 12/31/2019
0,"Arkansas Nuclear One, Unit 1",ANO 1,5000313,DPR-51,"London, AR (6 MI WNW of Russellville, AR)",4,"Entergy Operations, Inc.","Entergy Nuclear Operations, Inc.",www.entergy-nuclear.com,,...,0.560015,1.02,0.87,0.9,0.99,0.83,0.78,0.92,0.92,45
1,"Arkansas Nuclear One, Unit 2",ANO 2,5000368,NPF-6,"London, AR (6 MI WNW of Russellville, AR)",4,"Entergy Operations, Inc.","Entergy Nuclear Operations, Inc.",www.entergy-nuclear.com,,...,0.909777,0.93,0.9,0.97,0.9,0.911003,0.91,0.99,0.9,41
2,"Beaver Valley Power Station, Unit 1",Beaver Valley 1,5000334,DPR-66,"Shippingport, PA(17 MI W of McCandless, PA)",1,FirstEnergy Nuclear Operating Co.,FirstEnergy Nuclear Operating Company,www.firstenergycorp.com,,...,0.860891,0.92,1.01,0.91,0.92,1.014076,1.01,0.93,0.83,43
3,"Beaver Valley Power Station, Unit 2",Beaver Valley 2,5000412,NPF-73,"Shippingport, PA (17 MI W of McCandless, PA)",1,FirstEnergy Nuclear Operating Co.,FirstEnergy Nuclear Operating Company,www.firstenergycorp.com,,...,0.966991,0.91,1.02,0.84,0.87,1.03,0.93,1.0,0.91,32
4,"Braidwood Station, Unit 1",Braidwood 1,5000456,NPF-72,"Braceville, IL (20 MI SSW of Joliet, IL)",3,"Exelon Generation Co., LLC","Exelon Corporation, LLC",www.exeloncorp.com,,...,0.95,0.91,1.01,0.89,0.95,1.013985,1.0,0.95,0.97,32


#### 2.2 Converting NaN/NaT values

#### 2.3 Converting date formats

To put the dates in the data into interval form, we represent each date as the number of days since Jan. 1, 1900. This allows each date to be stored as a plain number (an integer day count) and possibly fed into models as is.

In [98]:
date_cols = ["Construction Permit Issued", "Operating License Issued", "Commercial Operation", "Renewed Operating License Issued", "Operating License Expires"]

# Reference date: every date is stored as a whole number of days after this day.
DATE_ORIGIN = pd.Timestamp("1900-01-01")


def days_since_origin(value):
    """Return the number of whole days between ``DATE_ORIGIN`` and ``value``.

    Parameters
    ----------
    value : object
        Anything ``pd.Timestamp`` accepts (datetime, ``np.datetime64``,
        date string, ``NaT``/``NaN``, ...).

    Returns
    -------
    int or float
        The day count as an ``int``; ``np.nan`` when ``value`` is missing or
        cannot be interpreted as a date.
    """
    try:
        stamp = pd.Timestamp(value)
        # Missing values (NaT/NaN/None) have no day count; map them to NaN explicitly.
        if pd.isna(stamp):
            return np.nan
        return (stamp - DATE_ORIGIN).days
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no date"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate instead of being
        # silently swallowed.
        return np.nan


for dc in date_cols:
    # Guard against re-running this cell: once a column holds day counts it is
    # numeric, and pd.Timestamp would reinterpret those numbers as nanoseconds
    # since 1970-01-01, silently turning every value into the same day count.
    if pd.api.types.is_numeric_dtype(df[dc]):
        continue
    df[dc] = [days_since_origin(elem) for elem in df[dc]]