In [2]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import pickle 
from sklearn import preprocessing
import seaborn as sns

## NOAA dataset importing and cleaning 


In [3]:
# Importing data, removing unecessary columns 
raw_data = pd.read_csv("./Data import/dataset.csv")
raw_data = raw_data.drop(['Unnamed: 0', "year", "month", "day",'original_central_meridian_dist', 'original_latitude','integrated_flux', "event_starttime", "event_peaktime", "event_endtime", 'noaa_ar'], axis = 1)
raw_data.matchtime = pd.to_datetime(raw_data.matchtime, format = "%Y%m%d")


In [None]:
raw_data 

In [4]:
## Add Mcintosh evolution 

# Write query function
def query_mcintosh(noaa_ar, date, nhours): 
    try:
        query_date = str(date - pd.DateOffset(hours= nhours))[0:10]
        key = raw_data.query(f"matchtime == '{query_date}' & noaa_ar_no == {noaa_ar}")["mcintosh"].iloc[0]
    except:
        key = "None"
    return key

## Query all the datapoints to find all previous Mcintosh classifications 
mcintosh_evolution = []
for i in range(raw_data.shape[0]): 
    if i%1000 == 0: 
        print(f"finished querying row {i}")
    query_row = raw_data.iloc[i]
    if query_row[["goes_class"]].isna()[0] == True: 
        mcintosh_evolution.append("None")
    else: 
        noaa_ar = query_row.noaa_ar_no
        date = query_row.matchtime
        previous_mcintosh = query_mcintosh(noaa_ar, date, 24)
        mcintosh_evolution.append(previous_mcintosh)

mcintosh_evolution = pd.Series(mcintosh_evolution)        
# Create new column 
raw_data["mcintosh_evolution"] = mcintosh_evolution + "-" + raw_data["mcintosh"]


finished querying row 0
finished querying row 1000
finished querying row 2000
finished querying row 3000
finished querying row 4000
finished querying row 5000
finished querying row 6000
finished querying row 7000
finished querying row 8000
finished querying row 9000
finished querying row 10000
finished querying row 11000
finished querying row 12000
finished querying row 13000
finished querying row 14000
finished querying row 15000
finished querying row 16000
finished querying row 17000
finished querying row 18000
finished querying row 19000
finished querying row 20000
finished querying row 21000
finished querying row 22000


In [5]:
# Get rid of missing data
clean_data = raw_data.dropna(axis = 0, subset=['goes_class']).reset_index()
clean_data = clean_data.query("goes_class_ind != 'A'")
print(np.unique(clean_data.goes_class_ind, return_counts = True))
print(clean_data.shape)
print(np.sum(clean_data.isna()))
clean_data = clean_data.dropna(axis = 0, subset=['corr_whole_spot_area']).reset_index()
print()
print("not a substantial difference in dataset after removing missing data:")
print(np.unique(clean_data.goes_class_ind, return_counts = True))
print(clean_data.shape)
np.sum(clean_data.isna())


(array(['B', 'C', 'M', 'X'], dtype=object), array([3806, 6068,  634,   42], dtype=int64))
(10550, 14)
index                      0
noaa_ar_no                 0
central_meridian_dist      0
latitude                   0
carrington_longitude       2
corr_whole_spot_area     266
mcintosh                   0
LL                       266
number_of_spots          266
greenwich                  0
matchtime                  0
goes_class_ind             0
goes_class                 0
mcintosh_evolution         0
dtype: int64

not a substantial difference in dataset after removing missing data:
(array(['B', 'C', 'M', 'X'], dtype=object), array([3663, 5952,  627,   42], dtype=int64))
(10284, 15)


level_0                  0
index                    0
noaa_ar_no               0
central_meridian_dist    0
latitude                 0
carrington_longitude     0
corr_whole_spot_area     0
mcintosh                 0
LL                       0
number_of_spots          0
greenwich                0
matchtime                0
goes_class_ind           0
goes_class               0
mcintosh_evolution       0
dtype: int64

## NOAA dataset data exploration 
visualisation of the distributions of dependant and independant features in the dataset

In [None]:
# Graphing Mcintosh classifications 
clean_data['mcintosh'].value_counts().head(30).plot(kind='barh', figsize=(20,10), title = "Counts of the 30 most frequent Mcintosh classifications within the dataset")
# Graphing Mcintosh evolutions
clean_data['mcintosh_evolution'].value_counts().head(30).plot(kind='barh', color = "lightblue", figsize=(20,10), title = "Counts of the 30 most frequent Mcintosh evolutions within the dataset")
# Other independant features 
for column in clean_data.columns: 
    sns.displot(data=clean_data, x= column, color = "red")
    
# Solar flare frequency 
clean_data['goes_class_ind'].value_counts().plot(kind='bar', figsize=(10,10), title = "Solar flare frequency within dataset", color = "g")

## NOAA dataset exporting

In [9]:
## Create Mcintosh evolution dataset 
data_evolution = clean_data.drop(["mcintosh"], axis = 1)
data_evolution = pd.concat([data_evolution, pd.get_dummies(data_evolution["greenwich"]), pd.get_dummies(data_evolution["mcintosh_evolution"])], axis = 1)
data_evolution = data_evolution.drop(["mcintosh_evolution", "greenwich"], axis = 1)
# Get rid of hot encodes that are less than 1 in value (basically non existent)
data_evolution.drop([col for col, val in data_evolution.sum().iteritems() if type(val) != str and val < 3], axis=1, inplace=True)

## Create normal dataset
data_1 = clean_data.drop(["mcintosh_evolution"], axis = 1)
data_1 = pd.concat([data_1, pd.get_dummies(data_1["greenwich"]), pd.get_dummies(data_1["mcintosh"])], axis = 1)
data_1 = data_1.drop(["mcintosh", "greenwich"], axis = 1)

In [10]:
## Create C excluded and B excluded flare datasets 
data_c = data_1[(data_1.goes_class_ind != "B") ].copy()
data_b = data_1[data_1.goes_class_ind != "C"].copy()

# Clean one more time before exporting 
for data in [data_1, data_c, data_b, data_evolution]: 
    data.goes_class_ind[data.goes_class_ind == "B"] = "B/C"
    data.goes_class_ind[data.goes_class_ind == "C"] = "B/C"
    data.set_index(data['matchtime'], inplace = True)
    data.drop(['level_0', 'index', 'noaa_ar_no', 'matchtime','goes_class'], axis = 1, inplace = True)

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
data_1.columns

Index(['central_meridian_dist', 'latitude', 'carrington_longitude',
       'corr_whole_spot_area', 'LL', 'number_of_spots', 'goes_class_ind',
       'Alpha', 'Beta', 'Beta-Delta', 'Beta-Gamma', 'Beta-Gamma-Delta', 'Axx',
       'Bxi', 'Bxo', 'Cai', 'Cao', 'Chi', 'Cho', 'Cki', 'Cko', 'Cri', 'Cro',
       'Csi', 'Cso', 'Dac', 'Dai', 'Dao', 'Dhc', 'Dhi', 'Dho', 'Dkc', 'Dki',
       'Dko', 'Dri', 'Dro', 'Dsc', 'Dsi', 'Dso', 'Eac', 'Eai', 'Eao', 'Ehc',
       'Ehi', 'Eho', 'Ekc', 'Eki', 'Eko', 'Eri', 'Ero', 'Esc', 'Esi', 'Eso',
       'Fac', 'Fai', 'Fao', 'Fhc', 'Fhi', 'Fho', 'Fkc', 'Fki', 'Fko', 'Fsc',
       'Fsi', 'Fso', 'Hax', 'Hhx', 'Hkx', 'Hrx', 'Hsx'],
      dtype='object')

In [23]:
## Export data 
for data, name in zip([data_1, data_c, data_b, data_evolution], ["data_all","data_only_c", "data_only_b", "data_evolution"]):
    # Split training and testing into before 2015 and after 2015
    training = data[:"2015-01-01"]
    testing = data["2015-01-01":]
    X_train = np.array(training.drop(["goes_class_ind"], axis = 1))
    X_test = np.array(testing.drop(["goes_class_ind"], axis = 1))
    # Make sure all numerical values are from -1 to 1 
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_train = max_abs_scaler.fit_transform(X_train)
    X_test = max_abs_scaler.fit_transform(X_test)
    # Get feature index and y 
    feature_index = data.drop(["goes_class_ind"], axis = 1).columns 
    y_train = np.array(training.goes_class_ind)
    y_test = np.array(testing.goes_class_ind)


    # Exporting data out 
    export = X_train, X_test, y_train, y_test, feature_index 
    pickle_out = open("clean_data/" + name + ".pickle", "wb")
    pickle.dump(export, pickle_out)
    pickle_out.close()

## SHARP data 

In [None]:
#raw_data = pd.read_csv("./data_import/SHARP_data.csv")
# for 48 hours:
raw_data = pd.read_csv("./data_import/SHARP_data_48.csv")
raw_data = raw_data.drop(['goes_sat', 'goes_channel',"matchtime" ,"index",'integrated_flux', "event_starttime", "event_peaktime", "event_endtime", 'noaa_ar', 'ID', 'Number','AR issue_date'], axis = 1)
raw_data.date = pd.to_datetime(raw_data.date, format = "%Y%m%d")


In [None]:
goes_hierachy_index = {np.unique(raw_data.goes_class )[i]:i for i in range(len(np.unique(raw_data.goes_class )))}
goes_hierachy = []
for i in range(len(raw_data.goes_class)): 
    goes_hierachy.append(goes_hierachy_index[raw_data.goes_class[i]])
raw_data.goes_class = pd.Series(goes_hierachy)

In [None]:
SHARP_data = raw_data.dropna(axis = 0, subset=['MEANJZH']).reset_index()
np.sum(SHARP_data.isna())

In [None]:
SHARP_data = raw_data.dropna(axis = 0, subset=['MEANJZH']).reset_index()
values = {"MEANSHR": np.mean(raw_data["MEANSHR"]), "MEANGAM": np.mean(raw_data["MEANGAM"]), "ERRGAM":  np.mean(raw_data["ERRGAM"])}
SHARP_data.goes_class_ind[SHARP_data.goes_class > 224] = 1
SHARP_data.goes_class_ind[SHARP_data.goes_class <= 224] = 0
SHARP_data = SHARP_data.fillna(values)
SHARP_data = SHARP_data.dropna(axis = 1)
SHARP_data = SHARP_data.replace([np.inf, -np.inf], np.nan)
SHARP_data = SHARP_data.drop(["index", "goes_class"], axis = 1)
SHARP_data = SHARP_data.dropna(axis = 0)


In [None]:
SHARP_data

In [None]:
## Exporting data 
X = np.array(SHARP_data.drop(["date","goes_class_ind"], axis = 1))
feature_index = SHARP_data.drop(["date","goes_class_ind"], axis = 1).columns
y_export = np.array(SHARP_data.goes_class_ind)
scaler = preprocessing.StandardScaler()
X_export = scaler.fit_transform(X)

# Exporting data out 
#name =  "SHARP_NO"
name = "SHARP_48"
for title, data in zip(["X.pickle", "y.pickle", "index.pickle"],[X_export , y_export, feature_index]):
    pickle_out = open("clean_data/" + name + "_" + title, "wb")
    pickle.dump(data, pickle_out)
    pickle_out.close()
    

In [None]:
print(X.shape)
print(len(feature_index))

In [None]:
SHARP_data.drop(["date","goes_class_ind"], axis = 1)

In [None]:
np.sum(np.isinf(np.array(raw_data)))

In [None]:
np.unique(SHARP_data.goes_class_ind, return_counts = True)

In [None]:
np.unique(SHARP_data.goes_class_ind, return_counts = True)