In [1]:
import pandas as pd
import numpy as np
import datetime
from itertools import cycle
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [34]:
clusterID = np.load('/content/drive/MyDrive/Colab Notebooks/clusterID_genotype.npy')
test_weather = np.load('/content/drive/MyDrive/Colab Notebooks/inputs_weather_test.npy')
test_traits = np.load('/content/drive/MyDrive/Colab Notebooks/inputs_others_test.npy')


## Organizing Weather Data

In [35]:
perf_records, days, variables = test_weather.shape

In [36]:
out_arr = np.column_stack((np.repeat(np.arange(perf_records),days),test_weather.reshape(perf_records*days,-1)))
out_df = pd.DataFrame(out_arr)

In [37]:
# Building a data frame based on input weather data, with columns named for the variables
out_df = pd.DataFrame(out_arr, columns=['Performance Record','ADNI', 'AP', 'ARH', 'MDNI', 'MaxSur', 'MinSur', 'AvgSur'])

In [38]:
out_df['Performance Record'] = pd.to_numeric(out_df['Performance Record']).astype(int)

In [39]:
## Counts all of the days in Performance Record
out_df['Day'] = out_df.groupby('Performance Record').cumcount() + 1

In [40]:
out_df['Julian_date'] = out_df.groupby('Performance Record').cumcount() + 91

In [41]:
def jdtodatestd (jdate):
    fmt = '%j'
    datestd = datetime.datetime.strptime(jdate, fmt).date()
    return(datestd)

In [42]:
subset = out_df[out_df['Performance Record'] == 0]

In [43]:
date_list = []
for i, row in subset.iterrows():
    julian = int(row['Julian_date'])
    date = datetime.datetime.strptime(f'{julian}', '%j').date()
    clean_date = date.strftime("%m-%d")
    date_list.append(clean_date)

In [44]:
date_cycle = cycle(date_list)
out_df['date'] = [next(date_cycle) for cycle in range(len(out_df))]

In [45]:
out_df

Unnamed: 0,Performance Record,ADNI,AP,ARH,MDNI,MaxSur,MinSur,AvgSur,Day,Julian_date,date
0,0,335.75,0.00,75.67,932.0,80.8,59.2,67.330,1,91,04-01
1,0,20.75,0.00,66.67,257.0,74.1,62.2,66.020,2,92,04-02
2,0,300.04,0.02,77.67,978.0,60.7,41.5,52.010,3,93,04-03
3,0,77.25,0.00,76.04,380.0,67.6,45.5,54.000,4,94,04-04
4,0,211.50,0.02,84.17,951.0,66.1,43.5,54.430,5,95,04-05
...,...,...,...,...,...,...,...,...,...,...,...
2212113,10336,35.67,0.00,75.46,403.0,43.2,35.1,38.950,210,300,10-27
2212114,10336,182.67,0.00,73.58,653.0,37.4,28.8,33.380,211,301,10-28
2212115,10336,33.04,0.00,65.46,344.0,37.2,28.0,31.030,212,302,10-29
2212116,10336,0.00,0.00,83.83,0.0,37.6,33.0,34.165,213,303,10-30


## Other Trait Data

In [55]:
# Building a data frame based on input data, with columns named for the variables
test_trait_df = pd.DataFrame(test_traits, columns=['Maturity Group', 'Genotype ID', 'State', 'Year', 'Location'])

In [56]:
test_trait_df['Year'] = pd.to_numeric(test_trait_df['Year'])
test_trait_df['Genotype ID'] = pd.to_numeric(test_trait_df['Genotype ID'])
test_trait_df['Year'] = test_trait_df['Year'].astype(int)
test_trait_df['Genotype ID'] = test_trait_df['Genotype ID'].astype(int)

# Adding Cluster ID to trait data

In [17]:
print(len(clusterID))

5839


In [58]:
cluster_dict = {}
for i in range(1, 5839):
  array_index = i-1
  cID = clusterID[i-1]
  cluster_dict.update({i:cID})

In [59]:
cluster_list = []
for i in test_trait_df.index:
  genotype = test_trait_df.iloc[i]['Genotype ID']
  cluster = cluster_dict.get(genotype)
  cluster_list.append(cluster)
test_trait_df['Cluster'] = cluster_list

# Condensed trait data frame

In [60]:
test_trait_df['Maturity Group'] = pd.to_numeric(test_trait_df['Maturity Group'])
test_trait_df['Maturity Group'] = test_trait_df['Maturity Group'].astype(int)


In [61]:
del test_trait_df['Location']
del test_trait_df['Genotype ID']

In [63]:
test_trait_df = test_trait_df.reset_index().rename(columns={'index':'Performance Record'})

# Merging Datasets

In [65]:
daily_df = out_df
daily_df = daily_df.drop(columns=['Day', 'Julian_date',])
expanded_df = pd.concat([test_trait_df]*214, ignore_index=True)
expanded_df = expanded_df.sort_values(by = ['Performance Record'])
expanded_df

Unnamed: 0,Performance Record,Maturity Group,State,Year,Cluster
0,0,3,"""IA""",2010,18
733927,0,3,"""IA""",2010,18
1591898,0,3,"""IA""",2010,18
2088074,0,3,"""IA""",2010,18
103370,0,3,"""IA""",2010,18
...,...,...,...,...,...
1426505,10336,0,"""ND""",2013,19
1416168,10336,0,"""ND""",2013,19
1405831,10336,0,"""ND""",2013,19
1385157,10336,0,"""ND""",2013,19


In [66]:
expanded_df  = expanded_df.reset_index()
expanded_df

Unnamed: 0,index,Performance Record,Maturity Group,State,Year,Cluster
0,0,0,3,"""IA""",2010,18
1,733927,0,3,"""IA""",2010,18
2,1591898,0,3,"""IA""",2010,18
3,2088074,0,3,"""IA""",2010,18
4,103370,0,3,"""IA""",2010,18
...,...,...,...,...,...,...
2212113,1426505,10336,0,"""ND""",2013,19
2212114,1416168,10336,0,"""ND""",2013,19
2212115,1405831,10336,0,"""ND""",2013,19
2212116,1385157,10336,0,"""ND""",2013,19


In [67]:
daily_df

Unnamed: 0,Performance Record,ADNI,AP,ARH,MDNI,MaxSur,MinSur,AvgSur,date
0,0,335.75,0.00,75.67,932.0,80.8,59.2,67.330,04-01
1,0,20.75,0.00,66.67,257.0,74.1,62.2,66.020,04-02
2,0,300.04,0.02,77.67,978.0,60.7,41.5,52.010,04-03
3,0,77.25,0.00,76.04,380.0,67.6,45.5,54.000,04-04
4,0,211.50,0.02,84.17,951.0,66.1,43.5,54.430,04-05
...,...,...,...,...,...,...,...,...,...
2212113,10336,35.67,0.00,75.46,403.0,43.2,35.1,38.950,10-27
2212114,10336,182.67,0.00,73.58,653.0,37.4,28.8,33.380,10-28
2212115,10336,33.04,0.00,65.46,344.0,37.2,28.0,31.030,10-29
2212116,10336,0.00,0.00,83.83,0.0,37.6,33.0,34.165,10-30


In [69]:
daily_df['Maturity Group'] = expanded_df['Maturity Group']
daily_df['State'] = expanded_df['State']
daily_df['Year'] = expanded_df['Year']
daily_df['Cluster'] = expanded_df['Cluster']
final_df = daily_df

In [70]:
final_df

Unnamed: 0,Performance Record,ADNI,AP,ARH,MDNI,MaxSur,MinSur,AvgSur,date,Maturity Group,State,Year,Cluster
0,0,335.75,0.00,75.67,932.0,80.8,59.2,67.330,04-01,3,"""IA""",2010,18
1,0,20.75,0.00,66.67,257.0,74.1,62.2,66.020,04-02,3,"""IA""",2010,18
2,0,300.04,0.02,77.67,978.0,60.7,41.5,52.010,04-03,3,"""IA""",2010,18
3,0,77.25,0.00,76.04,380.0,67.6,45.5,54.000,04-04,3,"""IA""",2010,18
4,0,211.50,0.02,84.17,951.0,66.1,43.5,54.430,04-05,3,"""IA""",2010,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2212113,10336,35.67,0.00,75.46,403.0,43.2,35.1,38.950,10-27,0,"""ND""",2013,19
2212114,10336,182.67,0.00,73.58,653.0,37.4,28.8,33.380,10-28,0,"""ND""",2013,19
2212115,10336,33.04,0.00,65.46,344.0,37.2,28.0,31.030,10-29,0,"""ND""",2013,19
2212116,10336,0.00,0.00,83.83,0.0,37.6,33.0,34.165,10-30,0,"""ND""",2013,19


In [71]:
final_df.to_csv('test_out.csv')