# Objectives

## 1. Critical Month 
- For each of the 10 variables, compute monthly correlations (April–Oct) with yield.
- Rank months by correlation strength.
- Identify 3 most critical months where the variables show the strongest influence on yield.


## 2. Window Analysis
- Apply a 60-day rolling window across the growing season.
- For each window, compute correlations between yield and temperature and precipitation.
- Identify the single window (e.g., June–July) that best predicts yield using temperature and precipitation (use linear regression for prediction).


## 3. Yield–Weather Sensitivity 
- Compare critical windows across counties.
- Highlight whether yield is more sensitive to early-season stress (planting) or mid-season stress (growing).


# Critical Month

- For each of the 10 variables, compute monthly correlations (April–Oct) with yield.
- Rank months by correlation strength.
- Identify 3 most critical months where the variables show the strongest influence on yield.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

data_filepath = '../../Data/'
os.chdir(data_filepath)

In [2]:
avg_features = pd.read_csv("all_feature_data_avg.csv", index_col=0)

yield_pd = avg_features[["yield", "year", "year_index"]].copy()

yield_pd

Unnamed: 0,yield,year,year_index
0,27.0,1980,0
1,28.0,1980,1
2,29.0,1980,2
3,28.0,1980,3
4,24.0,1980,4
...,...,...,...
45333,182.0,2018,874
45334,156.8,2018,875
45335,163.4,2018,876
45336,160.4,2018,877


In [24]:
avg_features

Unnamed: 0,year_index,index,id2,year,yield,tmmx,rmax,vs,sph,srad,vpd,rmin,pr,tmmn,th
0,0,1,112,1980,27.0,302.547299,88.534865,4.169133,0.010406,244.856055,1.477316,34.108888,2.150448,287.698991,182.028998
1,1,2,113,1980,28.0,302.068878,85.865079,3.564974,0.010097,243.893545,1.461499,34.400296,2.342725,287.706825,192.143968
2,2,3,119,1980,29.0,303.100471,91.763880,3.560391,0.011191,251.878185,1.429625,36.004177,2.067676,288.491839,178.083970
3,3,4,120,1980,28.0,302.966961,88.569844,3.964970,0.010669,250.073255,1.484913,35.960727,2.714061,288.009203,192.123030
4,4,5,124,1980,24.0,303.312180,89.555033,3.494477,0.011313,254.001956,1.481304,36.824029,2.435780,289.358179,176.449627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45333,874,54751,3121,2018,182.0,294.953877,95.237614,4.204021,0.009771,220.000207,0.575663,54.411823,4.651962,283.506739,174.472102
45334,875,54752,3122,2018,156.8,294.790673,93.421281,4.147997,0.009020,217.519544,0.671401,50.163649,4.160591,282.975300,182.414943
45335,876,54753,3123,2018,163.4,294.716057,92.840480,4.182206,0.009125,215.686358,0.661735,51.121667,5.036913,283.231696,181.480117
45336,877,54754,3124,2018,160.4,294.695984,90.952795,4.101506,0.009123,218.553170,0.675299,51.020241,4.583146,283.858971,179.765945


In [23]:
data_files = [file for file in os.listdir("ds-capstone-dataset") if file[-4:] != '.csv']

tmmx = np.load(f"ds-capstone-dataset/{data_files[-3]}", allow_pickle=True)
tmmx_pd = pd.DataFrame(tmmx)

tmmx_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,0.000000,0.000000,0.483582,0.092537,1.403731,3.382090,4.229104,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,0.000000,1.035556,9.746667,2.501111,0.000000,2.052222,6.281111,0.000000,0.0,0.0,...,0.000000,0.064444,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,8.483168,2.615842,19.228713,2.899010,0.000000,0.294059,2.584158,0.810891,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,8.989091,6.816364,23.594545,3.505455,0.000000,0.501818,9.394545,1.036364,0.0,0.0,...,0.049091,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,11.761765,5.440196,32.536275,2.898039,0.000000,0.463725,2.217647,1.386275,0.0,0.0,...,0.058824,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45333,4.618182,0.913131,11.783838,24.976768,13.872727,0.826263,0.000000,15.885859,0.0,0.0,...,1.416162,0.296970,12.486869,0.0,0.0,0.0,4.595960,18.693939,12.342424,2.417172
45334,4.434483,0.158621,48.235345,20.407759,14.059483,0.000000,0.000000,0.000000,0.0,0.0,...,1.547414,0.000000,0.000000,0.0,0.0,0.0,6.287069,31.191379,7.149138,1.706897
45335,4.172807,1.557018,46.931579,23.215789,15.981579,0.000000,0.000000,1.257018,0.0,0.0,...,2.155263,0.000000,0.000000,0.0,0.0,0.0,11.061404,28.211404,9.033333,1.311404
45336,3.863636,2.936364,39.587879,30.410101,19.509091,0.036364,0.000000,0.411111,0.0,0.0,...,1.267677,0.000000,0.000000,0.0,0.0,0.0,5.552525,13.486869,10.072727,1.628283


In [10]:
# Testing how to get the correlation values with one feature variable

###########################################
'''
Load in file
'''

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

data_files = [file for file in os.listdir("ds-capstone-dataset") if file[-4:] != '.csv']

tmmx = np.load(f"ds-capstone-dataset/{data_files[0]}", allow_pickle=True)

tmmx_pd = pd.DataFrame(tmmx)


###########################################

# create empty df to store the monthly averages
month_avg = pd.DataFrame(index=tmmx_pd.index)

MONTHS = 30

# get monthly averages and store in above df
for i in range(7):
    month_avg[f"month_{i}"] = tmmx_pd.iloc[:, MONTHS*i:MONTHS*(i+1)].mean(axis=1)

# add in the yield data
month_avg['yield'] = yield_pd['yield']
month_avg['year'] = yield_pd['year']

correlation = month_avg.drop(['yield', 'year'], axis=1).corrwith(month_avg['yield'])

month_avg = pd.DataFrame(scaler.fit_transform(month_avg), columns=month_avg.columns)

per_year = month_avg.groupby('year').apply(
    lambda g: g.corr(numeric_only=True)['yield'].drop('yield'), 
    include_groups=False
)
avg_corr = per_year.mean()  # mean correlation across years

#avg_corr



correlation

# calculate correlation
#month_avg.corr()['yield']

month_0   -0.001686
month_1   -0.006828
month_2   -0.029707
month_3   -0.060851
month_4   -0.049009
month_5    0.013046
month_6    0.003713
dtype: float64

In [9]:
avg_corr

yield
month_0   -0.017707
month_1   -0.011220
month_2   -0.019422
month_3   -0.050436
month_4   -0.041896
month_5   -0.016372
month_6   -0.019058
dtype: float64

In [21]:
month_avg

Unnamed: 0,month_0,month_1,month_2,month_3,month_4,month_5,month_6,yield
0,161.055970,177.550498,183.876368,184.752488,181.681095,168.552985,216.733582,27.0
1,190.866667,190.398889,186.020000,184.627778,175.312222,178.228889,239.553333,28.0
2,161.719472,183.502970,178.482508,178.655116,176.542904,147.563366,220.121452,29.0
3,175.986061,189.949091,190.900000,191.711515,188.944848,181.100000,226.269697,28.0
4,161.505556,184.398366,183.523529,177.705882,175.454575,136.226797,216.332680,24.0
...,...,...,...,...,...,...,...,...
45333,194.834343,153.090236,148.155556,200.341414,170.304714,149.070034,205.508418,182.0
45334,204.765517,164.434483,159.061782,206.979885,172.460057,159.325000,209.877874,156.8
45335,205.052047,161.303801,157.759064,210.105848,176.508772,152.753216,206.878070,163.4
45336,205.292929,163.810101,156.924242,198.673737,167.447475,157.882828,208.330303,160.4


In [17]:
'''
Flow: 
go through all of the feature variable dfs. have a global list that stores all of the correlation results. 
calculate the correlations as in the above cell. end up with a Series array
add the result to the global list
create a df out of the global list after finished with the for loop
'''

MONTHS = 30

avg_corr_list = []
corr = []
corr_labels = []

for data_file in data_files:
    f_var = np.load(f"ds-capstone-dataset/{data_file}", allow_pickle=True)
    feature = data_file[2:-19]
    f_var_df = pd.DataFrame(f_var)
    month_avg = pd.DataFrame(index=f_var_df.index)
    for i in range(7):
        month_avg[f"month_{i}"] = f_var_df.iloc[:, MONTHS*i:MONTHS*(i+1)].mean(axis=1)
    month_avg['yield'] = yield_pd['yield']
    month_avg['year'] = yield_pd['year']
    
    correlation = month_avg.drop(['yield', 'year'], axis=1).corrwith(month_avg['yield'])
    corr.append(correlation)
    
    scaler = StandardScaler()

    month_avg = pd.DataFrame(scaler.fit_transform(month_avg), columns=month_avg.columns)

    per_year = month_avg.groupby('year').apply(
        lambda g: g.corr(numeric_only=True)['yield'].drop('yield'),
        include_groups=False
    )
    avg_corr = per_year.mean()  # mean correlation across years

    
    avg_corr_list.append(avg_corr)
    corr_labels.append(feature)

avg_corr_pd = pd.DataFrame(avg_corr_list)
avg_corr_pd.index = corr_labels

corr_pd = pd.DataFrame(corr)
corr_pd.index = corr_labels

corr_pd

Unnamed: 0,month_0,month_1,month_2,month_3,month_4,month_5,month_6
tmmx,-0.001686,-0.006828,-0.029707,-0.060851,-0.049009,0.013046,0.003713
rmax,-0.001272,-0.060293,-0.034648,-0.046901,-0.057406,-0.057485,-0.01912
vs,0.005511,0.005128,-0.000983,-0.001293,-0.00028,-0.000515,0.001591
sph,-0.007118,-0.007125,-0.007115,-0.007129,-0.007128,-0.007113,-0.007121
srad,-0.017719,0.016292,0.02105,0.117362,0.024153,0.045722,-0.009799
vpd,-0.006984,-0.005941,-0.011048,-0.01515,-0.012617,-0.006202,-0.006685
rmin,0.020569,-0.011258,0.077953,0.116357,0.094789,0.01574,-0.007784
pr,0.007113,-0.002357,0.030936,0.016501,0.012553,0.002565,-0.008558
tmmn,0.018965,0.015703,0.016721,-0.021422,-0.012169,0.028406,0.018853
th,-0.15441,-0.108458,-0.031651,-0.060546,-0.013214,-0.23379,-0.063056


In [18]:
avg_corr_pd

yield,month_0,month_1,month_2,month_3,month_4,month_5,month_6
tmmx,-0.017707,-0.01122,-0.019422,-0.050436,-0.041896,-0.016372,-0.019058
rmax,-0.000892,-0.057254,-0.040346,0.009782,0.01326,-0.022303,-0.011624
vs,0.012046,0.012835,-0.002078,0.001112,0.001635,0.005149,0.007411
sph,-0.01555,-0.020062,-0.009985,-0.001831,-0.003584,-0.013734,-0.019131
srad,-0.063547,-0.094053,-0.022353,-0.030989,-0.078045,-0.013272,-0.048278
vpd,-0.014259,0.007176,-0.023316,-0.039222,-0.036544,-0.014284,-0.014056
rmin,-0.032816,-0.077193,0.002675,0.118134,0.101447,-0.012234,-0.035896
pr,-0.01289,-0.019381,0.016808,0.02335,0.020454,-0.004599,-0.015315
tmmn,-0.005702,-0.002426,0.002549,-0.014692,-0.013704,-0.014978,-0.014438
th,-0.088859,-0.0644,-0.067599,-0.05969,-0.00904,-0.002576,-0.011919


In [6]:


MONTHS = 30

corr = []
corr_labels = []

for data_file in data_files:
    f_var = np.load(f"ds-capstone-dataset/{data_file}", allow_pickle=True)
    feature = data_file[2:-19]
    f_var_df = pd.DataFrame(f_var)
    month_avg = pd.DataFrame(index=f_var_df.index)
    for i in range(7):
        month_avg[f"month_{i}"] = f_var_df.iloc[:, MONTHS*i:MONTHS*(i+1)].mean(axis=1)
    month_avg['yield'] = yield_pd['yield']
    month_avg['year'] = yield_pd['year']

    #scaler = StandardScaler()

    #month_avg = pd.DataFrame(scaler.fit_transform(month_avg), columns=month_avg.columns)

    per_year = month_avg.groupby('year').apply(
        lambda g: g.corr(numeric_only=True)['yield'].drop('yield'),
        include_groups=False
    )
    avg_corr = per_year.mean()  # mean correlation across years

    
    corr.append(avg_corr)
    corr_labels.append(feature)

corr_pd = pd.DataFrame(corr)
corr_pd.index = corr_labels

corr_pd

yield,month_0,month_1,month_2,month_3,month_4,month_5,month_6
tmmx,-0.017707,-0.01122,-0.019422,-0.050436,-0.041896,-0.016372,-0.019058
rmax,-0.000892,-0.057254,-0.040346,0.009782,0.01326,-0.022303,-0.011624
vs,0.012046,0.012835,-0.002078,0.001112,0.001635,0.005149,0.007411
sph,-0.01555,-0.020062,-0.009985,-0.001831,-0.003584,-0.013734,-0.019131
srad,-0.063547,-0.094053,-0.022353,-0.030989,-0.078045,-0.013272,-0.048278
vpd,-0.014259,0.007176,-0.023316,-0.039222,-0.036544,-0.014284,-0.014056
rmin,-0.032816,-0.077193,0.002675,0.118134,0.101447,-0.012234,-0.035896
pr,-0.01289,-0.019381,0.016808,0.02335,0.020454,-0.004599,-0.015315
tmmn,-0.005702,-0.002426,0.002549,-0.014692,-0.013704,-0.014978,-0.014438
th,-0.088859,-0.0644,-0.067599,-0.05969,-0.00904,-0.002576,-0.011919


# Window Analysis
- Apply a 60-day rolling window across the growing season.
- For each window, compute correlations between yield and temperature and precipitation.
- Identify the single window (e.g., June–July) that best predicts yield using temperature and precipitation (use linear regression for prediction).

# Yield–Weather Sensitivity 
- Compare critical windows across counties.
- Highlight whether yield is more sensitive to early-season stress (planting) or mid-season stress (growing).

# Getting Growing Season Averages

- Already ran and uploaded files to Github. Does not need to be repeated

In [18]:
import pandas as pd
import numpy as np
import os

data_filepath = '../../Data/ds-capstone-dataset'
os.chdir(data_filepath)

data_files = [file for file in os.listdir(data_filepath) if file[-4:] != '.csv']

labels = pd.read_csv(f"label_1980_2018_rawyield.csv")

# Rename first column
labels = labels.rename(columns={"Unnamed: 0":"year_index"})

thirds = 70

print(data_files)

['X_tmmx_1980_2018_rawyield', 'X_rmax_1980_2018_rawyield', 'X_vs_1980_2018_rawyield', 'X_sph_1980_2018_rawyield', 'X_srad_1980_2018_rawyield', 'X_vpd_1980_2018_rawyield', 'X_rmin_1980_2018_rawyield', 'X_pr_1980_2018_rawyield', 'X_tmmn_1980_2018_rawyield', 'X_th_1980_2018_rawyield']


In [33]:
for file in data_files:
    feature = file[2:-19]
    data = np.load(f'{data_filepath}/{file}', allow_pickle=True)
    data_df = pd.DataFrame(data)
    cols1 = data_df.iloc[:, :thirds]
    cols2 = data_df.iloc[:, thirds:2*thirds]
    cols3 = data_df.iloc[:, 2*thirds:] 
    feature_growing_phases = pd.DataFrame({
        'germination': cols1.mean(axis=1),
        'growth': cols2.mean(axis=1),
        'maturity': cols3.mean(axis=1)
    })
    feature_growing_phases.to_csv(f"../{feature}_avg_growing_phase.csv")

