In [195]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [174]:
# Load the five input CSV files into DataFrames, in the same order as before.
# File names are reproduced verbatim (note the 'Mositure' spelling).
# NOTE(review): presumably that spelling matches the file on disk -- confirm
# before "fixing" it.
(
    ndvi_df,
    soil_df,
    temperature_df,
    production_df,
    precipitation_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Eight Day NDVI.csv',
        'Daily Soil Mositure.csv',
        'Daily Temperature.csv',
        'Production Quantity.csv',
        'Daily Precipitation.csv',
    )
)

In [175]:
# Add `year` and `month` columns to every input frame, derived from its
# `start_date` column. These become the join/grouping keys further down.
#
# The timestamp column is parsed once per frame and reused for both derived
# columns (the previous version parsed it twice per frame and repeated the
# same two lines for each of the five frames).
for _frame in (ndvi_df, soil_df, precipitation_df, temperature_df, production_df):
    _start = pd.to_datetime(_frame['start_date'])
    _frame['year'] = _start.dt.year
    _frame['month'] = _start.dt.month

In [176]:
def _monthly_mean(frame, value_col, out_name):
    """Average `value_col` per (region_id, year, month).

    Returns a flat DataFrame with the three key columns plus one column
    named `out_name` holding the group mean.
    """
    per_group = frame.groupby(['region_id', 'year', 'month'])[value_col]
    return per_group.mean().reset_index(name=out_name)


# One monthly-mean table per input variable; the value column is renamed to
# the feature name used by the rest of the notebook.
ndvi_avg = _monthly_mean(ndvi_df, 'ndvi', 'ndvi')
soil_avg = _monthly_mean(soil_df, 'smos', 'moisture')
temp_avg = _monthly_mean(temperature_df, 'temp', 'temperature')
precip_avg = _monthly_mean(precipitation_df, 'precip', 'precipitation')

In [177]:
# Combine the four monthly-average tables into one feature table, one merge at
# a time, matching rows on (region_id, year, month). No `how=` is passed, so
# pandas' default join type applies.
_merge_keys = ["region_id", "year", "month"]
temp_soil = temp_avg.merge(soil_avg, on=_merge_keys)
temp_soil_precip = temp_soil.merge(precip_avg, on=_merge_keys)
temp_soil_precip_ndvi_df = temp_soil_precip.merge(ndvi_avg, on=_merge_keys)

In [178]:
# Min-max scale the feature columns to the [0, 1] range, column by column.
#
# Only the feature columns are scaled: region_id / year / month are
# identifiers, not measurements, so they are carried over unchanged instead of
# being scaled and then patched back afterwards.
_key_cols = ['region_id', 'year', 'month']
_feature_cols = [
    col for col in temp_soil_precip_ndvi_df.columns if col not in _key_cols
]
_features = temp_soil_precip_ndvi_df[_feature_cols]
_low = _features.min()
# A constant column has zero range; dividing by 1 instead maps it to 0.0
# rather than producing NaN from 0/0.
_span = (_features.max() - _low).replace(0, 1)

temp_soil_precip_ndvi_norm_df = temp_soil_precip_ndvi_df.copy()
temp_soil_precip_ndvi_norm_df[_feature_cols] = (_features - _low) / _span

In [179]:
# Overwrite the identifier columns of the normalised frame with the original
# (unscaled) values from the merged feature table.
for _key in ('region_id', 'year', 'month'):
    temp_soil_precip_ndvi_norm_df[_key] = temp_soil_precip_ndvi_df[_key]

In [186]:
# Build the prediction-time feature table: the normalised rows for the target
# year, without the year/month columns (region_id is kept so rows can be
# selected per region later).
TEST_YEAR = 2021  # year whose rows are held out as prediction inputs
X_test_df = (
    temp_soil_precip_ndvi_norm_df
    .loc[temp_soil_precip_ndvi_norm_df['year'] == TEST_YEAR]
    .drop(columns=['year', 'month'])
)
X_test_df

Unnamed: 0,region_id,temperature,moisture,precipitation,ndvi
84,93,0.276787,0.929327,0.432124,0.753153
85,93,0.464329,0.790686,0.009145,0.650753
86,93,0.605155,0.649088,0.189481,0.724333
87,93,0.571190,0.780011,0.227691,0.866430
88,93,0.467854,0.723631,0.246405,0.895116
...,...,...,...,...,...
978,105,0.450528,0.509700,0.217596,0.864120
979,105,0.441132,0.467040,0.246801,0.846155
980,105,0.471669,0.533600,0.202741,0.802791
981,105,0.300689,0.845618,0.450888,0.814903


In [188]:
# Attach the production rows to the normalised features on
# (region_id, year, month), then drop the date/key columns that are not used
# as model inputs. region_id stays so the data can be split per region.
train_df = (
    temp_soil_precip_ndvi_norm_df
    .merge(production_df, on=["region_id", "year", "month"])
    .drop(columns=['start_date', 'end_date', 'year', 'month'])
)
train_df

Unnamed: 0,region_id,temperature,moisture,precipitation,ndvi,prod
0,93,0.267484,0.640887,0.120192,0.618041,171725
1,93,0.455030,0.570826,0.029788,0.608247,188325
2,93,0.470265,0.468751,0.093544,0.581497,247856
3,93,0.469720,0.460857,0.292363,0.720801,282791
4,93,0.309228,0.542776,0.251008,0.807937,291057
...,...,...,...,...,...,...
711,105,0.519177,0.401056,0.212087,0.843223,57818
712,105,0.572812,0.534697,0.189893,0.831680,57474
713,105,0.480106,0.535196,0.315336,0.850742,51821
714,105,0.237488,0.598133,0.496803,0.799011,44947


In [200]:
def get_train_data(region_id, train_df):
    """Return the training features and target for a single region.

    Parameters
    ----------
    region_id
        Value matched against the ``region_id`` column of ``train_df``.
    train_df : pandas.DataFrame
        Must contain ``region_id``, the four feature columns
        (``temperature``, ``moisture``, ``precipitation``, ``ndvi``) and
        ``prod``.

    Returns
    -------
    (X_train, y_train)
        ``X_train`` is a DataFrame with the four feature columns and
        ``y_train`` a one-column DataFrame holding ``prod``; both keep the
        original row index of the selected rows.
    """
    feature_names = ['temperature', 'moisture', 'precipitation', 'ndvi']
    region_rows = train_df.loc[train_df['region_id'] == region_id]
    return region_rows[feature_names], region_rows[['prod']]

In [201]:
def get_test_data(region_id, test_df):
    """Return the rows of ``test_df`` for one region, without ``region_id``.

    Rows are selected where the ``region_id`` column equals ``region_id``;
    every other column is returned unchanged and the original row index is
    preserved.
    """
    in_region = test_df['region_id'] == region_id
    return test_df.loc[in_region].drop(columns=['region_id'])

In [212]:
def production_prediction(X_train, y_train, X_test):
    """Fit a fresh ``LinearRegression`` on the training data and predict.

    A new estimator is created on every call, fitted on
    ``(X_train, y_train)``, and the result of ``predict(X_test)`` is returned
    as-is.
    """
    regressor = LinearRegression().fit(X_train, y_train)
    return regressor.predict(X_test)

In [226]:
# Fit one model per region and collect the predictions for the held-out rows
# into a single flat integer array, ordered region by region (in the order
# regions first appear in train_df) and, within a region, in X_test_df row
# order.
regions = train_df['region_id'].unique()
_per_region = []
for region in regions:
    X_train, y_train = get_train_data(region, train_df)
    X_test = get_test_data(region, X_test_df)
    # NOTE(review): casting to int truncates toward zero; apply np.rint first
    # if rounding to the nearest unit is what is wanted.
    predict = np.asarray(production_prediction(X_train, y_train, X_test), dtype='int')
    # Flatten each region's result before collecting, so regions with a
    # different number of test rows can still be joined into one 1-D array
    # (stacking with np.array(...) only works when every region has the same
    # number of rows).
    _per_region.append(predict.ravel())
predictions = (
    np.concatenate(_per_region) if _per_region else np.array([], dtype='int')
)

In [248]:
# Fill the `prod` column of the submission template with the predictions and
# write the result to disk.
# NOTE(review): predictions are matched to template rows by index label
# (effectively by position), so this assumes the template lists rows in the
# same region/month order as `predictions` and has the same length -- confirm.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column -- confirm that is intended.
output = pd.read_csv('predicted_production_qty.csv')
output_df = pd.DataFrame(predictions)
output = output.assign(prod=output_df[0]).reindex(
    columns=['start_date', 'end_date', 'region_id', 'prod']
)
output.to_csv('mdskabid@usc.edu.csv')

In [249]:
# Show the final submission frame as this cell's output for a visual check.
output

Unnamed: 0,start_date,end_date,region_id,prod
0,2021-01-01T00:00:00.000Z,2021-01-31T00:00:00.000Z,93,247662
1,2021-02-01T00:00:00.000Z,2021-02-28T00:00:00.000Z,93,235785
2,2021-03-01T00:00:00.000Z,2021-03-31T00:00:00.000Z,93,263294
3,2021-04-01T00:00:00.000Z,2021-04-30T00:00:00.000Z,93,262955
4,2021-05-01T00:00:00.000Z,2021-05-31T00:00:00.000Z,93,254414
...,...,...,...,...
115,2021-08-01T00:00:00.000Z,2021-08-31T00:00:00.000Z,105,44999
116,2021-09-01T00:00:00.000Z,2021-09-30T00:00:00.000Z,105,45090
117,2021-10-01T00:00:00.000Z,2021-10-31T00:00:00.000Z,105,45168
118,2021-11-01T00:00:00.000Z,2021-11-30T00:00:00.000Z,105,54350
