In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [64]:
# Initial set of 5 countries, chosen because they have abundant data
countries = [
    'Australia',
    'Canada',
    'Spain',
    'Norway',
    'Brazil',
]

In [57]:
# First publication year included in the per-year paper statistics.
_FIRST_PUBLICATION_YEAR = 1998

# Output feature column -> per-year statistic it is derived from.
_FEATURE_SOURCES = {
    'Change Volume': 'Paper Volume',
    'Change Citations': 'Mean Adj. Citations',
    'Change Authors': 'Mean Authors',
    'Change Orgs': 'Mean Orgs',
}


def _consecutive_percent_changes(values, first, count, label):
    """Return `count` relative changes between consecutive entries of `values`.

    Element i of the result is
    (values[first + i + 1] - values[first + i]) / values[first + i].
    Raises IndexError when `values` has too few entries for the request.
    """
    if count <= 0:
        return np.zeros(0)
    window = np.asarray(values, dtype=float)[first:first + count + 1]
    if len(window) < count + 1:
        raise IndexError(
            f'{label}: need {count + 1} consecutive rows starting at position '
            f'{first}, found {len(window)}'
        )
    # A zero baseline yields inf/nan (with a numpy RuntimeWarning), not an exception.
    return (window[1:] - window[:-1]) / window[:-1]


def _yearly_paper_stats(papers, last_year):
    """Summarise papers into one row per year from 1998 through `last_year`."""
    records = []
    for year in range(_FIRST_PUBLICATION_YEAR, last_year + 1):
        year_papers = papers[papers['Publication Year'] == year]
        records.append({
            'Year': year,
            'Paper Volume': len(year_papers),
            # the mean of an empty selection is NaN, so a year without papers gives NaN
            'Mean Adj. Citations': year_papers['Adjusted Citations'].mean(),
            'Mean Authors': year_papers['Country Specific Authors'].mean(),
            'Mean Orgs': year_papers['Country Specific Orgs'].mean(),
        })
    return pd.DataFrame(records).astype(float)


def compute_percent_change_features(country_list, start_date, train_date, test_date):
    """Build year-over-year percent-change features and labels per country.

    For each country in `country_list`, reads '<country>_biodiversity_research.csv',
    keeps papers with 'Country Specific Authors' > 0, and computes per-year paper
    volume plus mean 'Adjusted Citations', 'Country Specific Authors' and
    'Country Specific Orgs'. Labels come from the 'Value' column of
    'protected_land_cleaned.csv', filtered by 'Country' and 'Year'.

    Parameters
    ----------
    country_list : iterable of str
        Country names; each selects a CSV file and the rows of the
        protected-land table whose 'Country' equals that name.
    start_date : int
        First baseline year; the first training change is start_date -> start_date + 1.
        Must be >= 1998.
    train_date : int
        Last year of the training window (the last training change ends here).
    test_date : int
        Last year of the test window. The test window starts at train_date + 1,
        so the first test change is train_date + 1 -> train_date + 2.

    Returns
    -------
    tuple of four lists, each with one DataFrame per country, in input order:
        percent_change_train            (train_date - start_date rows; columns
                                         'Change Volume', 'Change Citations',
                                         'Change Authors', 'Change Orgs')
        percent_change_test             (test_date - train_date - 1 rows, same columns)
        percent_change_protected_train  (column 'Change Protected Percent')
        percent_change_protected_test   (column 'Change Protected Percent')

    Raises
    ------
    ValueError
        If the dates do not satisfy 1998 <= start_date <= train_date < test_date.
    IndexError
        If a country has too few protected-land rows in [start_date, test_date].

    Notes
    -----
    Protected-land rows are consumed by position after filtering, so the table is
    assumed to hold one row per country and year in ascending year order
    (NOTE(review): confirm against protected_land_cleaned.csv).
    """
    if not (_FIRST_PUBLICATION_YEAR <= start_date <= train_date < test_date):
        raise ValueError(
            f'expected {_FIRST_PUBLICATION_YEAR} <= start_date <= train_date < test_date, '
            f'got start_date={start_date}, train_date={train_date}, test_date={test_date}'
        )

    # Iterate the caller's countries (not a notebook global); allow any iterable.
    country_list = list(country_list)

    n_train = train_date - start_date
    n_test = test_date - train_date - 1
    # Row positions inside the per-year statistics, whose row 0 is 1998.
    train_first = start_date - _FIRST_PUBLICATION_YEAR
    test_first = train_date + 1 - _FIRST_PUBLICATION_YEAR

    percent_change_train = []
    percent_change_test = []
    for country in country_list:
        papers = pd.read_csv(f'{country}_biodiversity_research.csv')
        papers = papers[papers['Country Specific Authors'] > 0]
        yearly = _yearly_paper_stats(papers, test_date)

        percent_change_train.append(pd.DataFrame({
            feature: _consecutive_percent_changes(yearly[source], train_first, n_train, country)
            for feature, source in _FEATURE_SOURCES.items()
        }))
        percent_change_test.append(pd.DataFrame({
            feature: _consecutive_percent_changes(yearly[source], test_first, n_test, country)
            for feature, source in _FEATURE_SOURCES.items()
        }))

    protected_df = pd.read_csv('protected_land_cleaned.csv')
    in_window = protected_df['Year'].between(start_date, test_date)

    percent_change_protected_train = []
    percent_change_protected_test = []
    for country in country_list:
        # Position 0 corresponds to start_date under the ordering assumption above.
        values = protected_df.loc[in_window & (protected_df['Country'] == country), 'Value']
        label = f'{country} protected land'

        percent_change_protected_train.append(pd.DataFrame({
            'Change Protected Percent': _consecutive_percent_changes(values, 0, n_train, label)
        }))
        # The test window starts one year after train_date, i.e. position n_train + 1.
        percent_change_protected_test.append(pd.DataFrame({
            'Change Protected Percent': _consecutive_percent_changes(values, n_train + 1, n_test, label)
        }))

    return percent_change_train, percent_change_test, percent_change_protected_train, percent_change_protected_test

In [65]:
# Build the per-country feature and label tables: baseline year 2007,
# training window ending in 2015, test window ending in 2021.
X_train_list, X_test_list, y_train_list, y_test_list = compute_percent_change_features(
    country_list=countries,
    start_date=2007,
    train_date=2015,
    test_date=2021,
)

  country_df = pd.read_csv(f'{country}_biodiversity_research.csv')
  country_df = pd.read_csv(f'{country}_biodiversity_research.csv')
  country_df = pd.read_csv(f'{country}_biodiversity_research.csv')
  country_df = pd.read_csv(f'{country}_biodiversity_research.csv')


In [66]:
# Inspect the percent-change training features of the first country in the list
X_train_list[0]

Unnamed: 0,Change Volume,Change Citations,Change Authors,Change Orgs
0,4.142857,0.037437,-0.105556,0.010163
1,0.680556,-0.123062,0.091163,-0.011058
2,0.206612,0.430216,0.002149,0.055859
3,0.106164,-0.069008,0.013509,0.078415
4,0.139319,0.027436,0.084505,0.076754
5,0.214674,0.047168,-0.101331,-0.115613
6,0.228188,0.013154,0.077536,0.048479
7,0.176685,-0.118305,0.112391,0.089999


In [80]:
def calc_MSE(y_pred, y_true):
    """Return the mean squared error between predictions and true values.

    Both arguments may be anything numpy can convert to a numeric array
    (lists, arrays, Series, DataFrames). They are flattened before comparison,
    so an (n,) vector and an (n, 1) column are treated as the same n values
    instead of being broadcast against each other. For multi-column inputs the
    error is averaged over every element.

    Raises
    ------
    ValueError
        If the inputs are empty or hold a different number of values.
    """
    predicted = np.asarray(y_pred, dtype=float).ravel()
    actual = np.asarray(y_true, dtype=float).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            f'y_pred has {predicted.size} values but y_true has {actual.size}'
        )
    if predicted.size == 0:
        raise ValueError('cannot compute the MSE of empty inputs')
    return np.mean((predicted - actual) ** 2)

In [81]:
# Baseline model: ordinary linear regression fitted on Country 0 (Australia)

regr = LinearRegression().fit(X_train_list[0], y_train_list[0])

# mean squared error of the fitted model on the test set
calc_MSE(regr.predict(X_test_list[0]), y_test_list[0])

0.012589574822727679

In [84]:
from sklearn.preprocessing import PolynomialFeatures

# Compare polynomial orders 1 through 10 and report which one minimizes the MSE on the test set.
# NOTE(review): the degree is selected on the same test set that is used to report the error,
# so the winning MSE is an optimistic estimate -- consider a separate validation split.
degrees = [*range(1, 11)]

for degree in degrees:
    # expand the features; the transform is fitted on the training rows and reused on the test rows
    poly = PolynomialFeatures(degree)
    X_poly_train = poly.fit_transform(X_train_list[0])
    X_poly_test = poly.transform(X_test_list[0])

    # linear model on top of the expanded features
    poly_regr = LinearRegression().fit(X_poly_train, y_train_list[0])

    # report the test-set MSE for this degree
    print(f'MSE of Degree {degree} polynomial: {calc_MSE(poly_regr.predict(X_poly_test), y_test_list[0])}')

MSE of Degree 1 polynomial: 0.012589574822727677
MSE of Degree 2 polynomial: 0.02852839058164885
MSE of Degree 3 polynomial: 0.01062191798749243
MSE of Degree 4 polynomial: 0.013470092861679044
MSE of Degree 5 polynomial: 0.013034545015961907
MSE of Degree 6 polynomial: 0.013479253846113166
MSE of Degree 7 polynomial: 0.013506837887267581
MSE of Degree 8 polynomial: 0.013582727764682495
MSE of Degree 9 polynomial: 0.013599283226812836
MSE of Degree 10 polynomial: 0.013613139260290964


In [87]:
# Refit the degree 3 polynomial, the order with the lowest test-set MSE in the sweep
poly = PolynomialFeatures(3)
X_poly_train = poly.fit_transform(X_train_list[0])
X_poly_test = poly.transform(X_test_list[0])

# linear model on the cubic feature expansion
poly_regr = LinearRegression().fit(X_poly_train, y_train_list[0])

# show the model's predictions next to the observed changes
print('Predictions:\n', poly_regr.predict(X_poly_test))
print('\nActual Values:\n', y_test_list[0]['Change Protected Percent'])

Predictions:
 [[ 0.03921306]
 [ 0.06903876]
 [ 0.08466675]
 [ 0.03615381]
 [-0.20262732]]

Actual Values:
 0    0.013382
1    0.005280
2    0.000147
3    0.049414
4    0.000000
Name: Change Protected Percent, dtype: float64


In [None]:
# Next steps: Add latent variables to X_train data