In [401]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression

In [441]:
x_train = pd.read_csv('./global_data/x_train.csv')
y_train = pd.read_csv('./global_data/y_train.csv')
x_test = pd.read_csv('./global_data/x_test.csv')

In [404]:
x_train_fr = x_train[x_train['COUNTRY'] == 'FR'].drop('COUNTRY', axis=1).fillna(0)
y_train_fr = y_train.merge(x_train_fr)['TARGET'].fillna(0)
x_train_de = x_train[x_train['COUNTRY'] == 'DE'].drop('COUNTRY', axis=1).fillna(0)
y_train_de = y_train.merge(x_train_de)['TARGET'].fillna(0)

In [406]:
def getLR(x, y): 
    total = 0
    loops = 300

    for i in range(loops): 
        xTr, xTe, yTr, yTe = train_test_split(x, y, test_size=0.2, random_state=i) 

        lr = LinearRegression()
        lr.fit(xTr, yTr)
        total = total + 100 * spearmanr(lr.predict(xTe), yTe).correlation
    
    return (total / loops)

In [408]:
def getBestAttributeLR(data, x, prev_corr, y):
    cols = data.columns.tolist()
    corr = 0
    feature = None

    for col in cols:
        x[col] = data[col]
        temp_corr = getLR(x, y)
        
        if temp_corr > corr:
            corr = temp_corr
            feature = col
        else:
            x = x.drop(col, axis=1)

    if corr < prev_corr or len(cols) == 1 or feature == None:
        return x 
    else:
        data = data.drop(feature, axis=1)
        return getBestAttributeLR(data, x, corr, y)


In [409]:
empty_fr = pd.DataFrame() 
empty_fr['ID'] = x_train_fr['ID']
optimum_fr = getBestAttributeLR(x_train_fr, empty_fr, 0, y_train_fr)

In [410]:
optimum_fr

Unnamed: 0,DE_NET_EXPORT,DE_HYDRO,CARBON_RET,ID
0,0.000000,2.209047,-0.002445,1054
1,-0.573520,0.187964,-0.490365,2049
2,-0.622021,-0.108578,0.204952,1924
4,0.000000,-0.230179,2.614378,1101
5,-1.117139,2.306980,1.124457,1520
...,...,...,...,...
1483,-0.977214,1.781299,1.322433,1776
1486,1.403843,-0.385397,1.472650,1401
1487,0.101161,-0.315249,0.606318,1728
1490,0.449153,-0.341147,0.356356,1674


In [411]:
getLR(optimum_fr, y_train_fr)

15.64635502501608

In [412]:
empty_de = pd.DataFrame() 
empty_de['ID'] = x_train_de['ID']
optimum_de = getBestAttributeLR(x_train_de, empty_de, 0, y_train_de)

In [413]:
optimum_de

Unnamed: 0,FR_CONSUMPTION,DE_SOLAR,DE_FR_EXCHANGE,FR_NUCLEAR,ID,DE_HYDRO,FR_DE_EXCHANGE,DE_CONSUMPTION,DE_NET_EXPORT,FR_GAS,FR_COAL
3,-0.849198,1.751523,-0.839586,-1.589554,297,-0.368417,0.839586,-0.983324,-0.270870,0.194659,-0.786025
11,-0.811337,1.172155,0.237105,-1.838556,819,1.282374,-0.237105,-0.055692,-0.851082,-0.145637,-0.770846
12,-0.331101,-0.324789,0.339942,-1.719257,918,-0.168264,-0.339942,0.532116,-0.173123,1.382599,-0.753365
13,-1.062255,0.509242,-1.380464,-1.200788,283,0.523610,1.380464,-0.328286,-1.046122,-0.730992,-0.455367
14,1.629315,-1.234093,1.129663,-0.277100,158,1.223032,-1.129663,1.028987,-0.391261,2.122272,0.857983
...,...,...,...,...,...,...,...,...,...,...,...
1485,-0.235078,-0.394101,0.962203,-0.555283,391,0.128621,-0.962203,0.810509,0.971934,0.219593,-0.132530
1488,-1.051247,0.344296,-1.651718,-1.289413,342,2.053830,1.651718,-0.295522,-1.969871,0.076716,-0.781673
1489,1.106682,-1.325306,-1.855327,0.712492,459,-0.470809,1.855327,1.529204,-0.218658,1.388269,-0.294001
1491,0.489199,-0.626731,-0.255778,-0.964588,748,0.037892,0.255778,0.856399,-1.531544,1.866399,-0.180117


In [414]:
getLR(optimum_de, y_train_de)

38.67580056415262

In [417]:
lr_fr = LinearRegression()
lr_fr.fit(optimum_fr, y_train_fr)
lr_de = LinearRegression()
lr_de.fit(optimum_de, y_train_de)

In [419]:
fr_cols = optimum_fr.columns
de_cols = optimum_de.columns

In [432]:
x_test_fr = x_test[x_test['COUNTRY'] == 'FR'].drop('COUNTRY', axis=1).fillna(0)[fr_cols]
x_test_de = x_test[x_test['COUNTRY'] == 'DE'].drop('COUNTRY', axis=1).fillna(0)[de_cols]

y_pred_de = lr_de.predict(x_test_de)
y_pred_fr = lr_fr.predict(x_test_fr)

y_de = pd.DataFrame()
y_de['ID'] = x_test_de['ID']
y_de['TARGET'] = y_pred_de

y_fr = pd.DataFrame()
y_fr['ID'] = x_test_fr['ID']
y_fr['TARGET'] = y_pred_fr

y_pred = pd.concat([y_fr, y_de])

In [433]:
y_csv = y_pred.to_csv('WACC35_Submission1.csv')

In [468]:
def test_local(fr_model, de_model):
    total = 0
    result = 0
    loops = 1000

    for i in range(loops):
        xTr, xTe, yTr, yTe = train_test_split(x_train, y_train, test_size=0.2, random_state=i) 

        x_test_fr = xTe[xTe['COUNTRY'] == 'FR'].drop('COUNTRY', axis=1).fillna(0)[fr_cols]
        x_test_de = xTe[xTe['COUNTRY'] == 'DE'].drop('COUNTRY', axis=1).fillna(0)[de_cols]

        y_pred_de = de_model.predict(x_test_de)
        y_pred_fr = fr_model.predict(x_test_fr)

        y_de = pd.DataFrame()
        y_de['ID'] = x_test_de['ID']
        y_de['TARGET'] = y_pred_de

        y_fr = pd.DataFrame()
        y_fr['ID'] = x_test_fr['ID']
        y_fr['TARGET'] = y_pred_fr

        y_pred = pd.concat([y_fr, y_de])

        y_pred = y_pred.sort_values(ascending=True, by='ID')
        yTe = yTe.sort_values(ascending=True, by='ID')
        spearman = (100 * spearmanr(y_pred['TARGET'], yTe['TARGET']).correlation)
        total = total + spearman

    result = total / loops
    return result

In [469]:
test_local(lr_fr, lr_de)

32.798683045902614

In [None]:
pls_fr = PLSRegression