In [137]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, RANSACRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor

In [223]:
# Read the two input tables from the working directory: `train` (feature and
# target columns) and `formula_train` (which carries the `material` strings).
train, formula_train = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'formula_train.csv')
)

In [130]:
# Build a per-row `formula` feature: for each material, sum the values that
# `formula_train` holds (in that same row) for the element symbols appearing in
# its formula string.
formulas = formula_train['material'].str.findall('([A-Z][a-z]*)([0-9.]*)')
element_sums = []
for row_pos, parsed in enumerate(formulas):
    # Each parsed item is a (symbol, amount) pair; only the symbol is needed
    # to pick the element columns.
    symbols = [symbol for symbol, _ in parsed]
    # NOTE(review): the previous version also converted the parsed amounts to
    # floats but never used them, so this sum is (and always was) unweighted.
    # Confirm whether an amount-weighted sum was intended.
    element_sums.append(formula_train[symbols].iloc[row_pos, :].sum())
train['formula'] = np.array(element_sums)

In [224]:
# Split every material string into (element symbol, amount) pairs. The amount
# group may match nothing, in which case it comes back as an empty string.
formulas = formula_train['material'].str.findall(pat='([A-Z][a-z]*)([0-9.]*)')
formulas

0                 [(Eu, 1), (Fe, 1.5), (Ru, 0.5), (As, 2)]
1             [(Bi, 2), (Sr, 2), (Ca, 1), (Cu, 2), (O, 8)]
2                                     [(N, 0.3), (S, 0.7)]
3                 [(Ba, 1), (Fe, 1.9), (Co, 0.1), (As, 2)]
4                                      [(Rh, 17), (S, 15)]
                               ...                        
16995                           [(La, 2), (Cu, 1), (O, 4)]
16996    [(Bi, 2), (Sr, 2), (Ca, 1), (Cu, 1.2), (Li, 0....
16997                               [(Os, 0.9), (Ru, 0.1)]
16998               [(Sm, 1), (Ba, 2), (Cu, 3), (O, 6.73)]
16999    [(Ba, 1), (Fe, 1.832), (Co, 0.15), (Mn, 0.018)...
Name: material, Length: 17000, dtype: object

In [225]:
# Add each parsed amount to the matching element column of `formula_train`,
# one row at a time. A symbol written without an explicit amount counts as 1.
#
# NOTE: the update is additive and in place, so running this cell a second time
# adds the amounts again; reload `formula_train` before re-running.
# NOTE(review): `.at[row, ...]` uses the enumerate position as the row label,
# which presumably relies on the default 0..n-1 index from read_csv - confirm.
for row, parsed in enumerate(formulas):
    for symbol, amount_text in parsed:
        # Built-in float() replaces the `np.float` alias, which NumPy deprecated
        # in 1.20 and removed in 1.24 (AttributeError on current releases).
        amount = float(amount_text) if amount_text != '' else 1
        formula_train.at[row, symbol] = formula_train.at[row, symbol] + amount

In [226]:
# Place the training table and the formula table side by side (column-wise),
# leaving out the last column of `formula_train`.
data = pd.concat([train, formula_train.iloc[:, :-1]], axis=1)

In [213]:
# Show the full pairwise correlation matrix of `data`.
# NOTE(review): this cell previously indexed the matrix with ['cri'], which
# looks like a truncated column name and would raise KeyError unless such a
# column exists; the recorded output is the full matrix, so the indexing was
# dropped. Confirm that no 'cri' column is expected.
data.corr()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,Ir,Pt,Au,Hg,Tl,Pb,Bi,Po,At,Rn
feature1,1.000000,-0.143847,-0.355671,-0.293672,-0.455712,0.939222,0.882318,0.681398,-0.321852,0.511752,...,-0.021827,-0.022415,-0.012088,0.015872,0.062713,-0.007323,0.104758,,,
feature2,-0.143847,1.000000,0.816045,0.940250,0.746522,-0.104769,-0.100459,0.123859,0.450905,0.195576,...,0.039385,0.083620,0.029000,0.053613,0.088983,0.040947,0.066590,,,
feature3,-0.355671,0.816045,1.000000,0.847721,0.964556,-0.309041,-0.415397,-0.144756,0.721212,-0.060214,...,0.044143,0.081542,0.037440,0.056701,0.063759,0.052774,0.078647,,,
feature4,-0.293672,0.940250,0.847721,1.000000,0.856621,-0.189586,-0.234388,-0.177170,0.462499,-0.122259,...,0.038677,0.075443,0.030253,0.024280,0.050419,0.038375,0.026250,,,
feature5,-0.455712,0.746522,0.964556,0.856621,1.000000,-0.370502,-0.486958,-0.351091,0.678597,-0.272247,...,0.039109,0.071033,0.036323,0.036428,0.031672,0.047812,0.035543,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,-0.007323,0.040947,0.052774,0.038375,0.047812,-0.006950,-0.012312,0.009218,0.048532,0.011668,...,-0.000601,-0.002153,-0.000357,-0.000790,-0.001596,1.000000,0.011989,,,
Bi,0.104758,0.066590,0.078647,0.026250,0.035543,0.064118,0.055389,0.186317,0.078418,0.169601,...,-0.003570,-0.014563,-0.002192,-0.010604,-0.011926,0.011989,1.000000,,,
Po,,,,,,,,,,,...,,,,,,,,,,
At,,,,,,,,,,,...,,,,,,,,,,


In [227]:
# Absolute correlation of every column of `data` with the target column.
correlation2 = data.corr()['critical_temperature'].abs()
correlation2

feature1    0.601695
feature2    0.115790
feature3    0.313932
feature4    0.233108
feature5    0.371676
              ...   
Pb          0.013307
Bi          0.169786
Po               NaN
At               NaN
Rn               NaN
Name: critical_temperature, Length: 168, dtype: float64

In [231]:
# Keep the columns of `data` whose absolute correlation with the target is at
# least 0.2; NaN correlations fail the comparison and are therefore left out.
fet = data.loc[:, correlation2.loc[correlation2 >= 0.2].index]

In [133]:
# Absolute correlation of every column of `train` with the target column.
correlation = train.corr()['critical_temperature'].abs()
correlation

feature1                0.601695
feature2                0.115790
feature3                0.313932
feature4                0.233108
feature5                0.371676
                          ...   
feature79               0.440456
feature80               0.208518
feature81               0.301859
critical_temperature    1.000000
formula                 0.116671
Name: critical_temperature, Length: 83, dtype: float64

In [239]:
# Keep the columns of `data` whose absolute correlation with the target is at
# least 0.6. The target correlates perfectly with itself, so it is among them.
# (An earlier selection from `train` was overwritten immediately and has been
# removed as dead code.)
target_col = 'critical_temperature'
features = data[correlation2[correlation2 >= 0.6].keys()]
# Split by column name rather than by position: taking "the last column" as the
# target silently puts the target into X whenever a column that comes after it
# in `data` also passes the threshold.
X = features.drop(columns=target_col).values
y = features[target_col].values

In [240]:
# Score AdaBoost over a 50-tree random forest with 10 random 80/20 splits and
# report the mean R^2.
#
# The inner estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the positional form works on both sides of the rename.
#
# A fixed seed makes the score repeatable. Any previously recorded score came
# from an unseeded run, so it will not match to every digit.
SEED = 0
cl = AdaBoostRegressor(
    RandomForestRegressor(n_estimators=50, random_state=SEED),
    random_state=SEED,
)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=SEED)
cross_val_score(cl, X, y=y, cv=cv, scoring='r2').mean()

0.9018408696497004

In [None]:
# Fit the regressor on all of X and y (no hold-out split in this cell).
cl.fit(X=X, y=y)