# Baseline Submission for the Challenge SPCRT

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Load Data

In [2]:
# Read the public training split into a DataFrame.
train_csv_path = 'aicrowd_educational_spcrt/data/public/train.csv'
train_data = pd.read_csv(train_csv_path)

## Clean and analyse the data

In [4]:
# Preview the first five rows (five is also the default row count of head()).
train_data.head(n=5)

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,3,86.2991,65.78961,64.984139,49.7654,0.836621,1.013759,146.8813,20.95061,63.713516,...,3.5,3.301927,3.464102,1.0889,0.971342,1,1.4,0.471405,0.5,4.5
1,5,72.952854,56.414763,59.186241,35.639703,1.445795,1.04152,122.90607,35.383159,40.250192,...,2.257143,2.168944,2.219783,1.594167,1.08748,1,1.131429,0.4,0.437059,7.6
2,6,82.318112,99.033554,53.069787,71.259834,1.427749,1.324091,192.981,40.19614,70.933858,...,4.3,3.203101,3.772087,1.647214,1.510613,5,1.58,1.950783,1.791647,3.01
3,4,57.444449,60.47665,56.067907,58.936797,1.362775,1.128041,34.8436,27.02198,12.367487,...,3.65,3.309751,3.442623,1.333736,1.089489,3,1.8,1.118034,1.19478,14.1
4,4,76.517718,56.808817,59.310096,35.773432,1.197273,0.98188,122.90607,34.83316,44.289459,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,36.8


In [5]:
# Summary statistics for every numeric column; the quartiles listed here are
# the ones describe() reports by default, spelled out for the reader.
train_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
count,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,...,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0,18073.0
mean,4.116527,87.495853,72.915281,71.193951,58.444208,1.165612,1.064409,115.732133,33.213727,44.442844,...,3.152312,3.056546,3.054714,1.296028,1.054028,2.044708,1.481685,0.841078,0.676041,34.492796
std,1.439625,29.586564,33.320437,30.920472,36.470563,0.365019,0.401233,54.718595,26.886071,20.068666,...,1.189356,1.043451,1.172383,0.392761,0.380274,1.242861,0.976455,0.485247,0.455984,34.307997
min,1.0,6.941,6.941,5.685033,3.193745,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00021
25%,3.0,72.45124,52.177725,58.001648,35.25859,0.969858,0.777619,78.35315,16.83045,32.890369,...,2.118056,2.279705,2.092115,1.060857,0.778998,1.0,0.920286,0.471405,0.308515,5.4
50%,4.0,84.84188,60.786693,66.361592,39.898482,1.199541,1.146366,122.90607,26.658401,45.1295,...,2.618182,2.615321,2.433589,1.368922,1.16541,2.0,1.062667,0.8,0.5,20.0
75%,5.0,100.351275,85.99413,78.019689,73.097796,1.444537,1.360442,155.006,38.360375,59.663892,...,4.03,3.741657,3.920517,1.589027,1.331926,3.0,1.92,1.2,1.021023,63.0
max,9.0,208.9804,208.9804,208.9804,208.9804,1.983797,1.958203,207.97246,205.58991,101.0197,...,7.0,7.0,7.0,2.141963,1.949739,6.0,6.9922,3.0,3.0,185.0


## Split Data for Train and Validation

In [6]:
# Separate the feature matrix from the regression target. The target column is
# dropped by keyword: passing `axis` positionally to DataFrame.drop was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = train_data.drop(columns='critical_temp')
y = train_data['critical_temp']
# Validation testing: hold out 20% of the rows; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Define the Regressor and Train

In [7]:
# Ordinary least-squares model with default settings, fitted on the training split.
regressor = LinearRegression()
# Left as the cell's last expression so the fitted estimator is displayed.
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Inspect the fitted coefficients (first five features)

In [8]:
# One row per feature, holding the coefficient the linear model learned for it.
coeff_df = pd.DataFrame({'Coefficient': regressor.coef_}, index=X.columns)
# Show the first five features, in the training-column order.
coeff_df.head(n=5)

Unnamed: 0,Coefficient
number_of_elements,-4.202422
mean_atomic_mass,0.833105
wtd_mean_atomic_mass,-0.881193
gmean_atomic_mass,-0.51061
wtd_gmean_atomic_mass,0.64218


## Predict on validation

In [9]:
# Predictions of the fitted model for the held-out validation features.
y_pred = regressor.predict(X=X_val)

In [11]:
# Side-by-side table of true and predicted critical temperatures
# (indexed like y_val), plus a 25-row excerpt of it.
actual_vs_predicted = {'Actual': y_val, 'Predicted': y_pred}
df = pd.DataFrame(actual_vs_predicted)
df1 = df.iloc[:25]

## Evaluate the Performance

In [12]:
# Compute each validation metric once, then report them.
mae = metrics.mean_absolute_error(y_val, y_pred)
mse = metrics.mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # RMSE is expressed in the same units as the target

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 13.42086725495139
Mean Squared Error: 323.28465055058496
Root Mean Squared Error: 17.98011820179681


## Load Test Set

In [13]:
# Read the public test split into a DataFrame.
test_csv_path = 'aicrowd_educational_spcrt/data/public/test.csv'
test_data = pd.read_csv(test_csv_path)

In [14]:
# Preview the first five rows of the test features (five is head()'s default).
test_data.head(n=5)

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,mean_Valence,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence
0,2,82.76819,87.837285,82.144935,87.360109,0.685627,0.509575,20.27638,51.522285,10.13819,...,4.5,4.75,4.472136,4.728708,0.686962,0.514653,1,2.75,0.5,0.433013
1,4,76.444563,81.45675,59.356672,68.229617,1.199541,1.108189,121.3276,36.950657,43.823354,...,2.25,2.142857,2.213364,2.119268,1.368922,1.309526,1,0.571429,0.433013,0.349927
2,5,88.936744,51.090431,70.358975,34.783991,1.445824,1.525092,122.90607,10.438667,46.482335,...,2.4,2.114679,2.352158,2.095193,1.589027,1.314189,1,0.96789,0.489898,0.318634
3,4,76.517718,56.149432,59.310096,35.562124,1.197273,1.042132,122.90607,31.92069,44.289459,...,2.25,2.251429,2.213364,2.214646,1.368922,1.078855,1,1.074286,0.433013,0.433834
4,3,104.60849,89.558979,101.719818,88.48121,1.070258,0.944284,59.94547,33.541423,25.225148,...,5.0,5.811245,4.762203,5.743954,1.05492,0.80399,3,3.024096,1.414214,0.728448


## Predict on test set

In [15]:
# Predict on the test features, explicitly selecting the training columns in
# the training order. Older scikit-learn releases match features purely by
# position, so a reordered or extra column in test.csv would otherwise yield
# silently wrong predictions; a missing feature now raises a KeyError instead.
y_test = regressor.predict(test_data[X.columns])

## Save the predictions in the correct format

In [17]:
# Write the test predictions as a single-column CSV named after the target,
# without the row index.
submission_path = 'aicrowd_educational_spcrt/data/public/submission.csv'
df = pd.DataFrame({'critical_temp': y_test})
df.to_csv(submission_path, index=False)

To participate in the challenge, click [here](https://www.aicrowd.com/challenges/spcrt-superconductor-critical-temperature).