In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import scale
import matplotlib
from matplotlib import pylab, gridspec, pyplot as plt
from sklearn.metrics import f1_score
import xgboost
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('train.csv', sep='\t', low_memory=False)

In [3]:
df.head()

Unnamed: 0,pdb_chain,DSSR,xray,resol,chainlen,protein,alpham2,betam2,gammam2,deltam2,...,atomO2p,atomO3p,atomO4,atomO4p,atomO5p,atomO6,atomOP1,atomOP2,atomOP3,mg
0,1b23.cif1_R,R.G.1.,1,2.6,74,1,,,,,...,0,0,0,0,0,0,0,0,1,0
1,1b23.cif1_R,R.G.1.,1,2.6,74,1,,,,,...,0,0,0,0,0,0,1,0,0,0
2,1b23.cif1_R,R.G.1.,1,2.6,74,1,,,,,...,0,0,0,0,0,0,0,1,0,0
3,1b23.cif1_R,R.G.1.,1,2.6,74,1,,,,,...,0,0,0,0,1,0,0,0,0,0
4,1b23.cif1_R,R.G.1.,1,2.6,74,1,,,,,...,0,0,0,1,0,0,0,0,0,0


In [4]:
df.dropna(inplace=True)

In [87]:
describe = df.describe(include='all')

In [142]:
with pd.option_context('display.max_rows', 10, 'display.max_columns', None):
    print(df)

          pdb_chain     DSSR  xray  resol  chainlen  protein  alpham2  betam2  \
35      1b23.cif1_R   R.G.4.     1   2.60        74        1    -60.0   177.7   
36      1b23.cif1_R   R.G.4.     1   2.60        74        1    -60.0   177.7   
37      1b23.cif1_R   R.G.4.     1   2.60        74        1    -60.0   177.7   
38      1b23.cif1_R   R.G.4.     1   2.60        74        1    -60.0   177.7   
39      1b23.cif1_R   R.G.4.     1   2.60        74        1    -60.0   177.7   
...             ...      ...   ...    ...       ...      ...      ...     ...   
237869  5up6.cif1_R  R.G.17.     0   3.74        20        1    -74.3   155.1   
237870  5up6.cif1_R  R.G.17.     0   3.74        20        1    -74.3   155.1   
237871  5up6.cif1_R  R.G.17.     0   3.74        20        1    -74.3   155.1   
237872  5up6.cif1_R  R.G.17.     0   3.74        20        1    -74.3   155.1   
237873  5up6.cif1_R  R.G.17.     0   3.74        20        1    -74.3   155.1   

        gammam2  deltam2  e

In [5]:
def number_encode_features(init_df, col_index):
    result = init_df.copy()
    encoders = {}
    for column in col_index:
        if result.dtypes[column] == np.object:
            encoders[column] = LabelEncoder()
            result[column] = encoders[column].fit_transform(result[column])
    return result, encoders

# col_index = np.ravel(np.argwhere(describe.iloc[1, :] < 4))

In [6]:
col_index = ['pdb_chain' , 'DSSR']
encoded_data, encoders = number_encode_features(df, col_index)
encoded_data.head()

Unnamed: 0,pdb_chain,DSSR,xray,resol,chainlen,protein,alpham2,betam2,gammam2,deltam2,...,atomO2p,atomO3p,atomO4,atomO4p,atomO5p,atomO6,atomOP1,atomOP2,atomOP3,mg
35,0,9577,1,2.6,74,1,-60.0,177.7,50.4,84.8,...,0,0,0,0,0,0,1,0,0,0
36,0,9577,1,2.6,74,1,-60.0,177.7,50.4,84.8,...,0,0,0,0,0,0,0,1,0,0
37,0,9577,1,2.6,74,1,-60.0,177.7,50.4,84.8,...,0,0,0,0,1,0,0,0,0,0
38,0,9577,1,2.6,74,1,-60.0,177.7,50.4,84.8,...,0,0,0,1,0,0,0,0,0,0
39,0,9577,1,2.6,74,1,-60.0,177.7,50.4,84.8,...,0,1,0,0,0,0,0,0,0,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(encoded_data.iloc[:, 0 : 385], encoded_data.iloc[:, -1], test_size=0.33, random_state=42)

In [200]:
X_train = scale(X_train)

In [201]:
X_test = scale(X_test)

In [202]:
X_train.shape, X_test.shape

((41808, 385), (167235, 385))

In [203]:
def grid_plot(x, y, x_label, title, y_label):
    plt.figure(figsize=(12, 6))
    plt.grid(True)
    plt.plot(x, y, 'go-')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)

In [120]:
model = KNeighborsClassifier(n_jobs=-1)

In [144]:
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=1, pos_label='1')

In [145]:
grid = {'n_neighbors': np.array(np.arange(3, 11, 2), dtype='int')}
gs = GridSearchCV(model, grid, cv=5, n_jobs=-1, scoring='f1', verbose=2)

In [146]:
grid

{'n_neighbors': array([3, 5, 7, 9])}

In [147]:
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] n_neighbors=3 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] n_neighbors=5 ...................................................
[CV] n_neighbors=5 ...................................................
[CV] n_neighbors=5 ...................................................


KeyboardInterrupt: 

In [None]:
grid_plot(grid['n_neighbors'], gs.cv_results_['mean_test_score'], 'n_neighbors', 'KNeighborsClassifier', y_label='f1_score')

In [None]:
gs.best_params_, gs.best_score_

In [None]:
y_ans = gs.predict(X_test)

In [8]:
knn = KNeighborsClassifier(n_neighbors=9, n_jobs=-1)

In [9]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=9, p=2,
           weights='uniform')

In [10]:
y_pred = knn.predict(X_test[0 : 20000])

In [11]:
y_pred.shape

(20000,)

In [12]:
f1_score(y_test[0 : 20000], y_pred)

0.181284834262247

In [161]:
df_test = pd.read_csv('test.csv', sep=',')

In [164]:
col_index = ['pdb_chain' , 'DSSR']
encoded_data_test, encoders_test = number_encode_features(df_test, col_index)
encoded_data_test.head()

Unnamed: 0,Id,index,pdb_chain,DSSR,xray,resol,chainlen,protein,alpham2,betam2,...,atomO2,atomO2p,atomO3p,atomO4,atomO4p,atomO5p,atomO6,atomOP1,atomOP2,atomOP3
0,0,28,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,0,0,0,1,0,0
1,1,29,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,0,0,0,0,1,0
2,2,30,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,0,1,0,0,0,0
3,3,31,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,1,0,0,0,0,0
4,4,32,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,1,0,0,0,0,0,0,0


In [165]:
encoded_data_test = scale(encoded_data_test.as_matrix())

In [166]:
encoded_data_test.shape

(4045, 387)

In [167]:
y_ans = knn.predict(encoded_data_test[:, 2: ])

In [52]:
def write_ans(preds):
    answer = pd.DataFrame({'Id': np.arange(preds.shape[0])})
    print(len(answer))
    answer['mg'] = preds
    print(len(preds))
    answer.to_csv('ans.csv', index=False)

In [62]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [9]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
clf = RandomForestClassifier(n_estimators=50)
scoring = 'f1'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

[0.5790499  0.57688835 0.57943925 0.57451598 0.55951843]


In [12]:
clf = RandomForestClassifier(n_estimators=100)
scoring = 'f1'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

[0.58201374 0.57551143 0.58188472 0.57629652 0.5706611 ]


In [13]:
clf = RandomForestClassifier(n_estimators=500)
scoring = 'f1'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

[0.58832247 0.58275965 0.58848549 0.58072217 0.57518853]


In [11]:
clf = GaussianNB()
scoring = 'f1'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

[0.25960031 0.25950653 0.2531977  0.25124626 0.2663898 ]


In [60]:
clf = xgboost.XGBClassifier()
scoring = 'f1'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


[0.01033325 0.00312012 0.00742312 0.01155462 0.00871571]


In [63]:
clf = AdaBoostClassifier()
scoring = 'f1'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

[0.02190525 0.02394904 0.03254132 0.03234917 0.03299175]


In [14]:
df_test = pd.read_csv('test.csv', sep=',')

In [15]:
df_test.dropna(inplace=True)
col_index = ['pdb_chain' , 'DSSR']
encoded_data_test, encoders_test = number_encode_features(df_test, col_index)
encoded_data_test.head()

Unnamed: 0,Id,index,pdb_chain,DSSR,xray,resol,chainlen,protein,alpham2,betam2,...,atomO2,atomO2p,atomO3p,atomO4,atomO4p,atomO5p,atomO6,atomOP1,atomOP2,atomOP3
0,0,28,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,0,0,0,1,0,0
1,1,29,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,0,0,0,0,1,0
2,2,30,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,0,1,0,0,0,0
3,3,31,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,0,0,1,0,0,0,0,0
4,4,32,1,356,1,2.3,21,1,-73.3,174.9,...,0,0,1,0,0,0,0,0,0,0


In [75]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)

In [76]:
f1_score(y_test, prediction)

0.6043876567020251

In [77]:
y_ans = clf.predict(encoded_data_test.iloc[:, 2: ])

In [78]:
len(prediction)

68985

In [79]:
len(np.argwhere(prediction == 1))

7326

In [64]:
clf_50 = AdaBoostClassifier()
clf_50.fit(X_train, y_train)
prediction = clf_50.predict(X_test)

In [65]:
f1_score(y_test, prediction)

0.028742263715514527

In [66]:
y_ans = clf_50.predict(encoded_data_test.iloc[:, 2: ])

In [80]:
len(np.argwhere(y_ans == 1))

0

In [81]:
len(y_ans)

4045

In [69]:
write_ans(y_ans)

4045
4045
