In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
stop_words=stopwords.words('english')
import string

from sklearn.model_selection import train_test_split,KFold
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb

from sklearn.cluster import KMeans

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

import warnings
warnings.filterwarnings('ignore')

import gc
gc.collect()

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
train['drug_approved_by_UIC'] = pd.to_datetime(train['drug_approved_by_UIC'],format='%d-%b-%y')
test['drug_approved_by_UIC'] = pd.to_datetime(test['drug_approved_by_UIC'],format='%d-%b-%y')

In [3]:
train.head()

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,8.022969
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,7.858458
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17,6.341969
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37,6.590176
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,2015-11-28,43,6.144782


In [4]:
test.head()

Unnamed: 0,patient_id,name_of_drug,review_by_patient,drug_approved_by_UIC,number_of_times_prescribed,use_case_for_drug,effectiveness_rating
0,163740,Mirtazapine,"""I&#039;ve tried a few antidepressants over th...",2012-02-28,22,Depression,10
1,39293,Contrave,"""Contrave combines drugs that were used for al...",2017-03-05,35,Weight Loss,9
2,208087,Zyclara,"""4 days in on first 2 weeks. Using on arms an...",2014-07-03,13,Keratosis,4
3,23295,Methadone,"""Ive been on Methadone for over ten years and ...",2016-10-18,21,Opiate Withdrawal,7
4,97013,Ambien,"""Ditto on rebound sleepless when discontinued....",2015-01-13,44,Insomnia,2


In [5]:
train['type']='train'
test['type']='test'
data = pd.concat([train,test])

data['total_prescribed'] = data.groupby(['name_of_drug'])['number_of_times_prescribed'].transform('sum')
data['total_pateint'] = data.groupby(['name_of_drug'])['patient_id'].transform('nunique')
data['avg_prescription'] = data['total_prescribed']/data['total_pateint']
data['prescribedByAverage'] = data['number_of_times_prescribed']/train['avg_prescription']
data['age'] = 2019-data['drug_approved_by_UIC'].dt.year
data['year'] = data['drug_approved_by_UIC'].dt.year

train = data[data['type']=='train']
test = data[data['type']=='train']

test.drop(['type'],axis=1,inplace=True)
train.drop(['type'],axis=1,inplace=True)

In [6]:
def text_stat(df):
    
    df['review_by_patient_len'] = df['review_by_patient'].apply(len)
    
    def word_count(x):
        review_by_patient = [t for t in x.split()]
        t = [a for a in review_by_patient if a.isalpha()]
        return len(t)
    df['word_count'] = df['review_by_patient'].apply(lambda x: word_count(x))
    
    df['stop_word_count'] = df['review_by_patient'].apply(lambda x: len([t for t in str(x).split() if t.lower() in stop_words]))

    df['caps_count'] = df['review_by_patient'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
    
    df['caps_ratio'] = df['caps_count'] / df['review_by_patient_len']
    
    return df
                                        
                                             
train = text_stat(train)
test = text_stat(test)

train['word_len_mean'] = train['review_by_patient_len']/(1+train['word_count'])
test['word_len_mean'] = test['review_by_patient_len']/(1+test['word_count'])

In [7]:
def text_clean(text):
    text = text.lower()
    text = re.sub(r'\n',' ', text) # Remove line breaks
    text = re.sub('\s+', ' ', text).strip() # Remove leading, trailing, and extra spaces
    text = re.sub('[~!@#$\'%&*()-?=]+','',text)
    text = [t for t in text.split() if len(t)>2]
    text= " ".join(text)
    return text

train['clean_review'] = train['review_by_patient'].apply(lambda x: text_clean(x))
test['clean_review'] = test['review_by_patient'].apply(lambda x: text_clean(x))

In [8]:
# Using Glove 50d for sentence vectors as feature of dimension 50
embeddings_index = {}
f = open(os.path.expanduser('~/Desktop/glove.6B/glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(50)
    return v / np.sqrt((v ** 2).sum())

train['embed'] = train['clean_review'].apply(lambda x: sent2vec(x))
test['embed'] = test['clean_review'].apply(lambda x: sent2vec(x))

temp = train.embed.apply(pd.Series)
temp['patient_id'] = train.patient_id.values
train = pd.merge(train,temp,on='patient_id',how='left')
train.drop(['embed'],axis=1,inplace=True)

temp = test.embed.apply(pd.Series)
temp['patient_id'] = test.patient_id.values
test = pd.merge(test,temp,on='patient_id',how='left')
test.drop(['embed'],axis=1,inplace=True)

In [9]:
df_drugname = train.groupby(['name_of_drug'])['base_score','effectiveness_rating','number_of_times_prescribed'].agg('mean').reset_index()
df_drugname.rename(columns={'base_score':'base_score_drugname_mean',\
                           'effectiveness_rating':'effectiveness_rating_drugname_mean',\
                           'number_of_times_prescribed':'number_of_times_prescribed_drugname_mean'},inplace=True)


df_druguse = train.groupby(['use_case_for_drug'])['base_score','effectiveness_rating','number_of_times_prescribed'].agg('mean').reset_index()
df_druguse.rename(columns={'base_score':'base_score_druguse_mean',\
                           'effectiveness_rating':'effectiveness_rating_druguse_mean',\
                           'number_of_times_prescribed':'number_of_times_prescribed_druguse_mean'},inplace=True)

df_drugnameuse = train.groupby(['name_of_drug','use_case_for_drug'])['base_score','effectiveness_rating','number_of_times_prescribed'].agg('mean').reset_index()
df_drugnameuse.rename(columns={'base_score':'base_score_drugnameuse_mean',\
                           'effectiveness_rating':'effectiveness_rating_drugnameuse_mean',\
                           'number_of_times_prescribed':'number_of_times_prescribed_drugnameuse_mean'},inplace=True)



train=pd.merge(train,df_drugname,on='name_of_drug',how='left')
test=pd.merge(test,df_drugname,on='name_of_drug',how='left')

train=pd.merge(train,df_druguse,on='use_case_for_drug',how='left')
test=pd.merge(test,df_druguse,on='use_case_for_drug',how='left')

train=pd.merge(train,df_drugnameuse,on=['name_of_drug','use_case_for_drug'],how='left')
test=pd.merge(test,df_drugnameuse,on=['name_of_drug','use_case_for_drug'],how='left')

In [10]:
df_drugname = train.groupby(['name_of_drug'])['base_score','effectiveness_rating','number_of_times_prescribed'].agg('std').reset_index()
df_drugname.rename(columns={'base_score':'base_score_drugname_std',\
                           'effectiveness_rating':'effectiveness_rating_drugname_std',\
                           'number_of_times_prescribed':'number_of_times_prescribed_drugname_std'},inplace=True)


df_druguse = train.groupby(['use_case_for_drug'])['base_score','effectiveness_rating','number_of_times_prescribed'].agg('std').reset_index()
df_druguse.rename(columns={'base_score':'base_score_druguse_std',\
                           'effectiveness_rating':'effectiveness_rating_druguse_std',\
                           'number_of_times_prescribed':'number_of_times_prescribed_druguse_std'},inplace=True)

df_drugnameuse = train.groupby(['name_of_drug','use_case_for_drug'])['base_score','effectiveness_rating','number_of_times_prescribed'].agg('std').reset_index()
df_drugnameuse.rename(columns={'base_score':'base_score_drugnameuse_std',\
                           'effectiveness_rating':'effectiveness_rating_drugnameuse_std',\
                           'number_of_times_prescribed':'number_of_times_prescribed_drugnameuse_std'},inplace=True)



train=pd.merge(train,df_drugname,on='name_of_drug',how='left')
test=pd.merge(test,df_drugname,on='name_of_drug',how='left')

train=pd.merge(train,df_druguse,on='use_case_for_drug',how='left')
test=pd.merge(test,df_druguse,on='use_case_for_drug',how='left')

train=pd.merge(train,df_drugnameuse,on=['name_of_drug','use_case_for_drug'],how='left')
test=pd.merge(test,df_drugnameuse,on=['name_of_drug','use_case_for_drug'],how='left')

In [11]:
# # dummies for drugname and druguse
# train['type']='train'
# test['type']='test'

# data = pd.concat([train,test])
# # data['count_drug'] = data.groupby(['name_of_drug'])['name_of_drug'].transform('count')
# # data['name_of_drug'] = np.where(data['count_drug']<=5,'others',data['name_of_drug'])
# # data.head()
# data = pd.get_dummies(data,columns=['name_of_drug'])

# train = data[data['type']=='train']
# test = data[data['type']=='test']

# train.drop(['type'],axis=1,inplace=True)
# test.drop(['type'],axis=1,inplace=True)

In [12]:
#Using K-means from Tf-IDF vectorization as new features

train['type']='train'
test['type']='test'
data = pd.concat([train,test])
data.shape,train.shape,test.shape

text = []
for t in data.clean_review:
    text.append(t)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text)

k=5
model = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=1)
model.fit(X)

# cg['tf_idf_vector'] = cg['text_clean'].apply(lambda x: vectorizer.transform([x]))
data['tf_idf_cluster'] = data['clean_review'].apply(lambda x: model.predict(vectorizer.transform([x]))[0])
data=pd.get_dummies(data,columns=['tf_idf_cluster'])

train=data[data['type']=='train']
test=data[data['type']=='test']

In [45]:
cols = ['base_score','drug_approved_by_UIC','name_of_drug','use_case_for_drug','review_by_patient', 'clean_review','patient_id',
        'review_by_patient_len','stop_word_count','punct_count','word_count','caps_count','caps_ratio','age',
        'base_score_drugname_mean','effectiveness_rating_drugname_mean','number_of_times_prescribed_drugname_mean',
        'base_score_drugname_std','effectiveness_rating_drugname_std','number_of_times_prescribed_drugname_std',
        'base_score_druguse_mean','effectiveness_rating_druguse_mean','number_of_times_prescribed_druguse_mean',
        'base_score_druguse_std','effectiveness_rating_druguse_std','number_of_times_prescribed_druguse_std',
        'base_score_drugnameuse_mean','effectiveness_rating_drugnameuse_mean','number_of_times_prescribed_drugnameuse_mean',
        'base_score_drugnameuse_std','effectiveness_rating_drugnameuse_std','number_of_times_prescribed_drugnameuse_std',
        'total_prescribed','total_pateint','avg_prescription','word_len_mean','prescribedByAverage',
       'type']


X = train[[col for col in train.columns if col not in cols]]

y = train.base_score
test_ = test[X.columns]


xtrain, xvalid, ytrain, yvalid = train_test_split(X, y,
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)


lgb = LGBMRegressor(
    num_leaves=20,
    max_depth=10,
    learning_rate=0.3,
    n_estimators=5000,
    min_child_samples=20,
    colsample_bytree=1,
    n_jobs=-1)


lgb.fit(xtrain,ytrain)

print (100*(1-np.sqrt(mean_squared_error(yvalid, lgb.predict(xvalid)))))
predictions = lgb.predict(test_)

92.81143246648249


In [46]:
X.head()

Unnamed: 0,effectiveness_rating,number_of_times_prescribed
0,9,27
1,8,192
2,5,17
3,9,37
4,2,43


In [15]:
temp=test.copy()
temp.index=test['patient_id']
temp['base_score']=predictions
temp=temp[['base_score']]
temp.to_csv("try16.csv")
temp.head()

Unnamed: 0_level_0,base_score
patient_id,Unnamed: 1_level_1
206461,8.023459
95260,7.857722
92703,6.337311
35696,6.590613
155963,6.151426


In [16]:
# from keras import *
# from keras.layers import Dense,Dropout,BatchNormalization,Activation
# from keras.wrappers.scikit_learn import KerasRegressor
# from sklearn.model_selection import cross_val_score
# from keras.callbacks import EarlyStopping,ModelCheckpoint
# from keras import backend as K



# train_stats = X.describe()
# train_stats = train_stats.transpose()

# def norm(x):
#   return (x - train_stats['mean']) / train_stats['std']
# normed_train_data = norm(X)

# def root_mean_squared_error(y_true, y_pred):
#         return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))


# def keras_model(): 
#     model = Sequential()

#     model.add(Dense(100, input_shape=[len(X.columns)], activation='relu'))
# #     model.add(Dropout(0.2))
#     model.add(BatchNormalization())

#     model.add(Dense(200, activation='relu'))
# #     model.add(Dropout(0.3))
#     model.add(BatchNormalization())

#     model.add(Dense(1,activation='linear'))

#     model.compile(loss=root_mean_squared_error, optimizer='rmsprop')
#     return model

# model = keras_model()
# history = model.fit(normed_train_data, y,
#                     epochs=1000, 
#                     batch_size=100,
#                     verbose=1)