In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
df=pd.read_csv('drugsComTrain_raw.tsv',sep='\t')
df2=pd.read_csv('drugsComTest_raw.tsv',sep='\t')
df = df.append(df2, ignore_index=True, sort=False)

In [3]:
df['date']=df['date'].astype('datetime64[ns]')
df = df.applymap(lambda s: s.lower() if type(s) == str else s) #Tüm karakterler küçültülür
df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
df_temp=df[df.condition.str.contains("</span>") == True]
df_temp.condition='NAN'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [5]:
df.update(df_temp)

In [6]:
df.condition=df.condition.apply(lambda x: np.nan if x=='NAN' else x)

In [7]:
drug_condition_dict=dict()
all_drugName=df['drugName'].unique().tolist()

In [8]:
# sözlüğün key değerleri ilaç isimleri, value değerleri ise bu ilacın sahip olduğu unique conditionlar nan değerler olduğu
# için nan değerleri silecek şekilde hesaplamalar yapıldı
for i in all_drugName:
    drug_condition_dict[i]=df[df['drugName']==i]['condition'].unique()\
    [pd.notna(np.array(df[df['drugName']==i]['condition'].unique(),dtype=object))]

In [9]:
df.shape

(215063, 6)

## conditionı olmayan 17 tane ilaç veri setinden silinecek

In [10]:
non_condition=[k for k,v in drug_condition_dict.items() if len(v)==0]

In [11]:
index_non_condition=list()

In [12]:
for i in non_condition:
    index_non_condition.append(df[df['drugName']==i].index[0])

In [13]:
df.drop(index_non_condition,inplace=True)

In [14]:
df.shape

(215046, 6)

## conditionı 1 olan ilaçların bütün condition değerleri aynı olacak

In [15]:
aranacak=df[df["condition"].isna()==True][['drugName','condition','review']]
aranacak['review']=aranacak['review'].str.strip('"')

In [16]:
onecon_drugs = [k for k,v in drug_condition_dict.items() if len(v)==1]

In [17]:
for i in aranacak.iloc:
    if i[0] in onecon_drugs and pd.isna(i[1]):
        i[1] = drug_condition_dict[i[0]][0]

In [18]:
df.update(aranacak)

## conditionu boş olan değerleri review'dan doldurma

In [19]:
import re
from gensim.parsing.preprocessing import remove_stopwords
for i in aranacak.iloc:
    if pd.isna(i[1]):
        cons = drug_condition_dict[i[0]]
        review = re.sub(r'[^\w\s]','',i[2])
        review = remove_stopwords(review)
        for k in cons:
            if k in review:
                if pd.isna(i[1]):
                    i[1] = k

In [20]:
df.update(aranacak)

In [21]:
df.shape

(215046, 6)

## Data Cleaning

In [22]:
import nltk
from nltk.corpus import stopwords
import textblob
from textblob import Word

sw=stopwords.words("english")

In [23]:
df = df.dropna()

In [24]:
df.shape

(213728, 6)

In [25]:
def data_cleaning(df):
    df["review"]=df["review"].str.replace("[^\w\s]","") # noktalama
    df["review"]=df["review"].str.replace("\d","") # sayıların silinmesi
    df["review"]=df["review"].apply(lambda x: " ".join(x for x in x.split() if x not in sw)) # stopwords
    df["review"]=df["review"].apply(lambda x: " ".join(Word(i).lemmatize() for i in x.split())) # lemmatization
    return df['review']

In [26]:
df['review']=data_cleaning(df)

  df["review"]=df["review"].str.replace("[^\w\s]","") # noktalama
  df["review"]=df["review"].str.replace("\d","") # sayıların silinmesi


In [27]:
df.reset_index(inplace=True,drop=True)

In [28]:
# bağımlı değişkeni kategorize etme
df['label']=df['rating'].apply(lambda x: 0 if 0<x<6 else 1)

## Feature Extraction

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
def feature_extraction(df,choice):
    if choice == 1:
        print("TF-IDF yöntemi seçildi")
        vectorizer = TfidfVectorizer(lowercase=False,ngram_range=(1, 2),dtype=np.byte,max_features=100000)
        X = vectorizer.fit_transform(df['review'].tolist())
        vocab = vectorizer.get_feature_names()
        return X,vocab
    if choice == 2:
        print("BoW yöntemi seçildi")
        vectorizer=CountVectorizer(lowercase=False,dtype=np.byte,ngram_range=(1,2),max_features=100000)
        X=vectorizer.fit_transform(df['review'].tolist())
        vocab = vectorizer.get_feature_names()
        return X,vocab

In [77]:
X,vocab=feature_extraction(df,2)

BoW yöntemi seçildi


## One-Hot Encoding for unique condition

In [78]:
import scipy
from scipy import hstack
from scipy.sparse import csc_matrix
import scipy.sparse as sp

In [79]:
ohe=pd.get_dummies(df['condition'])

In [80]:
ohe=scipy.sparse.csr_matrix(ohe.values)

In [81]:
X = sp.hstack((X,ohe))

In [82]:
X.shape

(213728, 100836)

In [83]:
ohe=pd.get_dummies(df['drugName'])

In [84]:
ohe=scipy.sparse.csr_matrix(ohe.values)

In [85]:
X = sp.hstack((X,ohe))

In [86]:
X.shape

(213728, 104490)

## Select Percentile

from sklearn.feature_selection import SelectPercentile

# Assume some matrix X and labels y
# 10 means only include the 10% best features
selector = SelectPercentile(percentile=10)

# A feature space with only 10% of the features
X_new = selector.fit_transform(X, df['label'])

# See the scores for all features
X_new

X=X.astype('int8')

## Truncated SVD

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
a=svd.fit_transform(X)

## Machine Learning Models

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
X_train,X_test,y_train,y_test = train_test_split(X,df['label'],random_state=1,test_size=0.25)

In [89]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, roc_auc_score,f1_score,precision_score,recall_score,precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, GridSearchCV
from numpy import mean
from math import sqrt

In [90]:
y_train.value_counts()

1    112305
0     47991
Name: label, dtype: int64

In [91]:
y_test.value_counts()

1    37580
0    15852
Name: label, dtype: int64

## SMOTE

In [46]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy=0.7)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [47]:
y_res.value_counts()

1    112305
0     78613
Name: label, dtype: int64

## ADASYN 

In [92]:
from imblearn.over_sampling import ADASYN

ada = ADASYN(sampling_strategy=0.7)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

In [93]:
y_train_ada.value_counts()

1    112305
0     74507
Name: label, dtype: int64

## SMOTE-TOMEK

In [94]:
from imblearn.combine import SMOTETomek

smtom = SMOTETomek(sampling_strategy=0.7)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

In [95]:
y_train_smtom.value_counts()

1    112278
0     78586
Name: label, dtype: int64

# Borderline SMOTE

In [96]:
from imblearn.over_sampling import BorderlineSMOTE

bordsmo = BorderlineSMOTE(sampling_strategy=0.7)
X_train_bordsmo, y_train_bordsmo = bordsmo.fit_resample(X_train, y_train)

In [97]:
y_train_bordsmo.value_counts()

1    112305
0     78613
Name: label, dtype: int64

# KMeans SMOTE

from imblearn.over_sampling import KMeansSMOTE

kmeanssmote = KMeansSMOTE(sampling_strategy=0.7,)
X_train_ksmo, y_train_ksmo = kmeanssmote.fit_resample(X_train, y_train)

y_train_ksmo.value_counts()

# SVM SMOTE

from imblearn.over_sampling import SVMSMOTE

svmsmo = SVMSMOTE(sampling_strategy=0.7)
X_train_ssmo, y_train_ssmo = svmsmo.fit_resample(X_train, y_train)

y_train_ssmo.value_counts()

In [None]:
scores=dict()

## Logistic Regression

In [None]:
def LogisticRegression(X_train, y_train,choice=None):
    from sklearn.linear_model import LogisticRegression
    lr=LogisticRegression()
    model_lr=lr.fit(X_train,y_train)
    prediction_lr=model_lr.predict(X_test)
    print("Accuracy Score on traning data: ",lr.score(X_train,y_train))
    print("Accuracy Score on test data: ",lr.score(X_test,y_test))
    cv_score=cross_val_score(lr, X, df['label'], cv=5, scoring='accuracy',n_jobs=-1).mean()
    print('Logistic Regression Cross Validation (K=5): {} \n'.format(cv_score));

    accuracy=accuracy_score(y_test, prediction_lr)
    roc_auc=roc_auc_score(y_test, prediction_lr)
    print(confusion_matrix(y_test, prediction_lr))
    print(classification_report(y_test, prediction_lr))
    print("Accuracy Score: ",accuracy)
    print("Auc Score: ",roc_auc)
    if choice ==1:
        scores['Klasik SMOTE']=[cv_score,accuracy]
        return scores
    elif choice==2:
        scores['ADASYN']=[cv_score,accuracy,roc_auc,mse]
        return scores
    elif choice==3:
        scores['SMOTE-TOMEK']=[cv_score,accuracy,roc_auc,mse]
        return scores
    elif choice==4:
        scores['Borderline SMOTE']=[cv_score,accuracy,roc_auc,mse]
        return scores
    elif choice==5:
        scores['KMeans SMOTE']=[cv_score,accuracy,roc_auc,mse]
        return scores
    elif choice==6:
        scores['SVM SMOTE']=[cv_score,accuracy,roc_auc,mse]
        return scores
    else:
        scores['NO SMOTE']=[cv_score,accuracy,roc_auc,mse]
        return scores

In [None]:
LogisticRegression(X_res, y_res,1)

p=[{'max_iter':range(100,500,100),'tol':[0.0001,0.001,0.01,0.1,0.5],\
    'C':[0.5,1.0,3,7,10],\
    'solver':['newton-cg','lbfgs','liblinear','sag','saga']}]
gs=GridSearchCV(estimator=lr,param_grid=p,scoring='accuracy',cv=5,n_jobs=-1,verbose=1,\
   return_train_score=True)
grid_search=gs.fit(X_train,y_train)
best_score=grid_search.best_score_
best_parameters=grid_search.best_params_
print('Logistic Regression CV=5 İle En İyi Sonuç: ',best_score)
print('Logistic Regression En İyi Parametreler: ',best_parameters)

## Farklı scalerları model üzerinde test etmek

In [57]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [98]:
def scalers(X_train,choice = None):
    if choice==1: # hata
        mas=MaxAbsScaler()
        scaled_X_train=mas.fit_transform(X_train)
        scaled_X_test=mas.transform(X_test)
        return scaled_X_train, scaled_X_test
    elif choice==2: # en iyisi biraz uzun sürüyor 10-15 dk
        rs=RobustScaler(with_centering=False)
        scaled_X_train=rs.fit_transform(X_train)
        scaled_X_test=rs.transform(X_test)
        return scaled_X_train, scaled_X_test
    elif choice==3: # kötü sonuç veriyor (çok kısa)
        ns=Normalizer()
        scaled_X_train=ns.fit_transform(X_train)
        scaled_X_test=ns.transform(X_test)
        return scaled_X_train, scaled_X_test
    elif choice==4:
        qt=QuantileTransformer() # hata
        scaled_X_train=qt.fit_transform(X_train)
        scaled_X_test=qt.transform(X_test)
        return scaled_X_train, scaled_X_test
    else: # kısa sürüyor RobustScaler kadar iyi değil ama kötü de değil
        sc=StandardScaler(with_mean=False)
        scaled_X_train=sc.fit_transform(X_train)
        scaled_X_test=sc.transform(X_test)
        return scaled_X_train, scaled_X_test

In [None]:
scaled_X_train,scaled_X_test=scalers(X_res)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
def NaiveBayes(y_train):
    # MultinomialNB
    # Tahmin edilecek sınıf nominal ise (çok sınıflı olabilir) Multinomial navie bayes kullanılabilir.
    mnb=MultinomialNB()
    mnb.fit(scaled_X_train,y_train)
    y_pred=mnb.predict(scaled_X_test)
    print("Accuracy Score on traning data",mnb.score(scaled_X_train,y_train))
    print("Accuracy Score on test data",mnb.score(scaled_X_test,y_test))
    print('Naive Bayes MultinomialNB Çekirdeği: {} \n'.format(cross_val_score(mnb, X, df['label'], cv=5, scoring='accuracy').mean()));
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    roc_auc=roc_auc_score(y_test, y_pred)
    print("Accuracy Score: ",accuracy_score(y_test, y_pred))
    print("Auc Score: ",roc_auc)

In [None]:
NaiveBayes(y_res)

## Support Vector Classifier

In [None]:
scaled_X_train,scaled_X_test=scalers(X_res,1)
# TF-IDF için QuantileTransformer
# BoW için MaxAbsScaler

In [None]:
from sklearn import svm

In [None]:
# {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
# gamma{‘scale’, ‘auto’} or float, default=’scale’

In [None]:
def SVM(y_train):
    svc = svm.LinearSVC()
    svc.fit(scaled_X_train,y_train)
    y_pred=svc.predict(scaled_X_test)
    print("Accuracy Score on traning data",svc.score(scaled_X_train,y_train))
    print("Accuracy Score on test data",svc.score(scaled_X_test,y_test))
    print('Linear Support Vector Classifier: {} \n'.format(cross_val_score(svc, X, df['label'], cv=5, scoring='accuracy').mean()));
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("Accuracy Score: ",accuracy_score(y_test, y_pred))
    print("Auc Score: ",roc_auc_score(y_test, y_pred))

In [None]:
SVM(y_res)

## Stochastic Gradient Descent Classifier (SGDClassifier)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterGrid
import parfit.parfit as pf

In [None]:
scores=dict()
def SGD(X_train,y_train,X_test,y_test,choice=None):
    grid = {
        'alpha': [1e-5,1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1], # learning rate
        'max_iter': range(300,1100,200), # number of epochs
        'loss': ['log'], # logistic regression,
        'penalty': ['l2','elasticnet'],
        'n_jobs': [-1]
    }
    paramGrid = ParameterGrid(grid)

    bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier, paramGrid,
               X_train, y_train, X_test, y_test, 
               metric = roc_auc_score,
               scoreLabel = "ROC-AUC EĞRİSİ SKORU")
    if choice ==1:
        scores['Klasik SMOTE']=[bestModel,bestScore]
        return scores
    elif choice==2:
        scores['ADASYN']=[bestModel,bestScore]
        return scores
    elif choice==3:
        scores['SMOTE-TOMEK']=[bestModel,bestScore]
        return scores
    elif choice==4:
        scores['Borderline SMOTE']=[bestModel,bestScore]
        return scores
    elif choice==5:
        scores['KMeans SMOTE']=[bestModel,bestScore]
        return scores
    elif choice==6:
        scores['SVM SMOTE']=[bestModel,bestScore]
        return scores
    else:
        scores['NO SMOTE']=[bestModel,bestScore]
        return scores,bestModel,bestScore
    print(bestModel,bestScore)

In [None]:
SGD_scores,model,b_score=SGD(X_res,y_res,X_test,y_test)

In [None]:
SGD_scores

In [None]:
print(classification_report(y_test,model.predict(X_test)))

In [None]:
b_score

## Perceptron

In [72]:
from sklearn.linear_model import Perceptron

In [73]:
#scaled_X_train,scaled_X_test=scalers(X_train_bordsmo,1) # TF-IDF MaxAbsScaler

In [99]:
def PerceptronClassifier(X_train, y_train):
    pt=Perceptron()
    pt_model=pt.fit(X_train,y_train)
    prediction_pt=pt_model.predict(scaled_X_test)
    print("Accuracy Score on traning data: ",pt.score(X_train,y_train))
    print("Accuracy Score on test data: ",pt.score(scaled_X_test,y_test))
    cv_score=cross_val_score(pt, X, df['label'], cv=5, scoring='accuracy',n_jobs=-1).mean()
    print('Perceptron Cross Validation (K=5): {} \n'.format(cv_score));

    accuracy=accuracy_score(y_test, prediction_pt)
    roc_auc=roc_auc_score(y_test, prediction_pt)
    print(confusion_matrix(y_test, prediction_pt))
    print(classification_report(y_test, prediction_pt))
    print("Accuracy Score: ",accuracy)
    print("Auc Score: ",roc_auc)

In [108]:
PerceptronClassifier(X_train_bordsmo, y_train_bordsmo)

Accuracy Score on traning data:  0.9965796834242974
Accuracy Score on test data:  0.8535896092229376
Perceptron Cross Validation (K=5): 0.9243009940953911 

[[13204  2648]
 [ 5175 32405]]
              precision    recall  f1-score   support

           0       0.72      0.83      0.77     15852
           1       0.92      0.86      0.89     37580

    accuracy                           0.85     53432
   macro avg       0.82      0.85      0.83     53432
weighted avg       0.86      0.85      0.86     53432

Accuracy Score:  0.8535896092229376
Auc Score:  0.8476243027407457


## RidgeClassifier

In [109]:
from sklearn.linear_model import RidgeClassifier

In [110]:
def Ridge(X_train, y_train):
    rc=RidgeClassifier()
    rc_model=rc.fit(X_train,y_train)
    prediction_rc=rc_model.predict(X_test)
    print("Accuracy Score on traning data: ",rc.score(X_train,y_train))
    print("Accuracy Score on test data: ",rc.score(X_test,y_test))
    cv_score=cross_val_score(rc, X, df['label'], cv=5, scoring='accuracy',n_jobs=-1).mean()
    print('RidgeClassifier Cross Validation (K=5): {} \n'.format(cv_score));

    accuracy=accuracy_score(y_test, prediction_rc)
    roc_auc=roc_auc_score(y_test, prediction_rc)
    print(confusion_matrix(y_test, prediction_rc))
    print(classification_report(y_test, prediction_rc))
    print("Accuracy Score: ",accuracy)
    print("Auc Score: ",roc_auc)

In [111]:
Ridge(X_train_bordsmo, y_train_bordsmo)

Accuracy Score on traning data:  0.9921117966875832
Accuracy Score on test data:  0.8805397514597993
RidgeClassifier Cross Validation (K=5): 0.8931258379965952 

[[13317  2535]
 [ 3848 33732]]
              precision    recall  f1-score   support

           0       0.78      0.84      0.81     15852
           1       0.93      0.90      0.91     37580

    accuracy                           0.88     53432
   macro avg       0.85      0.87      0.86     53432
weighted avg       0.88      0.88      0.88     53432

Accuracy Score:  0.8805397514597993
Auc Score:  0.8688441896751982


## Klasik SGDClassifier

In [None]:
def ClassicSGD(X_train, y_train):
    sgd=SGDClassifier()
    sgd_model=sgd.fit(X_train,y_train)
    prediction_sgd=sgd_model.predict(X_test)
    print("Accuracy Score on traning data: ",sgd.score(X_train,y_train))
    print("Accuracy Score on test data: ",sgd.score(X_test,y_test))
    cv_score=cross_val_score(sgd, X, df['label'], cv=5, scoring='accuracy',n_jobs=-1).mean()
    print('RidgeClassifier Cross Validation (K=5): {} \n'.format(cv_score));

    accuracy=accuracy_score(y_test, prediction_sgd)
    roc_auc=roc_auc_score(y_test, prediction_sgd)
    print(confusion_matrix(y_test, prediction_sgd))
    print(classification_report(y_test, prediction_sgd))
    print("Accuracy Score: ",accuracy)
    print("Auc Score: ",roc_auc)

In [None]:
ClassicSGD(X_res,y_res)