In [None]:
# Libraries
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from xgboost import XGBClassifier
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import regex as re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [4]:
#Load the dataset
df = pd.read_csv('medicine_review.csv')

In [5]:
df.head(10)

Unnamed: 0,review,label
0,"Amazing results, I feel so much better now.",negative
1,Best medicine I have ever used!,negative
2,The side effects were unbearable.,positive
3,Highly recommend this to anyone in need.,negative
4,Didn't notice any improvement in my health.,negative
5,"I would never take this again, terrible experi...",positive
6,"Waste of money, did nothing for my condition.",positive
7,"Not worth the hype, didn’t work for me.",negative
8,Very effective and quick results!,negative
9,"Amazing results, I feel so much better now.",positive


In [7]:
df.review[0]

'Amazing results, I feel so much better now.'

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  15000 non-null  object
 1   label   15000 non-null  object
dtypes: object(2)
memory usage: 234.5+ KB


### Text Data Cleaning

In [None]:
import string
nltk.download("stopwords")
stop_words = set(stopwords.words('english'))
exclude = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
df.columns

Index(['review', 'label'], dtype='object')

In [12]:
# removing the punctuation
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

In [13]:
# removing stopwords
def removing_stopwords(text):
    data = [word for word in text.split() if word not in stop_words]
    return " ".join(data)

In [15]:
# reducing a word to its root form, or lemma, by removing prefixes and suffixes
# nltk.download('wordnet')
Lemma = nltk.WordNetLemmatizer()

def apply_lemma(text):
    data = [Lemma.lemmatize(word, pos='v') for word in text.split()]
    return " ".join(data)


In [16]:
def text_data_cleaning(data):
    data['original_review'] = data['review'].copy()
    data['review'] = data['review'].str.lower()
    data['review'] = data['review'].apply(remove_punc)
    data['review'] = data['review'].apply(removing_stopwords)
    data['review'] = data['review'].apply(apply_lemma)
    return data

In [17]:
df = text_data_cleaning(df)

In [19]:
df

Unnamed: 0,review,label,original_review
0,amaze result feel much better,negative,"Amazing results, I feel so much better now."
1,best medicine ever use,negative,Best medicine I have ever used!
2,side effect unbearable,positive,The side effects were unbearable.
3,highly recommend anyone need,negative,Highly recommend this to anyone in need.
4,didnt notice improvement health,negative,Didn't notice any improvement in my health.
...,...,...,...
14995,amaze result feel much better,negative,"Amazing results, I feel so much better now."
14996,amaze result feel much better,positive,"Amazing results, I feel so much better now."
14997,didnt notice improvement health,negative,Didn't notice any improvement in my health.
14998,i’m extremely satisfy result,positive,I’m extremely satisfied with the results.


In [20]:
df['original_review'][0]

'Amazing results, I feel so much better now.'

In [21]:
df['review'][0]

'amaze result feel much better'

In [22]:
df['label'] = df['label'].map({'negative':0, 'positive':1})

In [23]:
df.label.value_counts()

label
0    7571
1    7429
Name: count, dtype: int64

In [24]:
df

Unnamed: 0,review,label,original_review
0,amaze result feel much better,0,"Amazing results, I feel so much better now."
1,best medicine ever use,0,Best medicine I have ever used!
2,side effect unbearable,1,The side effects were unbearable.
3,highly recommend anyone need,0,Highly recommend this to anyone in need.
4,didnt notice improvement health,0,Didn't notice any improvement in my health.
...,...,...,...
14995,amaze result feel much better,0,"Amazing results, I feel so much better now."
14996,amaze result feel much better,1,"Amazing results, I feel so much better now."
14997,didnt notice improvement health,0,Didn't notice any improvement in my health.
14998,i’m extremely satisfy result,1,I’m extremely satisfied with the results.


In [25]:

from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features_fit(df):
    tfidf = TfidfVectorizer(min_df=0.01, max_df=0.1)
    tfidf_matrix = tfidf.fit_transform(df['review'])  
    return tfidf, tfidf_matrix.toarray()


def tfidf_features_transform(tfidf, df):
    tfidf_matrix = tfidf.transform(df['review'])  
    return tfidf_matrix.toarray()


In [26]:
df.reset_index(drop=True,inplace=True)
data_train =df[0:10000]
data_test = df[10000:]

tfidf,data_train_matrix = tfidf_features_fit(data_train)
features = tfidf.get_feature_names_out()

In [27]:
data_train

Unnamed: 0,review,label,original_review
0,amaze result feel much better,0,"Amazing results, I feel so much better now."
1,best medicine ever use,0,Best medicine I have ever used!
2,side effect unbearable,1,The side effects were unbearable.
3,highly recommend anyone need,0,Highly recommend this to anyone in need.
4,didnt notice improvement health,0,Didn't notice any improvement in my health.
...,...,...,...
9995,work well symptoms,1,It worked well for my symptoms.
9996,side effect unbearable,0,The side effects were unbearable.
9997,help,0,This did not help me at all.
9998,experience severe side effect take,1,I experienced severe side effects after taking...


In [28]:
data_test

Unnamed: 0,review,label,original_review
10000,would never take terrible experience,1,"I would never take this again, terrible experi..."
10001,medicine work wonder,1,This medicine worked wonders for me.
10002,amaze result feel much better,0,"Amazing results, I feel so much better now."
10003,i’m extremely satisfy result,0,I’m extremely satisfied with the results.
10004,experience severe side effect take,0,I experienced severe side effects after taking...
...,...,...,...
14995,amaze result feel much better,0,"Amazing results, I feel so much better now."
14996,amaze result feel much better,1,"Amazing results, I feel so much better now."
14997,didnt notice improvement health,0,Didn't notice any improvement in my health.
14998,i’m extremely satisfy result,1,I’m extremely satisfied with the results.


In [29]:
features

array(['amaze', 'anyone', 'best', 'better', 'condition', 'didn', 'didnt',
       'ever', 'extremely', 'feel', 'health', 'help', 'highly', 'hop',
       'hype', 'improvement', 'money', 'much', 'need', 'never', 'nothing',
       'notice', 'okay', 'quick', 'recommend', 'satisfy', 'severe',
       'symptoms', 'terrible', 'unbearable', 'use', 'waste', 'well',
       'wonder', 'worth', 'would'], dtype=object)

In [30]:
len(features)

36

In [31]:
data_train_matrix[0]

array([0.5, 0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])

In [32]:
data_test_matrix = tfidf_features_transform(tfidf, data_test)

In [33]:
from sklearn.model_selection import train_test_split as sk_train_test_split

def train_test_split(data):
    y = data['label']
    X = data.drop(columns=['label'])
    
    # Using sklearn's train_test_split for simplicity
    X_train, X_test, y_train, y_test = sk_train_test_split(X, y, test_size=0.2, random_state=0)
    
    return X_train, X_test, y_train, y_test

x_train, x_test, y_train, y_test = train_test_split(data_train)


### Prepare data for model training

In [34]:
def train_test_split(data_train,
                    data_test,\
                    data_train_matrix,
                    data_test_matrix):
    
    y_train = data_train['label']
    y_test = data_test['label']

    x_train = data_train_matrix.copy()
    x_test =data_test_matrix.copy()
    
    return x_train, x_test, y_train, y_test

In [35]:
x_train, x_test, y_train, y_test = train_test_split(data_train, data_test, data_train_matrix, data_test_matrix)

### Fit and evaluate the model

In [36]:
def fit_and_evaluate_model(x_train, x_test, y_train, y_test):

    xgb = XGBClassifier(random_state=0)
    xgb.fit(x_train, y_train)
    xgb_predict = xgb.predict(x_test)
    xgb_conf_matrix = confusion_matrix(y_test, xgb_predict)
    xgb_acc_score =accuracy_score(y_test, xgb_predict)
    print("confusion matrix")
    print(xgb_conf_matrix)
    print("\n")
    print("Accuracy of XGBoost:",xgb_acc_score*100,'\n')
    print(classification_report(y_test,xgb_predict))
    return xgb

In [38]:
model =fit_and_evaluate_model(x_train, x_test, y_train, y_test)

confusion matrix
[[1476 1012]
 [1532  980]]


Accuracy of XGBoost: 49.120000000000005 

              precision    recall  f1-score   support

           0       0.49      0.59      0.54      2488
           1       0.49      0.39      0.44      2512

    accuracy                           0.49      5000
   macro avg       0.49      0.49      0.49      5000
weighted avg       0.49      0.49      0.49      5000



In [39]:
from sklearn.model_selection import GridSearchCV 
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'alpha': [0, 0.1],
    'lambda': [1, 10]
}

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [40]:
# Best parameters from GridSearchCV
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the model with the best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Print classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best parameters found:  {'alpha': 0, 'colsample_bytree': 0.8, 'lambda': 10, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.93      0.65      2488
           1       0.50      0.07      0.12      2512

    accuracy                           0.50      5000
   macro avg       0.50      0.50      0.38      5000
weighted avg       0.50      0.50      0.38      5000



### get feature importance for xgboost

In [41]:
def get_importance_features(model, features):
    importances = pd.DataFrame(model.feature_importances_)
    importances['features'] = features
    importances.columns=['importance','feature']
    importances.sort_values(by='importance',ascending=False,inplace=True)
    return importances

In [42]:
features_importance = get_importance_features(model, features)

In [43]:
features_importance.head(10)

Unnamed: 0,importance,feature
8,0.357434,extremely
33,0.237448,wonder
0,0.096129,amaze
6,0.087579,didnt
5,0.076956,didn
4,0.075835,condition
27,0.02115,symptoms
19,0.018438,never
11,0.010804,help
2,0.009053,best


In [44]:
features_importance.tail(10)

Unnamed: 0,importance,feature
22,0.0,okay
20,0.0,nothing
24,0.0,recommend
25,0.0,satisfy
30,0.0,use
28,0.0,terrible
31,0.0,waste
32,0.0,well
34,0.0,worth
35,0.0,would
