In [203]:
#Importing the libraries
import nltk
from nltk.corpus import stopwords
from textblob import Word
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [158]:
#Importing the train and test Datasets
train_dataset=pd.read_csv(r'C:\Users\Sonu\Downloads\datasetc062cf9\dataset\train.csv',encoding='ISO-8859-1')
test_dataset=pd.read_csv(r'C:\Users\Sonu\Downloads\datasetc062cf9\dataset\test.csv',encoding= 'ISO-8859-1')

In [159]:
#Taking a look at the number of rows and columns of the datasets
train_dataset.shape

(1157, 9)

In [160]:
test_dataset.shape

(571, 10)

In [161]:
#Exploring the column namespresent in the datasets
train_dataset.columns

Index(['Source', 'Host', 'Link', 'Date(ET)', 'Time(ET)', 'time(GMT)', 'Title',
       'TRANS_CONV_TEXT', 'Patient_Tag'],
      dtype='object')

In [162]:
test_dataset.columns

Index(['Index', 'Source', 'Host', 'Link', 'Date(ET)', 'Time(ET)', 'time(GMT)',
       'Title', 'TRANS_CONV_TEXT', 'Unnamed: 9'],
      dtype='object')

In [163]:
test_dataset[['Index','Unnamed: 9']].head()

Unnamed: 0,Index,Unnamed: 9
0,1,
1,2,
2,3,
3,4,
4,5,


In [164]:
#Dropping the not so required columns 'Index' and 'Unnamed: 9' as they are not giving us any specific relevant information
test_dataset.drop(columns= ['Index','Unnamed: 9'],inplace=True)

In [165]:
test_dataset.columns

Index(['Source', 'Host', 'Link', 'Date(ET)', 'Time(ET)', 'time(GMT)', 'Title',
       'TRANS_CONV_TEXT'],
      dtype='object')

In [166]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 9 columns):
Source             1157 non-null object
Host               1098 non-null object
Link               1157 non-null object
Date(ET)           1157 non-null object
Time(ET)           1157 non-null object
time(GMT)          996 non-null object
Title              941 non-null object
TRANS_CONV_TEXT    1156 non-null object
Patient_Tag        1157 non-null int64
dtypes: int64(1), object(8)
memory usage: 81.4+ KB


In [167]:
train_dataset.isnull().sum()

Source               0
Host                59
Link                 0
Date(ET)             0
Time(ET)             0
time(GMT)          161
Title              216
TRANS_CONV_TEXT      1
Patient_Tag          0
dtype: int64

In [168]:
#For the single missing value in TRANS_CONV_TEXT, we apply imputation by mode to fill the value in
train_dataset['TRANS_CONV_TEXT'] = train_dataset['TRANS_CONV_TEXT'].fillna(train_dataset['TRANS_CONV_TEXT'].mode()[0])

In [169]:
#Verifying no missing values in TRANS_CONV_TEXT
train_dataset.isnull().sum()

Source               0
Host                59
Link                 0
Date(ET)             0
Time(ET)             0
time(GMT)          161
Title              216
TRANS_CONV_TEXT      0
Patient_Tag          0
dtype: int64

In [170]:
#Extracting the values of  TRANS_CONV_TEXT and Patient_Tag and separating them out in a Dataframe
New_train= train_dataset[['TRANS_CONV_TEXT','Patient_Tag']]
New_test= test_dataset[['TRANS_CONV_TEXT']]

In [171]:
New_train.shape

(1157, 2)

In [172]:
New_test.shape

(571, 1)

In [173]:
New_train['TRANS_CONV_TEXT'].apply(len).describe()

count     1157.000000
mean      1849.922213
std       2324.023070
min          2.000000
25%        379.000000
50%        963.000000
75%       2441.000000
max      16000.000000
Name: TRANS_CONV_TEXT, dtype: float64

In [174]:
New_test['TRANS_CONV_TEXT'].apply(len).describe()

count      571.000000
mean      1851.010508
std       2399.454322
min          3.000000
25%        391.000000
50%        971.000000
75%       2530.000000
max      16000.000000
Name: TRANS_CONV_TEXT, dtype: float64

In [175]:
#Exploring distribution of Patient Tag
New_train['Patient_Tag'].value_counts()

0    917
1    240
Name: Patient_Tag, dtype: int64

In [217]:
#Splitting the train into another train and validation sets in 70:30 ratio respectively

X_train, X_valid, y_train, y_valid = train_test_split(New_train['TRANS_CONV_TEXT'], New_train['Patient_Tag'], \
                                                    test_size=0.3, random_state=42)

In [218]:
#Converting to lower case

New_train['TRANS_CONV_TEXT']=New_train['TRANS_CONV_TEXT'].str.lower()

In [178]:
#Lemmatization

New_train['TRANS_CONV_TEXT']=New_train['TRANS_CONV_TEXT'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()] ))

In [180]:
#Removal of digits

from string import digits

def remove_digits(s: str) -> str:
    remove_digits = str.maketrans('', '', digits)
    res = s.translate(remove_digits)
    return res

In [181]:
X_train = X_train.apply(remove_digits)

In [220]:
#Applying count vectorizer on the text

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=None,
                             ngram_range=(1, 1), min_df=2, max_df=0.4, binary=True)

train_features = vectorizer.fit_transform(X_train)
train_labels = y_train

valid_features = vectorizer.transform(X_valid)
valid_labels = y_valid

In [188]:
#Applying Logistic regression
model = LogisticRegression()
model.fit(train_features, train_labels)

valid_preds = model.predict(valid_features)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds)}')

             precision    recall  f1-score   support

          0       0.94      0.96      0.95       274
          1       0.82      0.76      0.79        74

avg / total       0.91      0.91      0.91       348

Accuracy:0.9137931034482759


In [202]:
#Applying XGBoost
model = XGBClassifier()
model.fit(train_features, train_labels)

valid_preds = model.predict(valid_features)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds)}')

             precision    recall  f1-score   support

          0       0.93      0.93      0.93       274
          1       0.75      0.74      0.75        74

avg / total       0.89      0.89      0.89       348

Accuracy:0.8936781609195402


In [219]:
#Applying Bernoulli's Naive Bayes
model = BernoulliNB(fit_prior=True)
model.fit(train_features, train_labels)

valid_preds = model.predict(valid_features)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds)}')

             precision    recall  f1-score   support

          0       0.97      0.90      0.93       274
          1       0.70      0.89      0.79        74

avg / total       0.91      0.90      0.90       348

Accuracy:0.896551724137931


In [204]:
#Applying Random Forest Classifier
model = RandomForestClassifier()
model.fit(train_features, train_labels)

valid_preds = model.predict(valid_features)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{accuracy_score(valid_labels, valid_preds)}')

             precision    recall  f1-score   support

          0       0.88      0.95      0.91       274
          1       0.73      0.54      0.62        74

avg / total       0.85      0.86      0.85       348

Accuracy:0.8591954022988506


In [206]:
#To make predictions and prepare the submissions file 
#Repeating the pre processing steps on the test dataset
New_test['TRANS_CONV_TEXT'] = New_test['TRANS_CONV_TEXT'].apply(remove_digits)

New_test['TRANS_CONV_TEXT'] = New_test['TRANS_CONV_TEXT'].str.lower()

New_test['TRANS_CONV_TEXT'] = New_test['TRANS_CONV_TEXT'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()] ))


In [208]:
test_features = vectorizer.transform(New_test['TRANS_CONV_TEXT'])

In [209]:
test_preds = model.predict(test_features)

In [211]:
test_for_submission = pd.read_csv(r'C:\Users\Sonu\Downloads\datasetc062cf9\dataset\test.csv',encoding= 'ISO-8859-1')

In [212]:
submission = pd.DataFrame()
submission['Index'] = test_for_submission['Index']
submission['Patient_Tag'] = test_preds

submission.to_csv('submission.csv',index=False)

In [222]:
submission.head(15)

Unnamed: 0,Index,Patient_Tag
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,1
8,9,0
9,10,0
