In [67]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import re
%matplotlib inline

In [68]:
# Read the train and test CSV files (paths are relative to the working
# directory) into DataFrames.
TRAIN_CSV = 'train_F3WbcTw.csv'
TEST_CSV = 'test_tOlRoBf.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [69]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# other special characters) with a single space.
# The pattern is a regular expression, so regex=True is passed explicitly:
# pandas >= 2.0 defaults to regex=False and would otherwise look for the
# literal text "[^a-zA-Z]" and leave the reviews unchanged.
train["mod_text"] = train["text"].str.replace("[^a-zA-Z]", " ", regex=True)
test["mod_text"] = test["text"].str.replace("[^a-zA-Z]", " ", regex=True)

In [70]:
# Word forms are used inconsistently in the reviews, so every token is
# reduced to its lemma with NLTK's WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Split ``text`` on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


tokenized_train_text = train["mod_text"].apply(lemmatize_text)
tokenized_test_text = test["mod_text"].apply(lemmatize_text)

In [71]:
# Stitch the tokens of each document back together into one space-separated
# string for the next processing steps.
# Series.apply handles the whole column in one call and, unlike an explicit
# `series[i] = ...` loop over range(len(series)), does not rely on the Series
# carrying a default 0..n-1 integer index.
tokenized_train_text = tokenized_train_text.apply(' '.join)
train["mod_text"] = tokenized_train_text

tokenized_test_text = tokenized_test_text.apply(' '.join)
test["mod_text"] = tokenized_test_text

In [72]:
# Keep only words longer than 3 characters (words of length <= 3 are dropped).
def _drop_short_words(text):
    """Return ``text`` with every whitespace-separated word of length <= 3 removed."""
    kept = [token for token in text.split() if len(token) > 3]
    return ' '.join(kept)


train["mod_text"] = train["mod_text"].apply(_drop_short_words)
test["mod_text"] = test["mod_text"].apply(_drop_short_words)

In [73]:
from nltk.corpus import stopwords

# Remove English stop words from the cleaned text.
# `stop` keeps the original list returned by NLTK; membership is checked
# against a set built from it so each lookup is O(1) instead of a scan of the
# whole list for every token.
stop = stopwords.words('english')
stop_set = set(stop)


def _remove_stop_words(text):
    """Return ``text`` with every whitespace-separated word found in ``stop_set`` removed."""
    return ' '.join(word for word in text.split() if word not in stop_set)


train["mod_text"] = train["mod_text"].apply(_remove_stop_words)

test["mod_text"] = test["mod_text"].apply(_remove_stop_words)

In [74]:
# Bag-of-Words features
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# Learn the vocabulary from the training text only and build the training
# feature matrix. (The vectorizer ignores any target argument, so none is
# passed.)
train_bow = bow_vectorizer.fit_transform(train["mod_text"])

# Encode the test text with the vocabulary learned on the training set.
# Calling fit_transform here would re-fit the vectorizer on the test data and
# produce columns that no longer correspond to the features a model trained
# on train_bow expects.
test_bow = bow_vectorizer.transform(test["mod_text"])

In [75]:
# TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# Learn the vocabulary and IDF weights from the training text only and build
# the training feature matrix. (The vectorizer ignores any target argument,
# so none is passed.)
train_tfidf = tfidf_vectorizer.fit_transform(train["mod_text"])

# Encode the test text with the vocabulary and IDF weights learned on the
# training set. Calling fit_transform here would re-fit the vectorizer on the
# test data and produce columns that no longer correspond to the features a
# model trained on train_tfidf expects.
test_tfidf = tfidf_vectorizer.transform(test["mod_text"])

In [90]:
# Building model using Bag-of-Words features
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB

# Split the training features into training and validation sets
# (30% held out, fixed random_state so the split is repeatable).
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['sentiment'], random_state=42, test_size=0.3)

model = BernoulliNB()
model.fit(xtrain_bow, ytrain)  # training the model

# predict() already returns class labels, so the validation predictions are
# scored directly; no probability thresholding or integer cast is involved.
prediction = model.predict(xvalid_bow)  # predicting on the validation set
print(f1_score(yvalid, prediction, average='macro'))  # macro-averaged F1 score

# Predict on the test features and write the submission file.
pred_test = model.predict(test_bow)
submission = pd.read_csv("sample_submission_i5xnIZD.csv")
submission['sentiment'] = pred_test
# index=False keeps the output to exactly the two selected columns instead of
# prepending an unnamed index column.
pd.DataFrame(submission, columns=['unique_hash', 'sentiment']).to_csv('BernoulliNB.csv', index=False)



0.42973176294224374
[1 0 1 ... 1 0 1]


In [96]:
# Building model using TF-IDF features
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB

# Split the training features into training and validation sets
# (30% held out, fixed random_state so the split is repeatable).
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, train['sentiment'], random_state=42, test_size=0.3)

model = BernoulliNB()
model.fit(xtrain_tfidf, ytrain)  # training the model

# predict() already returns class labels, so the validation predictions are
# scored directly; no probability thresholding or integer cast is involved.
prediction = model.predict(xvalid_tfidf)  # predicting on the validation set
print(f1_score(yvalid, prediction, average='macro'))  # macro-averaged F1 score

# Predict on the TF-IDF test features (the model in this cell was trained on
# TF-IDF features, so the Bag-of-Words test matrix must not be used here) and
# write the submission file.
pred_test = model.predict(test_tfidf)
submission = pd.read_csv("sample_submission_i5xnIZD.csv")
submission['sentiment'] = pred_test
# index=False keeps the output to exactly the two selected columns instead of
# prepending an unnamed index column.
pd.DataFrame(submission, columns=['unique_hash', 'sentiment']).to_csv('BernoulliNB_tfidf.csv', index=False)

0.3364451099830968
