In [1]:
import pandas as pd
import numpy as np
import sklearn
import nltk

In [2]:
data=pd.read_csv('train.csv')

In [8]:
data.head(20)

Unnamed: 0,uid,message,label
0,202723,"Hey, can you find me the nearest book store?",0
1,387547,How do your know my name,1
2,80045,So should I eligible for cashback?,0
3,417835,paradeep. 👍,1
4,386038,U r person,1
5,315073,Then i need some details about the examination,1
6,412275,ya. ok,1
7,370460,r u a human? or anything,1
8,125298,Departure City: Goa\nArrival City: Mumbai\nDep...,0
9,195639,required duronto express schedule,0


In [7]:
data.message[3]

'paradeep. 👍'

In [4]:
data.shape

(182234, 3)

In [5]:
data.isnull().sum().sort_values(ascending=False)

label      0
message    0
uid        0
dtype: int64

In [6]:
data['label'].value_counts()# not an unbalanced dataset

1    94404
0    87830
Name: label, dtype: int64

#### Preprocessing of the text data

In [10]:
# remove special characters, numbers, punctuations
data['message_pre'] = data['message'].str.replace("[^a-zA-Z#]", " ")
data.head()

Unnamed: 0,uid,message,label,message_pre
0,202723,"Hey, can you find me the nearest book store?",0,Hey can you find me the nearest book store
1,387547,How do your know my name,1,How do your know my name
2,80045,So should I eligible for cashback?,0,So should I eligible for cashback
3,417835,paradeep. 👍,1,paradeep
4,386038,U r person,1,U r person


In [13]:
data['message_pre']=data['message_pre'].apply(lambda x: x.split(' '))

In [14]:
data.head()

Unnamed: 0,uid,message,label,message_pre
0,202723,"Hey, can you find me the nearest book store?",0,"[Hey, , can, you, find, me, the, nearest, book..."
1,387547,How do your know my name,1,"[How, do, your, know, my, name]"
2,80045,So should I eligible for cashback?,0,"[So, should, I, eligible, for, cashback, ]"
3,417835,paradeep. 👍,1,"[paradeep, , , ]"
4,386038,U r person,1,"[U, r, person]"


In [15]:
from nltk.stem.porter import *
stemmer = PorterStemmer()
data['message_pre']=data['message_pre'].apply(lambda x: [stemmer.stem(i) for i in x])

In [16]:
data.head()

Unnamed: 0,uid,message,label,message_pre
0,202723,"Hey, can you find me the nearest book store?",0,"[hey, , can, you, find, me, the, nearest, book..."
1,387547,How do your know my name,1,"[how, do, your, know, my, name]"
2,80045,So should I eligible for cashback?,0,"[So, should, I, elig, for, cashback, ]"
3,417835,paradeep. 👍,1,"[paradeep, , , ]"
4,386038,U r person,1,"[U, r, person]"


In [18]:
data['message_pre']=data['message_pre'].apply(lambda x: ' '.join(x))

In [19]:
data.head()

Unnamed: 0,uid,message,label,message_pre
0,202723,"Hey, can you find me the nearest book store?",0,hey can you find me the nearest book store
1,387547,How do your know my name,1,how do your know my name
2,80045,So should I eligible for cashback?,0,So should I elig for cashback
3,417835,paradeep. 👍,1,paradeep
4,386038,U r person,1,U r person


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(data['message_pre'])

In [24]:
tfidf.shape


(182234, 1000)

In [30]:
tfidf.shape[0]*0.80,tfidf.shape[0]*0.20

(145787.2, 36446.8)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


In [32]:
# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(tfidf, data['label'], random_state=42, test_size=0.2)

In [34]:
xtrain_bow.shape, ytrain.shape

((145787, 1000), (145787,))

In [35]:
xvalid_bow.shape,yvalid.shape

((36447, 1000), (36447,))

In [36]:
lreg = LogisticRegression()

In [43]:
lreg.fit(xtrain_bow, ytrain)

prediction = lreg.predict_proba(xvalid_bow)
prediction_int = prediction[:,1] >= 0.5
prediction_int = prediction_int.astype(np.int)

f1_score(yvalid, prediction_int)

0.8694170133841295

#### Accuracy varying with threshold


0.8685258964143426  - 0.4

0.8619169510807736 -0.3

0.8694170133841295  -0.5


In [47]:
yvalid.value_counts()

1    18968
0    17479
Name: label, dtype: int64

In [50]:
pd.Series(prediction_int).value_counts()

1    20930
0    15517
dtype: int64

In [51]:
from sklearn.metrics import confusion_matrix
confusion_matrix(yvalid, prediction_int)

array([[13893,  3586],
       [ 1624, 17344]])