In [84]:
import pandas as pd
import spacy
pd.set_option('display.max_columns', None)
nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS
import re # regular expression
from sklearn.model_selection import train_test_split

In [85]:
data = pd.read_csv('train.csv')

In [86]:
print ('Shape of train ',data.shape)

Shape of train  (1306122, 3)


In [87]:
print ('Taking a look at Sincere Questions')
data.loc[data['target'] == 0].sample(5)

Taking a look at Sincere Questions


Unnamed: 0,qid,question_text,target
1071083,d1e0aac78b3c77ae57bf,"Why do I feel lost, depressed and sad after qu...",0
570365,6fc1a1d440e6fc0db3bd,What should you talk about on a first date wit...,0
402588,4ee3366051d1acf54aa1,Can you lose weight by eating nothing but rame...,0
337586,4224c3a11566b27fb678,What jobs for English speakers are needed in M...,0
1084214,d479c52b797ebd21de2b,Why do people get hysteria over global warming...,0


In [88]:
print ('Taking a look at Insincere Questions')
data.loc[data['target'] == 1].sample(5)['question_text']

Taking a look at Insincere Questions


169888    Is it time for the US to ditch the EU and Cana...
954514    Is it misogynistic to think that women will ne...
15477     Why do men not Care about height? Do men just ...
293918    Does the USA fit the definition of a bully? E....
344000                  Is the US a jewocracy or democracy?
Name: question_text, dtype: object

In [89]:
#data = data[:5000]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1306122 non-null  object
 1   question_text  1306122 non-null  object
 2   target         1306122 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


## Text Standardization

### Utils

In [90]:
def remove_stop_words(text):
    filtered_stop_words = []
    doc = nlp(text)
    for token in doc:
        if not token.is_stop and not token.is_punct: # we use token attribute .is_stop
            filtered_stop_words.append(token.text)
    return " ".join(filtered_stop_words)
def lemmatized_string(text):
    doc = nlp(text)
    lemmatized_string = []
    for token in doc:
        lemmatized_string.append(token.lemma_)
    return " ".join(lemmatized_string)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"  
                           u"\U0001F1E0-\U0001F1FF"  
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [91]:
data.drop(['qid'], axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   question_text  1306122 non-null  object
 1   target         1306122 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 19.9+ MB


### Standardize

In [92]:
data['question_text']= data['question_text'].apply(remove_stop_words)
data['question_text']= data['question_text'].apply(lemmatized_string)
data['question_text']= data['question_text'].apply(remove_URL)
data['question_text']= data['question_text'].apply(remove_html)
data['question_text']= data['question_text'].apply(remove_emoji)

In [93]:
data.sample(5)

Unnamed: 0,question_text,target
1177442,far 35 kilometer coast Hong Kong,0
690449,physically emotionally abuse sociopath,0
925973,south Indians lack patriotism,1
190107,Leia find fun work Jabba slave,1
930270,word man hate woman thing actually exist word ...,1


## Train Test Split

In [94]:
train, test = train_test_split(data, test_size=0.2)

## Naive Bayes

In [95]:
row_count = train.shape[0]

total_word_count_dict = {}
word_count_sincere = {}
word_count_insincere = {}
sincere  = 0
insincere = 0 


for row in range(0,row_count):
    insincere += train.iloc[row]['target']
    sincere += (1 - train.iloc[row]['target'])
    sentence = train.iloc[row]['question_text']
    words_in_sentence = list(set(sentence.split(' ')))
    for word in words_in_sentence:
        if train.iloc[row]['target'] == 0:   #Sincere Words
            if word in word_count_sincere.keys():
                word_count_sincere[word]+=1
            else:
                word_count_sincere[word] = 1
        elif train.iloc[row]['target'] == 1: #Insincere Words
            if word in word_count_insincere.keys():
                word_count_insincere[word]+=1
            else:
                word_count_insincere[word] = 1
        if word in total_word_count_dict.keys():        #For all words. I use this to compute probability.
            total_word_count_dict[word]+=1
        else:
            total_word_count_dict[word]=1
     

In [96]:
total_word_probability = {}
total_words = 0
for word in total_word_count_dict:
    total_words += total_word_count_dict[word]
for word in total_word_count_dict:
    total_word_probability[word] = total_word_count_dict[word] / total_words

In [75]:
print(total_word_probability)



In [97]:
print ('Total words ',len(total_word_probability))
print ('Minimum probability ',min (total_word_probability.values()))
threshold_p = 0.0001
for word in list(total_word_probability):
    if total_word_probability[word] < threshold_p:
        del total_word_probability[word]
        if word in list(word_count_sincere):   #list(dict) return it;s key elements
            del word_count_sincere[word]
        if word in list(word_count_insincere):  
            del word_count_insincere[word]
print ('Total words ',len(total_word_probability))

Total words  210910
Minimum probability  1.6672671607223869e-07
Total words  1734


In [98]:
total_sincere_words = sum(word_count_sincere.values())
cp_sincere = {}  #Conditional Probability of the word given sentence is sincere
for i in list(word_count_sincere):
    cp_sincere[i] = word_count_sincere[i] / total_sincere_words

total_insincere_words = sum(word_count_insincere.values())
cp_insincere = {}  #Conditional Probability of the word given sentence is insincere
for i in list(word_count_insincere):
    cp_insincere[i] = word_count_insincere[i] / total_insincere_words

In [99]:
row_count = test.shape[0]
print(row_count)
#p(y = "insincere" | X = [word1, word2, ..., wordn]) = p(word1 | y = "insincere").p(word2 | y = "insincere")...p(wordn | y = "insincere"). p(y="insincere")
#p(y = "sincere" | X = [word1, word2, ..., wordn]) = p(word1 | y = "sincere").p(word2 | y = "sincere")...p(wordn | y = "sincere"). p(y="sincere")
p_insincere = insincere / (sincere + insincere)
p_sincere = sincere / (sincere + insincere)
accuracy = 0

for row in range(0,row_count):
    sentence = test.iloc[row]['question_text']
    target = test.iloc[row]['target']
    words_in_sentence = list(set(sentence.split(' ')))
    insincere_term = p_insincere
    sincere_term = p_sincere
    
    sincere_M = len(cp_sincere.keys())
    insincere_M = len(cp_insincere.keys())
    for word in words_in_sentence:
        if word not in cp_insincere.keys():
            insincere_M +=1
        if word not in cp_sincere.keys():
            sincere_M += 1
         
    for word in words_in_sentence:
        if word in cp_insincere.keys():
            insincere_term *= (cp_insincere[word] + (1/insincere_M))
        else:
            insincere_term *= (1/insincere_M)
        if word in cp_sincere.keys():
            sincere_term *= (cp_sincere[word] + (1/sincere_M))
        else:
            sincere_term *= (1/sincere_M)
        
    if insincere_term/(insincere_term + sincere_term) > 0.5:
        response = 1
    else:
        response = 0
    if target == response:
        accuracy += 1
    
print ('Accuracy is ',accuracy/row_count*100)

261225
Accuracy is  94.25169872715092
