In [121]:
import os
import re
import string

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
from nltk.corpus import stopwords

In [122]:
!ls ../input/

print("\nEmbeddings:")
!ls ../input/embeddings/

embeddings  sample_submission.csv  test.csv  train.csv

Embeddings:
GoogleNews-vectors-negative300	paragram_300_sl999
glove.840B.300d			wiki-news-300d-1M


### Embeddings

* GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
* glove.840B.300d - https://nlp.stanford.edu/projects/glove/
* paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
* wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html

In [123]:
print('File sizes')
for f in os.listdir('../input'):
    if 'zip' not in f:
        print(f.ljust(30) + str(round(os.path.getsize('../input/' + f) / 1000000, 2)) + 'MB')

File sizes
embeddings                    0.0MB
train.csv                     124.21MB
sample_submission.csv         1.3MB
test.csv                      5.24MB


## Open trainset and testset

In [124]:
train = pd.read_csv('../input/train.csv').fillna(' ')
test = pd.read_csv('../input/test.csv').fillna(' ')

In [125]:
print("Shape of training set: ", train.shape)
print("Shape of test set: ", test.shape)

train_target = train['target'].values
np.unique(train_target)
print("\nPercentage of insincere questions irt sincere questions: ", train_target.mean(), "%")

Shape of training set:  (1306122, 3)
Shape of test set:  (56370, 2)

Percentage of insincere questions irt sincere questions:  0.06187017751787352 %


In [126]:
train.sample(10)

Unnamed: 0,qid,question_text,target
389534,4c4d6e8493a31d716982,What is it in the psyche of those trump suppor...,1
587338,730ff7afe32ab58b6f76,Do you feel that this country has a good leade...,0
806507,9e06a27c8a3faebec111,What are the certification courses available i...,0
14448,02d77c5461f6e0038d84,Why should we consume protein when we wake up?,0
922157,b4b33551e461d05964e4,Is touching the interaction of electric fields...,0
545974,6af5ff7ff384af217886,What's the point for a cake shop to have 'sale...,0
1245158,f403535631b8b8c102ca,I am 12. Is it considered toxic if my parent y...,0
468409,5bba65fd4720f44c08e1,If I ask my brother to help me move my heavy f...,0
1154072,e22127b57aed745e6d76,Can you describe the three phases of post-affa...,0
1011298,c62b0db4eb48496388be,How do I get the latest technology especially ...,0


In [127]:
test.sample(10)

Unnamed: 0,qid,question_text
22417,656be261fce128fc33cf,Which is the best smartphone between Moto and ...
11098,32b702c0bcd87930a503,Do gun control advocates usually do not unders...
33641,97a0d5e2bf5e57f9e1a7,What are some psychological triggers that caus...
4765,15cc947dd40470ce4afc,Are Chinese software developers become dominan...
10274,2f2e3952fbeed9e8aed3,What will happen we drink dog’s milk?
9387,2b40a650c6d7151abb86,Are girls allowed in boys hostel at PICT Pune ...
5012,16fcc807311628b570b2,What are my chances of getting a fully-funded ...
363,01c14e356cbb82800f60,Which entity does love Jerusalem more? Israel ...
34761,9caa0548b60aff101e45,Can you suggest a name for my personal blog?
49849,e1ff3fe2cf10730ca4db,What is an adduct? How is stability of adduct ...


In [128]:
insincere_q = train[train["target"] == 1]["question_text"].tolist()

with open('insinceres.txt', 'w') as f:
    for item in insincere_q:
        f.write("%s\n" % item)

## N-gram analysis

#### 1-gram analysis

In [129]:
from collections import defaultdict
from wordcloud import STOPWORDS
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

insinc_df = train[train.target==1]
sinc_df = train[train.target==0]

def plot_ngrams(n_grams):

    ## custom function for ngram generation ##
    def generate_ngrams(text, n_gram=1):
        token = [token for token in text.lower().split(" ") if token != "" if token not in STOPWORDS]
        ngrams = zip(*[token[i:] for i in range(n_gram)])
        return [" ".join(ngram) for ngram in ngrams]

    ## custom function for horizontal bar chart ##
    def horizontal_bar_chart(df, color):
        trace = go.Bar(
            y=df["word"].values[::-1],
            x=df["wordcount"].values[::-1],
            showlegend=False,
            orientation = 'h',
            marker=dict(
                color=color,
            ),
        )
        return trace

    def get_bar(df, bar_color):
        freq_dict = defaultdict(int)
        for sent in df["question_text"]:
            for word in generate_ngrams(sent, n_grams):
                freq_dict[word] += 1
        fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
        fd_sorted.columns = ["word", "wordcount"]
        trace = horizontal_bar_chart(fd_sorted.head(10), bar_color)
        return trace    

    trace0 = get_bar(sinc_df, 'blue')
    trace1 = get_bar(insinc_df, 'blue')

    # Creating two subplots
    if n_grams == 1:
        wrd = "words"
    elif n_grams == 2:
        wrd = "bigrams"
    elif n_grams == 3:
        wrd = "trigrams"
    
    fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                              subplot_titles=["Frequent " + wrd + " of sincere questions", 
                                              "Frequent " + wrd + " of insincere questions"])
    fig.append_trace(trace0, 1, 1)
    fig.append_trace(trace1, 1, 2)
    fig['layout'].update(height=500, width=1150, paper_bgcolor='rgb(233,233,233)', title=wrd + " Count Plots")
    py.iplot(fig, filename='word-plots')


In [130]:
plot_ngrams(1)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [131]:
plot_ngrams(2)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [None]:
plot_ngrams(3)

## EDA observations

### 1. Mean word length is quite different
mean_word_len in train 57.6  
mean_word_len in test 29.3

### 2. Num words, testset has more variance
![image.png](attachment:image.png)

### 3. [Quora source](https://www.quora.com/What-is-an-insincere-question)

Possible words in insincere questions:  
* why (maybe also what,how)
* so
* such
* finally
* than

# Resources from other competitions

[Discussion post](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/70788) (see comments and most upvoted kernels from the competitions) 

* [Toxic comment classification challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)  (see most upvoted kernels)
* [Example of rheoterical questions](http://examples.yourdictionary.com/rhetorical-question-examples.html)

## Kernels

#### Introductory EDA
* https://www.kaggle.com/thebrownviking20/analyzing-quora-for-the-insinceres
* [Very general EDA](https://www.kaggle.com/mjbahmani/a-data-science-framework-for-quora) already went through it
* [A look at different embeddings](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings)
* [LSTM and embeddings](https://www.kaggle.com/mihaskalic/lstm-is-all-you-need-well-maybe-embeddings-also)
* [Toxic comments EDA](https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda)
* [Toxic using keras](https://www.kaggle.com/sbongo/for-beginners-tackling-toxic-using-keras)
* [Do pretrained embeddings help?](https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge)
* [Miscellaneous helping material](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/71361) (more in comments)
* [Preprocess when using embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings)
* [EDA for another Quora competition](https://www.kaggle.com/philschmidt/quora-eda-model-selection-roc-pr-plots)
* [EDA from indian (very well explained)](https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-qiqc) went through it
* [EDA and LSTM-CNN](https://www.kaggle.com/artgor/eda-and-lstm-cnn)
* [Simple EDA](https://www.kaggle.com/tunguz/just-some-simple-eda) went through it


#### Other
* [Augmentation for text](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/71083)
* [Papers for text classification](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/70821)
* [Strange sentences marked as insincere](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/70956)
* [General advice on kaggle competitions](https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70908)
* [Embeddings with attention](https://www.kaggle.com/shujian/different-embeddings-with-attention-fork-fork)
* [Text pre-processing techniques](https://www.kaggle.com/deffro/text-pre-processing-techniques)
* [Importance of cleaning text](https://www.kaggle.com/currie32/the-importance-of-cleaning-text)
* [Blendingis all you need](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/72627)
* [Text preprocessing](https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing)
* [Methods to combine embeddings](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/71778)
* [Material for beginners, toxic comment classification](https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/71361)
* [Analyzing Quora for insinceres](https://www.kaggle.com/thebrownviking20/analyzing-quora-for-the-insinceres)

## Create hyperparameters

* Average length of the words

In [None]:
## https://www.kaggle.com/tunguz/just-some-simple-eda
eng_stopwords = set(stopwords.words("english"))

## Average length of the words in the text 
train["mean_word_len"] = train["question_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_word_len"] = test["question_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [None]:
print(train.columns)
train.head()

In [None]:
#Remove bad symbols and stopwords from test and train data
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-zA-Z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        return: modified initial string
    """
    text = text.lower()   # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(" ", text)     # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub("", text)     # delete symbols which are in BAD_SYMBOLS_RE from text
    
    
    resultwords = [word for word in text.split() if word not in STOPWORDS]  # delete stopwords from text
    text = ' '.join(resultwords)
    
    return text