In [1]:
import numpy as np # linear algebra
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [2]:
import pandas as pd
from tqdm import tqdm 
import time
# tqdm draws progress bars; tqdm.pandas() registers it with pandas so that
# progress-bar variants of pandas operations (e.g. `progress_apply`) are available.
tqdm.pandas()

In [3]:
%%time
# Read the competition's train and test CSV files, using the `qid` column as
# the index of each frame, then report both shapes.
csv_options = {'index_col': 'qid'}
train = pd.read_csv("../input/quora-insincere-questions-classification/train.csv", **csv_options)
test = pd.read_csv("../input/quora-insincere-questions-classification/test.csv", **csv_options)
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)

In [4]:
# Class balance: how many rows carry each value of `target`.
train['target'].value_counts()

In [5]:
# Shape of the subset of rows whose `target` is 0.
train[train['target']==0].shape

# Creating a Smaller Training Set

In [6]:
# Down-sample each class separately to shrink the training data:
# keep 3% of the target == 0 rows and 40% of the target == 1 rows.
# NOTE: despite its name, `pos_df` holds the target == 0 rows and `neg_df` the
# target == 1 rows; the names are kept because the next cell concatenates them.
# A fixed random_state makes the drawn sample (and everything trained on it)
# reproducible across kernel restarts.
pos_df = train[train['target']==0].sample(frac=0.03, random_state=42)
neg_df = train[train['target']==1].sample(frac=0.4, random_state=42)

In [7]:
# Stack the two down-sampled class subsets into a single frame, then show the
# resulting number of rows per target value.
train_df = pd.concat((pos_df, neg_df))
train_df.target.value_counts()

In [8]:
# Peek at one example: select the first row's first column (a one-element
# selection), pull out its single value with .item(), and split that text on
# whitespace. The resulting list of tokens is displayed as the cell output.
que_eg = train_df.iloc[:1,0]
sentences = que_eg.item().split()
sentences

In [9]:
# Show 10 randomly chosen rows of the reduced training frame (no seed is set,
# so the rows differ from run to run).
train_df.sample(10)

In [10]:
# Shuffle once with a fixed seed, then cut at the 80% mark: the first part
# becomes the training set and the remainder the validation set.
# Positional .iloc slicing yields the same two pieces as
# np.split(frame, [cut]) but stays inside pandas; np.split on a DataFrame goes
# through DataFrame.swapaxes, which recent pandas versions deprecate.
# NOTE: this rebinds `train_df`, so re-running this cell without first
# re-running the cell that builds `train_df` splits the already-reduced
# training set again.
shuffled_df = train_df.sample(frac=1, random_state=42)
split_at = int(.8 * len(shuffled_df))
train_df, val_df = shuffled_df.iloc[:split_at], shuffled_df.iloc[split_at:]

train_df.shape , val_df.shape

In [11]:
import re
# Punctuation characters that are replaced by a space. '?' is not part of this
# set, so question marks survive cleaning.
# NOTE(review): confirm whether leaving '?' in place is intentional.
_PUNCT_CHARS = r"""!"#$%&'()*+,-./:;<=>@[\]^_`{|}~"""

# Ordered (pattern, replacement) steps. Order matters: URLs, hashtags and
# mentions must be handled before the punctuation step strips the characters
# ('/', ':', '#', '@') that identify them.
_CLEAN_STEPS = (
    (re.compile(r'http\S+\s*'), ' '),                      # URLs
    # Only standalone "RT" / "cc" tokens: without the word boundaries the
    # pattern also removed "cc" inside words ("account" -> "a ount").
    (re.compile(r'\b(?:RT|cc)\b'), ' '),
    (re.compile(r'#\S+'), ''),                             # hashtags
    (re.compile(r'@\S+'), '  '),                           # mentions
    (re.compile('[%s]' % re.escape(_PUNCT_CHARS)), ' '),   # punctuation
    (re.compile(r'[^\x00-\x7f]'), ' '),                    # non-ASCII characters
    (re.compile(r'\s+'), ' '),                             # runs of whitespace
)


def cleanText(resumeText):
    """Normalise one piece of raw text for tokenisation.

    Applies, in order: URL removal, removal of standalone ``RT``/``cc``
    tokens, hashtag removal, mention removal, punctuation-to-space,
    non-ASCII-to-space, whitespace collapsing, and finally lower-casing.

    Parameters
    ----------
    resumeText : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text. Leading/trailing single spaces left by
        the substitutions are not stripped.

    Raises
    ------
    TypeError
        If ``resumeText`` is not a string (raised by ``re``).
    """
    for pattern, replacement in _CLEAN_STEPS:
        resumeText = pattern.sub(replacement, resumeText)
    return resumeText.lower()

# One Hot Encoding 

In [12]:
# Cleaned text of every question in the training split, in row order;
# the first five are displayed as a sanity check.
all_questns = list(map(cleanText, train_df['question_text']))
all_questns[:5]

In [13]:
import tensorflow
from tensorflow.keras.preprocessing.text import one_hot
# Size passed to one_hot as `n` when encoding words as integers.
max_vocab = 25000

# Encode every cleaned question as a list of integer word codes.
onehot_que = [one_hot(text, n=max_vocab) for text in all_questns]

Take a look at the encodings of the first question

In [14]:
# Side-by-side table of the first question's whitespace-separated words and
# the integer codes produced for that question.
first_words = all_questns[0].split()
pd.DataFrame({'Words': first_words, 'Encoding': onehot_que[0]})

# Word Embedding

Before we embed, we need to define two parameters:
1. Maximum length of the sentence
2. Number of features to use in embedding

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length, in whitespace-separated tokens, of the longest cleaned question.
max_len = max(len(question.split()) for question in all_questns)

# Pad every encoded question at the end (padding='post') up to max_len entries.
padded_que = pad_sequences(onehot_que, maxlen=max_len, padding='post')
padded_que

## Creating a Pipeline for data processing
All the preprocessing steps from the cells above are wrapped into a single function here, so they can be applied to any split of the data.

In [16]:
def preprocessing(df, y=True, max_vocab=25000, max_len=64):
    """Turn a frame of questions into fixed-width integer sequences.

    Each entry of ``df.question_text`` is cleaned with ``cleanText``, encoded
    with ``one_hot`` and padded at the end (``padding='post'``) to ``max_len``
    entries with ``pad_sequences``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``question_text`` column, and a ``target`` column when
        ``y`` is true.
    y : bool, default True
        Whether to also return the targets.
    max_vocab : int, default 25000
        Size passed to ``one_hot``. Previously this argument was overwritten
        inside the function and had no effect; it is now honoured.
    max_len : int, default 64
        Width of the padded output.
        NOTE(review): pad_sequences is called with its default truncation
        setting, so how over-long questions are cut is decided there.

    Returns
    -------
    X : padded sequence array, one row per question
    target : numpy.ndarray
        ``np.array(df.target)``; only returned when ``y`` is true.
    """
    cleaned = [cleanText(text) for text in df.question_text.to_list()]
    encoded = [one_hot(text, max_vocab) for text in cleaned]
    X = pad_sequences(encoded, padding='post', maxlen=max_len)
    if y:
        return X, np.array(df.target)
    return X

# Preparing The Features and Targets

In [46]:
# Encode the test questions with the same pipeline; y=False because only the
# feature matrix is requested here.
# NOTE: this cell's execution count (46) is out of sequence with its neighbours.
test_inputs = preprocessing(test,y=False)
test_inputs.shape

In [17]:
# Encode the training and validation splits with the pipeline's default settings.
X_train , y_train = preprocessing(train_df)
X_val , y_val = preprocessing(val_df)
# Passed as the second argument of the Embedding layer in the model cell.
dim=10
X_train.shape , X_val.shape ,y_train.shape ,y_val.shape

In [18]:
from tensorflow.keras.layers import Embedding,LSTM, Dense,Dropout,Flatten
from tensorflow.keras.models import Sequential

# Feed-forward binary classifier on top of a learned word embedding.
model = Sequential()
# input_length is read from the padded training matrix so it always matches
# the width of the data the model is fitted on, instead of relying on the
# notebook-level `max_len`, which is computed separately from the data.
model.add(Embedding(max_vocab, dim, input_length=X_train.shape[1]))
# The Embedding layer outputs one vector per token, i.e. a 3-D tensor
# (batch, sequence, dim). Flatten it to (batch, sequence * dim) so the Dense
# stack produces a single prediction per question rather than one per token.
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(512, activation='relu'))
# Single sigmoid unit for the binary target.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

model.summary()

In [19]:
%%time
# Train for 10 epochs in batches of 150, evaluating on the validation split
# after each epoch; the returned object keeps the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=150,
)

In [20]:
# Raw per-epoch metrics recorded by model.fit.
history.history

In [21]:
# Tabulate the recorded metrics and plot training loss against validation loss.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()

In [23]:
%%time 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Candidate estimators and the hyper-parameter grid to search for each one.
model1 = {
    'LogisticRegression': dict(
        model=LogisticRegression(solver='liblinear'),
        params={'C': [1, 5, 10, 50]},
    ),
}

# Run a 4-fold grid search per candidate and keep its best score and settings.
scores = []
for model_name, spec in model1.items():
    clf = GridSearchCV(spec['model'], spec['params'], cv=4, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

# One row per candidate, displayed as the cell output.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

In [39]:
from sklearn.linear_model import LogisticRegression

# Fit one logistic regression with the 'sag' solver and print its validation
# accuracy as a percentage rounded to two decimals.
logreg = LogisticRegression(solver='sag', random_state=1)
logreg.fit(X_train, y_train)
val_accuracy = logreg.score(X_val, y_val)
acc_log = round(val_accuracy * 100, 2)
print(acc_log)

In [53]:
%%time
# Import from the public package: the private `sklearn.ensemble.forest` module
# path was deprecated and later removed from scikit-learn.
from sklearn.ensemble import RandomForestClassifier
# Forest sizes (n_estimators) to compare.
params = [100,200,500,1000]

# One record per forest size with train / validation accuracy in percent.
score = []
for estimator in params:
    random_forest = RandomForestClassifier(n_estimators=estimator, max_depth=50, random_state=1)
    random_forest.fit(X_train, y_train)
    train_acc = round(random_forest.score(X_train, y_train) * 100, 2)
    val_acc = round(random_forest.score(X_val, y_val) * 100, 2)
    score.append({
        'n_estimator': estimator,
        'train_accuracy': train_acc,
        'val_accuracy': val_acc,
    })
# NOTE: after the loop `random_forest` is the last forest fitted
# (n_estimators=1000); that is the model used for the test predictions later,
# whether or not it has the best validation accuracy.


In [54]:
# Collect the per-forest-size accuracy records into a table.
randomFroest_df = pd.DataFrame(score , columns=['n_estimator' , 'train_accuracy','val_accuracy'])

In [55]:
# Predict labels for the encoded test questions with `random_forest`, i.e. the
# last forest fitted in the sweep loop.
Y_pred = random_forest.predict(test_inputs)
Y_pred

In [56]:
# Build the submission table: the test frame's index (its `qid` values) next
# to the predicted label for each row.
submission = pd.DataFrame({'qid': test.index,'prediction': Y_pred} )
submission.head()
# Write the table to disk without the DataFrame's own index column.
submission.to_csv('submission.csv',index=False)

In [59]:
# Display train vs. validation accuracy for each forest size tried.
randomFroest_df