In [None]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import plot_confusion_matrix,classification_report
from sklearn.metrics import jaccard_score,f1_score,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import time
import datetime
import re
# English stop-word list from NLTK's stopwords corpus; handed to every
# TfidfVectorizer built in this notebook.
stop_words = stopwords.words('english')

# Competition data. df_train provides the 'text' and 'target' columns used for
# fitting and scoring; df_test provides 'id' and 'text' for the submission.
df_train = pd.read_csv("../input/nlp-getting-started/train.csv")
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Shared NLTK helpers read by processText: one stemmer and one lemmatizer.
ps, lem = PorterStemmer(), WordNetLemmatizer()
def processText(X,do_lem=False,do_stem=False):
    """Return a cleaned copy of a pandas Series of texts.

    Every value is converted with ``str``, lower-cased, stripped of
    surrounding whitespace, and has each character that is neither a word
    character nor whitespace removed. The result is split on whitespace,
    optionally stemmed, optionally lemmatized, and re-joined with single
    spaces.

    Parameters
    ----------
    X : pandas.Series
        Texts to clean. The input is not modified.
    do_lem : bool, default False
        Lemmatize each token with the module-level ``lem``.
    do_stem : bool, default False
        Stem each token with the module-level ``ps``. When both switches are
        set, stemming runs before lemmatization.

    Returns
    -------
    pandas.Series
        New Series carrying the same index as ``X``.
    """
    def clean(raw):
        # Normalise case/whitespace and drop punctuation before tokenising.
        text = re.sub(r'[^\w\s]', '', str(raw).lower().strip())
        tokens = text.split()

        if do_stem:
            tokens = [ps.stem(word) for word in tokens]

        if do_lem:
            tokens = [lem.lemmatize(word) for word in tokens]

        return " ".join(tokens)

    # Element-wise map rather than label-based X[i] get/set: with a duplicated
    # index label, X[i] yields a sub-Series (not a string) and the assignment
    # would overwrite every row sharing that label.
    return X.map(clean)

In [None]:
# Set to True to score the whole grid again. With False (the default) only the
# best parameter set found earlier is scored, which keeps a full re-run cheap.
RUN_GRID_SEARCH = False

# Candidate values per hyper-parameter. range(i+2,5) only admits n-gram ranges
# whose bounds differ by at least two: (1,3), (1,4) and (2,4).
ngram_list = [(i, j) for i in range(1, 5) for j in range(i + 2, 5)]
kernel_list = ['linear','rbf','poly','sigmoid']
lem_list = [True,False]
stem_list = [True,False]

#best parameter set
params = {'ngram_range':(1,3),
          'kernel':'linear',
          'lem':True,
          'stem':False}

if RUN_GRID_SEARCH:
    # Cartesian product of all candidate values, one dict per combination.
    param_list = [{'ngram_range':ngram,
                   'kernel':kernel,
                   'lem':use_lem,
                   'stem':use_stem}
                  for ngram in ngram_list
                  for kernel in kernel_list
                  for use_lem in lem_list
                  for use_stem in stem_list]
else:
    param_list = [params]

In [None]:
# Hold out 10% of the labelled tweets for scoring; the fixed random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_train['text'],
    df_train['target'],
    test_size=0.1,
    random_state=0,
)

In [None]:
def score_model(p):
    """Fit one TF-IDF + calibrated SVC pipeline and score it on the hold-out split.

    Parameters
    ----------
    p : dict
        Parameter set with the keys ``'ngram_range'``, ``'kernel'``, ``'lem'``
        and ``'stem'``.

    Returns
    -------
    dict
        The four parameters of ``p`` plus ``'jaccard'``, ``'f1'``, ``'TP'``
        and ``'TN'``. ``'TP'`` and ``'TN'`` are fractions, not counts: the
        share of true class-1 (resp. class-0) samples predicted correctly.

    Notes
    -----
    Reads the module-level ``x_train``, ``x_test``, ``y_train``, ``y_test``
    and ``stop_words``.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words,ngram_range=p['ngram_range'])
    clf = CalibratedClassifierCV(SVC(kernel=p['kernel']),cv=5)

    model = Pipeline([('vectorizer',vectorizer),('classifier',clf)])
    model.fit(processText(x_train,do_lem=p['lem'],do_stem=p['stem']),y_train)

    # Preprocess and predict the hold-out set once and reuse the result for
    # every metric instead of repeating both steps per metric.
    y_pred = model.predict(processText(x_test,do_lem=p['lem'],do_stem=p['stem']))

    jaccard = jaccard_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    confusion = confusion_matrix(y_test,y_pred)

    # Rows of the confusion matrix are true classes, columns are predictions,
    # so diagonal / row-sum is the per-class hit rate.
    TP = confusion[1][1]/confusion[1].sum()
    TN = confusion[0][0]/confusion[0].sum()

    score = {'jaccard':jaccard,
             'f1':f1,
             'TP':TP,
             'TN':TN,
             'kernel':p['kernel'],
             'ngram_range':p['ngram_range'],
             'lem':p['lem'],
             'stem':p['stem']}
    return score

In [None]:
# Score every parameter set and collect the result dicts in a plain list; the
# DataFrame is built once at the end. (DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every iteration.)
score_records = []

for i,p in enumerate(param_list):

    start = time.time()
    score_records.append(score_model(p))
    elapsed = time.time() - start

    # Rough ETA: assume each remaining parameter set takes as long as this one.
    remaining_seconds = elapsed*(len(param_list)-i-1)
    print("{:.5f} done, {} remaining".format((i+1)/len(param_list),str(datetime.timedelta(seconds=remaining_seconds))),end="\r")

df_scores = pd.DataFrame(score_records)

In [None]:
# Ranking criterion: unweighted row-wise mean of the four metrics.
metric_columns = ['jaccard','f1','TP','TN']
df_scores['avg score'] = df_scores[metric_columns].to_numpy().mean(axis=1)

In [None]:
# Order the results so the highest average score comes first, then preview them.
df_scores = df_scores.sort_values(by='avg score', ascending=False)
df_scores.head()

In [None]:
# First row of df_scores; after the descending sort on 'avg score' this is the
# parameter set the final model is built from.
df_scores.iloc[0]

In [None]:
# Rebuild the pipeline from the first row of df_scores and fit it on the
# training split.
best_params = df_scores.iloc[0]

vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=best_params['ngram_range'])
clf = CalibratedClassifierCV(SVC(kernel=best_params['kernel']), cv=5)
model = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', clf)])

x_train_clean = processText(x_train, do_lem=best_params['lem'], do_stem=best_params['stem'])
model.fit(x_train_clean, y_train)

In [None]:
# Clean the hold-out texts with the lem/stem switches of the selected row.
# NOTE: this rebinds x_test, so re-running the cell feeds already-cleaned text
# through processText again.
best_row = df_scores.iloc[0]
x_test = processText(x_test, do_lem=best_row['lem'], do_stem=best_row['stem'])

In [None]:
# Hard class labels for the hold-out texts (fed to classification_report below).
y_discrete_preds = model.predict(x_test)
# Probability estimates for the same texts from the pipeline's predict_proba.
y_preds = model.predict_proba(x_test)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# The ConfusionMatrixDisplay constructor is available in both old and new
# releases and draws the same plot from the same predictions.
# NOTE: to run on scikit-learn >= 1.2 the `plot_confusion_matrix` name must also
# be dropped from the import cell at the top of the notebook.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay(
    confusion_matrix(y_test, model.predict(x_test)),
    display_labels=model.classes_,
).plot()

In [None]:
# Text report for the hold-out predictions; print() is needed here so the
# multi-line string renders as a table instead of an escaped repr.
print(classification_report(y_test,y_discrete_preds))

In [None]:
# Predict the competition test set with the fitted pipeline, cleaning the texts
# with the same lem/stem switches as the selected row of df_scores.
submission_texts = processText(
    df_test['text'],
    do_lem=df_scores.iloc[0]['lem'],
    do_stem=df_scores.iloc[0]['stem'],
)
df_submit = pd.DataFrame({
    'id': df_test['id'],
    'text': df_test['text'],
    'target': model.predict(submission_texts),
})

# Class balance of the predictions, followed by the frame itself.
print(df_submit['target'].value_counts())
df_submit

In [None]:
# Write only the 'id' and 'target' columns to submission.csv, without the
# DataFrame index.
df_submit[['id','target']].to_csv('submission.csv',index=False)